diff --git "a/checkpoint-21000/trainer_state.json" "b/checkpoint-21000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-21000/trainer_state.json"
@@ -0,0 +1,147033 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.21,
+  "eval_steps": 500,
+  "global_step": 21000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1e-05,
+      "grad_norm": 1.2723733186721802,
+      "learning_rate": 3e-06,
+      "loss": 10.8324,
+      "step": 1
+    },
+    {
+      "epoch": 2e-05,
+      "grad_norm": 1.2627283334732056,
+      "learning_rate": 6e-06,
+      "loss": 10.8325,
+      "step": 2
+    },
+    {
+      "epoch": 3e-05,
+      "grad_norm": 1.2797267436981201,
+      "learning_rate": 9e-06,
+      "loss": 10.8328,
+      "step": 3
+    },
+    {
+      "epoch": 4e-05,
+      "grad_norm": 1.2568743228912354,
+      "learning_rate": 1.2e-05,
+      "loss": 10.8307,
+      "step": 4
+    },
+    {
+      "epoch": 5e-05,
+      "grad_norm": 1.26583731174469,
+      "learning_rate": 1.5e-05,
+      "loss": 10.8273,
+      "step": 5
+    },
+    {
+      "epoch": 6e-05,
+      "grad_norm": 1.268788456916809,
+      "learning_rate": 1.8e-05,
+      "loss": 10.8268,
+      "step": 6
+    },
+    {
+      "epoch": 7e-05,
+      "grad_norm": 1.215290904045105,
+      "learning_rate": 2.1000000000000002e-05,
+      "loss": 10.8151,
+      "step": 7
+    },
+    {
+      "epoch": 8e-05,
+      "grad_norm": 1.1221675872802734,
+      "learning_rate": 2.4e-05,
+      "loss": 10.794,
+      "step": 8
+    },
+    {
+      "epoch": 9e-05,
+      "grad_norm": 1.0951769351959229,
+      "learning_rate": 2.7e-05,
+      "loss": 10.7888,
+      "step": 9
+    },
+    {
+      "epoch": 0.0001,
+      "grad_norm": 1.105851650238037,
+      "learning_rate": 3e-05,
+      "loss": 10.7765,
+      "step": 10
+    },
+    {
+      "epoch": 0.00011,
+      "grad_norm": 1.0884467363357544,
+      "learning_rate": 3.2999999999999996e-05,
+      "loss": 10.763,
+      "step": 11
+    },
+    {
+      "epoch": 0.00012,
+      "grad_norm": 1.0829719305038452,
+      "learning_rate": 3.6e-05,
+      "loss": 10.7508,
+      "step": 12
+    },
+    {
+      "epoch": 0.00013,
+      "grad_norm": 1.0506291389465332,
+      "learning_rate": 3.9e-05,
+      "loss": 10.7324,
+      "step": 13
+    },
+    {
+      "epoch": 0.00014,
+      "grad_norm": 1.037864089012146,
+      "learning_rate": 4.2000000000000004e-05,
+      "loss": 10.7208,
+      "step": 14
+    },
+    {
+      "epoch": 0.00015,
+      "grad_norm": 1.0115288496017456,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 10.7117,
+      "step": 15
+    },
+    {
+      "epoch": 0.00016,
+      "grad_norm": 0.9676744341850281,
+      "learning_rate": 4.8e-05,
+      "loss": 10.6952,
+      "step": 16
+    },
+    {
+      "epoch": 0.00017,
+      "grad_norm": 0.9355509877204895,
+      "learning_rate": 5.1000000000000006e-05,
+      "loss": 10.6792,
+      "step": 17
+    },
+    {
+      "epoch": 0.00018,
+      "grad_norm": 0.9286826848983765,
+      "learning_rate": 5.4e-05,
+      "loss": 10.6649,
+      "step": 18
+    },
+    {
+      "epoch": 0.00019,
+      "grad_norm": 0.9110698699951172,
+      "learning_rate": 5.7e-05,
+      "loss": 10.6512,
+      "step": 19
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 0.9136782288551331,
+      "learning_rate": 6e-05,
+      "loss": 10.6404,
+      "step": 20
+    },
+    {
+      "epoch": 0.00021,
+      "grad_norm": 0.8971966505050659,
+      "learning_rate": 6.3e-05,
+      "loss": 10.6274,
+      "step": 21
+    },
+    {
+      "epoch": 0.00022,
+      "grad_norm": 0.8972620368003845,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 10.6136,
+      "step": 22
+    },
+    {
+      "epoch": 0.00023,
+      "grad_norm": 0.8984483480453491,
+      "learning_rate": 6.9e-05,
+      "loss": 10.6016,
+      "step": 23
+    },
+    {
+      "epoch": 0.00024,
+      "grad_norm": 0.8967456817626953,
+      "learning_rate": 7.2e-05,
+      "loss": 10.5894,
+      "step": 24
+    },
+    {
+      "epoch": 0.00025,
+      "grad_norm": 0.8972211480140686,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 10.5752,
+      "step": 25
+    },
+    {
+      "epoch": 0.00026,
+      "grad_norm": 0.9028114080429077,
+      "learning_rate": 7.8e-05,
+      "loss": 10.5611,
+      "step": 26
+    },
+    {
+      "epoch": 0.00027,
+      "grad_norm": 0.8926876783370972,
+      "learning_rate": 8.1e-05,
+      "loss": 10.5491,
+      "step": 27
+    },
+    {
+      "epoch": 0.00028,
+      "grad_norm": 0.8921052813529968,
+      "learning_rate": 8.400000000000001e-05,
+      "loss": 10.536,
+      "step": 28
+    },
+    {
+      "epoch": 0.00029,
+      "grad_norm": 0.8942669034004211,
+      "learning_rate": 8.7e-05,
+      "loss": 10.5219,
+      "step": 29
+    },
+    {
+      "epoch": 0.0003,
+      "grad_norm": 0.9005073308944702,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 10.5056,
+      "step": 30
+    },
+    {
+      "epoch": 0.00031,
+      "grad_norm": 0.8994124531745911,
+      "learning_rate": 9.3e-05,
+      "loss": 10.491,
+      "step": 31
+    },
+    {
+      "epoch": 0.00032,
+      "grad_norm": 0.8968571424484253,
+      "learning_rate": 9.6e-05,
+      "loss": 10.4763,
+      "step": 32
+    },
+    {
+      "epoch": 0.00033,
+      "grad_norm": 0.8976972103118896,
+      "learning_rate": 9.900000000000001e-05,
+      "loss": 10.4597,
+      "step": 33
+    },
+    {
+      "epoch": 0.00034,
+      "grad_norm": 0.8977769017219543,
+      "learning_rate": 0.00010200000000000001,
+      "loss": 10.4427,
+      "step": 34
+    },
+    {
+      "epoch": 0.00035,
+      "grad_norm": 0.902169406414032,
+      "learning_rate": 0.00010500000000000002,
+      "loss": 10.4252,
+      "step": 35
+    },
+    {
+      "epoch": 0.00036,
+      "grad_norm": 0.8990501165390015,
+      "learning_rate": 0.000108,
+      "loss": 10.4079,
+      "step": 36
+    },
+    {
+      "epoch": 0.00037,
+      "grad_norm": 0.8933607935905457,
+      "learning_rate": 0.000111,
+      "loss": 10.39,
+      "step": 37
+    },
+    {
+      "epoch": 0.00038,
+      "grad_norm": 0.8925058245658875,
+      "learning_rate": 0.000114,
+      "loss": 10.3704,
+      "step": 38
+    },
+    {
+      "epoch": 0.00039,
+      "grad_norm": 0.8942745923995972,
+      "learning_rate": 0.000117,
+      "loss": 10.3512,
+      "step": 39
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 0.8984111547470093,
+      "learning_rate": 0.00012,
+      "loss": 10.3286,
+      "step": 40
+    },
+    {
+      "epoch": 0.00041,
+      "grad_norm": 0.8943851590156555,
+      "learning_rate": 0.000123,
+      "loss": 10.3097,
+      "step": 41
+    },
+    {
+      "epoch": 0.00042,
+      "grad_norm": 0.8935915231704712,
+      "learning_rate": 0.000126,
+      "loss": 10.2894,
+      "step": 42
+    },
+    {
+      "epoch": 0.00043,
+      "grad_norm": 0.8975799679756165,
+      "learning_rate": 0.000129,
+      "loss": 10.2654,
+      "step": 43
+    },
+    {
+      "epoch": 0.00044,
+      "grad_norm": 0.8982045650482178,
+      "learning_rate": 0.00013199999999999998,
+      "loss": 10.2433,
+      "step": 44
+    },
+    {
+      "epoch": 0.00045,
+      "grad_norm": 0.9000449180603027,
+      "learning_rate": 0.000135,
+      "loss": 10.2204,
+      "step": 45
+    },
+    {
+      "epoch": 0.00046,
+      "grad_norm": 0.8900250792503357,
+      "learning_rate": 0.000138,
+      "loss": 10.1983,
+      "step": 46
+    },
+    {
+      "epoch": 0.00047,
+      "grad_norm": 0.8965498805046082,
+      "learning_rate": 0.000141,
+      "loss": 10.1723,
+      "step": 47
+    },
+    {
+      "epoch": 0.00048,
+      "grad_norm": 0.8975719213485718,
+      "learning_rate": 0.000144,
+      "loss": 10.149,
+      "step": 48
+    },
+    {
+      "epoch": 0.00049,
+      "grad_norm": 0.8933398127555847,
+      "learning_rate": 0.000147,
+      "loss": 10.1239,
+      "step": 49
+    },
+    {
+      "epoch": 0.0005,
+      "grad_norm": 0.8988479375839233,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 10.0972,
+      "step": 50
+    },
+    {
+      "epoch": 0.00051,
+      "grad_norm": 0.8983374834060669,
+      "learning_rate": 0.000153,
+      "loss": 10.0711,
+      "step": 51
+    },
+    {
+      "epoch": 0.00052,
+      "grad_norm": 0.8959178328514099,
+      "learning_rate": 0.000156,
+      "loss": 10.0437,
+      "step": 52
+    },
+    {
+      "epoch": 0.00053,
+      "grad_norm": 0.8871957063674927,
+      "learning_rate": 0.000159,
+      "loss": 10.0204,
+      "step": 53
+    },
+    {
+      "epoch": 0.00054,
+      "grad_norm": 0.9051761627197266,
+      "learning_rate": 0.000162,
+      "loss": 9.9878,
+      "step": 54
+    },
+    {
+      "epoch": 0.00055,
+      "grad_norm": 0.8952219486236572,
+      "learning_rate": 0.000165,
+      "loss": 9.963,
+      "step": 55
+    },
+    {
+      "epoch": 0.00056,
+      "grad_norm": 0.890164315700531,
+      "learning_rate": 0.00016800000000000002,
+      "loss": 9.9341,
+      "step": 56
+    },
+    {
+      "epoch": 0.00057,
+      "grad_norm": 0.8922548890113831,
+      "learning_rate": 0.000171,
+      "loss": 9.9069,
+      "step": 57
+    },
+    {
+      "epoch": 0.00058,
+      "grad_norm": 0.8901249766349792,
+      "learning_rate": 0.000174,
+      "loss": 9.882,
+      "step": 58
+    },
+    {
+      "epoch": 0.00059,
+      "grad_norm": 0.8989579677581787,
+      "learning_rate": 0.000177,
+      "loss": 9.8502,
+      "step": 59
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.8829832673072815,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 9.8242,
+      "step": 60
+    },
+    {
+      "epoch": 0.00061,
+      "grad_norm": 0.8862175345420837,
+      "learning_rate": 0.000183,
+      "loss": 9.7955,
+      "step": 61
+    },
+    {
+      "epoch": 0.00062,
+      "grad_norm": 0.8893216848373413,
+      "learning_rate": 0.000186,
+      "loss": 9.7648,
+      "step": 62
+    },
+    {
+      "epoch": 0.00063,
+      "grad_norm": 0.8881028294563293,
+      "learning_rate": 0.000189,
+      "loss": 9.7373,
+      "step": 63
+    },
+    {
+      "epoch": 0.00064,
+      "grad_norm": 0.8868633508682251,
+      "learning_rate": 0.000192,
+      "loss": 9.7068,
+      "step": 64
+    },
+    {
+      "epoch": 0.00065,
+      "grad_norm": 0.8924434185028076,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 9.6743,
+      "step": 65
+    },
+    {
+      "epoch": 0.00066,
+      "grad_norm": 0.8872416019439697,
+      "learning_rate": 0.00019800000000000002,
+      "loss": 9.6503,
+      "step": 66
+    },
+    {
+      "epoch": 0.00067,
+      "grad_norm": 0.8866419196128845,
+      "learning_rate": 0.000201,
+      "loss": 9.62,
+      "step": 67
+    },
+    {
+      "epoch": 0.00068,
+      "grad_norm": 0.8931677937507629,
+      "learning_rate": 0.00020400000000000003,
+      "loss": 9.5881,
+      "step": 68
+    },
+    {
+      "epoch": 0.00069,
+      "grad_norm": 0.879610002040863,
+      "learning_rate": 0.00020700000000000002,
+      "loss": 9.5597,
+      "step": 69
+    },
+    {
+      "epoch": 0.0007,
+      "grad_norm": 0.8912403583526611,
+      "learning_rate": 0.00021000000000000004,
+      "loss": 9.5236,
+      "step": 70
+    },
+    {
+      "epoch": 0.00071,
+      "grad_norm": 0.8847392797470093,
+      "learning_rate": 0.00021299999999999997,
+      "loss": 9.4965,
+      "step": 71
+    },
+    {
+      "epoch": 0.00072,
+      "grad_norm": 0.8860267996788025,
+      "learning_rate": 0.000216,
+      "loss": 9.4698,
+      "step": 72
+    },
+    {
+      "epoch": 0.00073,
+      "grad_norm": 0.886963963508606,
+      "learning_rate": 0.00021899999999999998,
+      "loss": 9.4354,
+      "step": 73
+    },
+    {
+      "epoch": 0.00074,
+      "grad_norm": 0.8829045295715332,
+      "learning_rate": 0.000222,
+      "loss": 9.4089,
+      "step": 74
+    },
+    {
+      "epoch": 0.00075,
+      "grad_norm": 0.8822581768035889,
+      "learning_rate": 0.000225,
+      "loss": 9.3746,
+      "step": 75
+    },
+    {
+      "epoch": 0.00076,
+      "grad_norm": 0.8918945789337158,
+      "learning_rate": 0.000228,
+      "loss": 9.3485,
+      "step": 76
+    },
+    {
+      "epoch": 0.00077,
+      "grad_norm": 0.8851014375686646,
+      "learning_rate": 0.000231,
+      "loss": 9.3206,
+      "step": 77
+    },
+    {
+      "epoch": 0.00078,
+      "grad_norm": 0.8782386183738708,
+      "learning_rate": 0.000234,
+      "loss": 9.2909,
+      "step": 78
+    },
+    {
+      "epoch": 0.00079,
+      "grad_norm": 0.8847852349281311,
+      "learning_rate": 0.00023700000000000001,
+      "loss": 9.2539,
+      "step": 79
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.8866625428199768,
+      "learning_rate": 0.00024,
+      "loss": 9.2184,
+      "step": 80
+    },
+    {
+      "epoch": 0.00081,
+      "grad_norm": 0.8906494975090027,
+      "learning_rate": 0.00024300000000000002,
+      "loss": 9.1857,
+      "step": 81
+    },
+    {
+      "epoch": 0.00082,
+      "grad_norm": 0.8960816264152527,
+      "learning_rate": 0.000246,
+      "loss": 9.1611,
+      "step": 82
+    },
+    {
+      "epoch": 0.00083,
+      "grad_norm": 0.8946034908294678,
+      "learning_rate": 0.00024900000000000004,
+      "loss": 9.1247,
+      "step": 83
+    },
+    {
+      "epoch": 0.00084,
+      "grad_norm": 0.8957815170288086,
+      "learning_rate": 0.000252,
+      "loss": 9.0939,
+      "step": 84
+    },
+    {
+      "epoch": 0.00085,
+      "grad_norm": 0.8891732692718506,
+      "learning_rate": 0.000255,
+      "loss": 9.0702,
+      "step": 85
+    },
+    {
+      "epoch": 0.00086,
+      "grad_norm": 0.8951762318611145,
+      "learning_rate": 0.000258,
+      "loss": 9.0346,
+      "step": 86
+    },
+    {
+      "epoch": 0.00087,
+      "grad_norm": 0.8848313093185425,
+      "learning_rate": 0.000261,
+      "loss": 9.0125,
+      "step": 87
+    },
+    {
+      "epoch": 0.00088,
+      "grad_norm": 0.8860167860984802,
+      "learning_rate": 0.00026399999999999997,
+      "loss": 8.9751,
+      "step": 88
+    },
+    {
+      "epoch": 0.00089,
+      "grad_norm": 0.8798444271087646,
+      "learning_rate": 0.000267,
+      "loss": 8.9486,
+      "step": 89
+    },
+    {
+      "epoch": 0.0009,
+      "grad_norm": 0.8837233185768127,
+      "learning_rate": 0.00027,
+      "loss": 8.9221,
+      "step": 90
+    },
+    {
+      "epoch": 0.00091,
+      "grad_norm": 0.879225492477417,
+      "learning_rate": 0.000273,
+      "loss": 8.8916,
+      "step": 91
+    },
+    {
+      "epoch": 0.00092,
+      "grad_norm": 0.8843948245048523,
+      "learning_rate": 0.000276,
+      "loss": 8.8596,
+      "step": 92
+    },
+    {
+      "epoch": 0.00093,
+      "grad_norm": 0.882838785648346,
+      "learning_rate": 0.000279,
+      "loss": 8.8297,
+      "step": 93
+    },
+    {
+      "epoch": 0.00094,
+      "grad_norm": 0.8830418586730957,
+      "learning_rate": 0.000282,
+      "loss": 8.8034,
+      "step": 94
+    },
+    {
+      "epoch": 0.00095,
+      "grad_norm": 0.8770962357521057,
+      "learning_rate": 0.000285,
+      "loss": 8.7743,
+      "step": 95
+    },
+    {
+      "epoch": 0.00096,
+      "grad_norm": 0.8804563879966736,
+      "learning_rate": 0.000288,
+      "loss": 8.7444,
+      "step": 96
+    },
+    {
+      "epoch": 0.00097,
+      "grad_norm": 0.8753241300582886,
+      "learning_rate": 0.000291,
+      "loss": 8.7211,
+      "step": 97
+    },
+    {
+      "epoch": 0.00098,
+      "grad_norm": 0.8762865662574768,
+      "learning_rate": 0.000294,
+      "loss": 8.6826,
+      "step": 98
+    },
+    {
+      "epoch": 0.00099,
+      "grad_norm": 0.8762408494949341,
+      "learning_rate": 0.000297,
+      "loss": 8.6601,
+      "step": 99
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 0.8741625547409058,
+      "learning_rate": 0.00030000000000000003,
+      "loss": 8.6324,
+      "step": 100
+    },
+    {
+      "epoch": 0.00101,
+      "grad_norm": 0.8789051175117493,
+      "learning_rate": 0.00030300000000000005,
+      "loss": 8.5981,
+      "step": 101
+    },
+    {
+      "epoch": 0.00102,
+      "grad_norm": 0.8656250834465027,
+      "learning_rate": 0.000306,
+      "loss": 8.5772,
+      "step": 102
+    },
+    {
+      "epoch": 0.00103,
+      "grad_norm": 0.8743636012077332,
+      "learning_rate": 0.000309,
+      "loss": 8.5519,
+      "step": 103
+    },
+    {
+      "epoch": 0.00104,
+      "grad_norm": 0.8708215951919556,
+      "learning_rate": 0.000312,
+      "loss": 8.5298,
+      "step": 104
+    },
+    {
+      "epoch": 0.00105,
+      "grad_norm": 0.8859707713127136,
+      "learning_rate": 0.000315,
+      "loss": 8.5033,
+      "step": 105
+    },
+    {
+      "epoch": 0.00106,
+      "grad_norm": 0.9068981409072876,
+      "learning_rate": 0.000318,
+      "loss": 8.4736,
+      "step": 106
+    },
+    {
+      "epoch": 0.00107,
+      "grad_norm": 0.9245584011077881,
+      "learning_rate": 0.000321,
+      "loss": 8.4478,
+      "step": 107
+    },
+    {
+      "epoch": 0.00108,
+      "grad_norm": 0.9128947257995605,
+      "learning_rate": 0.000324,
+      "loss": 8.4188,
+      "step": 108
+    },
+    {
+      "epoch": 0.00109,
+      "grad_norm": 0.853670060634613,
+      "learning_rate": 0.000327,
+      "loss": 8.3963,
+      "step": 109
+    },
+    {
+      "epoch": 0.0011,
+      "grad_norm": 0.8760496377944946,
+      "learning_rate": 0.00033,
+      "loss": 8.3734,
+      "step": 110
+    },
+    {
+      "epoch": 0.00111,
+      "grad_norm": 0.9078761339187622,
+      "learning_rate": 0.000333,
+      "loss": 8.3444,
+      "step": 111
+    },
+    {
+      "epoch": 0.00112,
+      "grad_norm": 0.866322934627533,
+      "learning_rate": 0.00033600000000000004,
+      "loss": 8.3207,
+      "step": 112
+    },
+    {
+      "epoch": 0.00113,
+      "grad_norm": 0.8490086197853088,
+      "learning_rate": 0.000339,
+      "loss": 8.2796,
+      "step": 113
+    },
+    {
+      "epoch": 0.00114,
+      "grad_norm": 0.8713237047195435,
+      "learning_rate": 0.000342,
+      "loss": 8.2694,
+      "step": 114
+    },
+    {
+      "epoch": 0.00115,
+      "grad_norm": 0.8709179162979126,
+      "learning_rate": 0.00034500000000000004,
+      "loss": 8.2404,
+      "step": 115
+    },
+    {
+      "epoch": 0.00116,
+      "grad_norm": 0.8300504684448242,
+      "learning_rate": 0.000348,
+      "loss": 8.214,
+      "step": 116
+    },
+    {
+      "epoch": 0.00117,
+      "grad_norm": 0.8302497863769531,
+      "learning_rate": 0.000351,
+      "loss": 8.1818,
+      "step": 117
+    },
+    {
+      "epoch": 0.00118,
+      "grad_norm": 0.8652266263961792,
+      "learning_rate": 0.000354,
+      "loss": 8.1647,
+      "step": 118
+    },
+    {
+      "epoch": 0.00119,
+      "grad_norm": 0.9069057106971741,
+      "learning_rate": 0.000357,
+      "loss": 8.1514,
+      "step": 119
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.9525896906852722,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 8.1239,
+      "step": 120
+    },
+    {
+      "epoch": 0.00121,
+      "grad_norm": 0.9400830268859863,
+      "learning_rate": 0.000363,
+      "loss": 8.095,
+      "step": 121
+    },
+    {
+      "epoch": 0.00122,
+      "grad_norm": 0.8189533948898315,
+      "learning_rate": 0.000366,
+      "loss": 8.0717,
+      "step": 122
+    },
+    {
+      "epoch": 0.00123,
+      "grad_norm": 0.8198633790016174,
+      "learning_rate": 0.000369,
+      "loss": 8.0459,
+      "step": 123
+    },
+    {
+      "epoch": 0.00124,
+      "grad_norm": 0.9460069537162781,
+      "learning_rate": 0.000372,
+      "loss": 8.0261,
+      "step": 124
+    },
+    {
+      "epoch": 0.00125,
+      "grad_norm": 1.0734294652938843,
+      "learning_rate": 0.000375,
+      "loss": 8.002,
+      "step": 125
+    },
+    {
+      "epoch": 0.00126,
+      "grad_norm": 0.9635769724845886,
+      "learning_rate": 0.000378,
+      "loss": 7.9871,
+      "step": 126
+    },
+    {
+      "epoch": 0.00127,
+      "grad_norm": 0.7875692844390869,
+      "learning_rate": 0.000381,
+      "loss": 7.9581,
+      "step": 127
+    },
+    {
+      "epoch": 0.00128,
+      "grad_norm": 0.8865201473236084,
+      "learning_rate": 0.000384,
+      "loss": 7.9374,
+      "step": 128
+    },
+    {
+      "epoch": 0.00129,
+      "grad_norm": 0.9998716115951538,
+      "learning_rate": 0.00038700000000000003,
+      "loss": 7.9265,
+      "step": 129
+    },
+    {
+      "epoch": 0.0013,
+      "grad_norm": 0.8098431825637817,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 7.8932,
+      "step": 130
+    },
+    {
+      "epoch": 0.00131,
+      "grad_norm": 0.8202345967292786,
+      "learning_rate": 0.000393,
+      "loss": 7.8568,
+      "step": 131
+    },
+    {
+      "epoch": 0.00132,
+      "grad_norm": 0.9445962905883789,
+      "learning_rate": 0.00039600000000000003,
+      "loss": 7.8481,
+      "step": 132
+    },
+    {
+      "epoch": 0.00133,
+      "grad_norm": 0.8225625157356262,
+      "learning_rate": 0.00039900000000000005,
+      "loss": 7.8198,
+      "step": 133
+    },
+    {
+      "epoch": 0.00134,
+      "grad_norm": 0.8087729811668396,
+      "learning_rate": 0.000402,
+      "loss": 7.8072,
+      "step": 134
+    },
+    {
+      "epoch": 0.00135,
+      "grad_norm": 0.7232753038406372,
+      "learning_rate": 0.00040500000000000003,
+      "loss": 7.7727,
+      "step": 135
+    },
+    {
+      "epoch": 0.00136,
+      "grad_norm": 0.8383559584617615,
+      "learning_rate": 0.00040800000000000005,
+      "loss": 7.7588,
+      "step": 136
+    },
+    {
+      "epoch": 0.00137,
+      "grad_norm": 0.8329391479492188,
+      "learning_rate": 0.000411,
+      "loss": 7.738,
+      "step": 137
+    },
+    {
+      "epoch": 0.00138,
+      "grad_norm": 0.8072112202644348,
+      "learning_rate": 0.00041400000000000003,
+      "loss": 7.7102,
+      "step": 138
+    },
+    {
+      "epoch": 0.00139,
+      "grad_norm": 0.7906625270843506,
+      "learning_rate": 0.00041700000000000005,
+      "loss": 7.6947,
+      "step": 139
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 0.7997886538505554,
+      "learning_rate": 0.00042000000000000007,
+      "loss": 7.6753,
+      "step": 140
+    },
+    {
+      "epoch": 0.00141,
+      "grad_norm": 0.9642479419708252,
+      "learning_rate": 0.000423,
+      "loss": 7.6647,
+      "step": 141
+    },
+    {
+      "epoch": 0.00142,
+      "grad_norm": 0.8051616549491882,
+      "learning_rate": 0.00042599999999999995,
+      "loss": 7.6463,
+      "step": 142
+    },
+    {
+      "epoch": 0.00143,
+      "grad_norm": 0.7901502251625061,
+      "learning_rate": 0.00042899999999999997,
+      "loss": 7.6185,
+      "step": 143
+    },
+    {
+      "epoch": 0.00144,
+      "grad_norm": 0.6530913710594177,
+      "learning_rate": 0.000432,
+      "loss": 7.5947,
+      "step": 144
+    },
+    {
+      "epoch": 0.00145,
+      "grad_norm": 0.6823164820671082,
+      "learning_rate": 0.000435,
+      "loss": 7.5887,
+      "step": 145
+    },
+    {
+      "epoch": 0.00146,
+      "grad_norm": 0.7043561935424805,
+      "learning_rate": 0.00043799999999999997,
+      "loss": 7.5506,
+      "step": 146
+    },
+    {
+      "epoch": 0.00147,
+      "grad_norm": 0.660875141620636,
+      "learning_rate": 0.000441,
+      "loss": 7.5403,
+      "step": 147
+    },
+    {
+      "epoch": 0.00148,
+      "grad_norm": 0.6651095747947693,
+      "learning_rate": 0.000444,
+      "loss": 7.527,
+      "step": 148
+    },
+    {
+      "epoch": 0.00149,
+      "grad_norm": 0.594466507434845,
+      "learning_rate": 0.00044699999999999997,
+      "loss": 7.5177,
+      "step": 149
+    },
+    {
+      "epoch": 0.0015,
+      "grad_norm": 0.640634298324585,
+      "learning_rate": 0.00045,
+      "loss": 7.4872,
+      "step": 150
+    },
+    {
+      "epoch": 0.00151,
+      "grad_norm": 0.6399310231208801,
+      "learning_rate": 0.000453,
+      "loss": 7.4669,
+      "step": 151
+    },
+    {
+      "epoch": 0.00152,
+      "grad_norm": 0.6032711863517761,
+      "learning_rate": 0.000456,
+      "loss": 7.4595,
+      "step": 152
+    },
+    {
+      "epoch": 0.00153,
+      "grad_norm": 0.8105739951133728,
+      "learning_rate": 0.000459,
+      "loss": 7.4504,
+      "step": 153
+    },
+    {
+      "epoch": 0.00154,
+      "grad_norm": 0.9096337556838989,
+      "learning_rate": 0.000462,
+      "loss": 7.4401,
+      "step": 154
+    },
+    {
+      "epoch": 0.00155,
+      "grad_norm": 0.8464334607124329,
+      "learning_rate": 0.000465,
+      "loss": 7.4179,
+      "step": 155
+    },
+    {
+      "epoch": 0.00156,
+      "grad_norm": 0.8338698148727417,
+      "learning_rate": 0.000468,
+      "loss": 7.392,
+      "step": 156
+    },
+    {
+      "epoch": 0.00157,
+      "grad_norm": 0.6862301230430603,
+      "learning_rate": 0.000471,
+      "loss": 7.3898,
+      "step": 157
+    },
+    {
+      "epoch": 0.00158,
+      "grad_norm": 0.6174972057342529,
+      "learning_rate": 0.00047400000000000003,
+      "loss": 7.3642,
+      "step": 158
+    },
+    {
+      "epoch": 0.00159,
+      "grad_norm": 0.7215908765792847,
+      "learning_rate": 0.000477,
+      "loss": 7.3548,
+      "step": 159
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.7243570685386658,
+      "learning_rate": 0.00048,
+      "loss": 7.3363,
+      "step": 160
+    },
+    {
+      "epoch": 0.00161,
+      "grad_norm": 0.5998020172119141,
+      "learning_rate": 0.00048300000000000003,
+      "loss": 7.3105,
+      "step": 161
+    },
+    {
+      "epoch": 0.00162,
+      "grad_norm": 0.6509896516799927,
+      "learning_rate": 0.00048600000000000005,
+      "loss": 7.3157,
+      "step": 162
+    },
+    {
+      "epoch": 0.00163,
+      "grad_norm": 0.645023763179779,
+      "learning_rate": 0.0004890000000000001,
+      "loss": 7.2995,
+      "step": 163
+    },
+    {
+      "epoch": 0.00164,
+      "grad_norm": 0.47143545746803284,
+      "learning_rate": 0.000492,
+      "loss": 7.2948,
+      "step": 164
+    },
+    {
+      "epoch": 0.00165,
+      "grad_norm": 0.6931191086769104,
+      "learning_rate": 0.000495,
+      "loss": 7.2744,
+      "step": 165
+    },
+    {
+      "epoch": 0.00166,
+      "grad_norm": 0.5705839991569519,
+      "learning_rate": 0.0004980000000000001,
+      "loss": 7.2462,
+      "step": 166
+    },
+    {
+      "epoch": 0.00167,
+      "grad_norm": 0.5678435564041138,
+      "learning_rate": 0.000501,
+      "loss": 7.2388,
+      "step": 167
+    },
+    {
+      "epoch": 0.00168,
+      "grad_norm": 0.4481411278247833,
+      "learning_rate": 0.000504,
+      "loss": 7.2152,
+      "step": 168
+    },
+    {
+      "epoch": 0.00169,
+      "grad_norm": 0.5297079086303711,
+      "learning_rate": 0.0005070000000000001,
+      "loss": 7.2111,
+      "step": 169
+    },
+    {
+      "epoch": 0.0017,
+      "grad_norm": 0.5522683262825012,
+      "learning_rate": 0.00051,
+      "loss": 7.1957,
+      "step": 170
+    },
+    {
+      "epoch": 0.00171,
+      "grad_norm": 0.4941532611846924,
+      "learning_rate": 0.000513,
+      "loss": 7.1764,
+      "step": 171
+    },
+    {
+      "epoch": 0.00172,
+      "grad_norm": 0.40358924865722656,
+      "learning_rate": 0.000516,
+      "loss": 7.1752,
+      "step": 172
+    },
+    {
+      "epoch": 0.00173,
+      "grad_norm": 0.43254554271698,
+      "learning_rate": 0.0005189999999999999,
+      "loss": 7.1608,
+      "step": 173
+    },
+    {
+      "epoch": 0.00174,
+      "grad_norm": 0.49833717942237854,
+      "learning_rate": 0.000522,
+      "loss": 7.1779,
+      "step": 174
+    },
+    {
+      "epoch": 0.00175,
+      "grad_norm": 0.469341903924942,
+      "learning_rate": 0.000525,
+      "loss": 7.1406,
+      "step": 175
+    },
+    {
+      "epoch": 0.00176,
+      "grad_norm": 0.46195128560066223,
+      "learning_rate": 0.0005279999999999999,
+      "loss": 7.1165,
+      "step": 176
+    },
+    {
+      "epoch": 0.00177,
+      "grad_norm": 0.4399634003639221,
+      "learning_rate": 0.000531,
+      "loss": 7.1206,
+      "step": 177
+    },
+    {
+      "epoch": 0.00178,
+      "grad_norm": 0.42299556732177734,
+      "learning_rate": 0.000534,
+      "loss": 7.1134,
+      "step": 178
+    },
+    {
+      "epoch": 0.00179,
+      "grad_norm": 0.3987540602684021,
+      "learning_rate": 0.000537,
+      "loss": 7.1017,
+      "step": 179
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.37395715713500977,
+      "learning_rate": 0.00054,
+      "loss": 7.074,
+      "step": 180
+    },
+    {
+      "epoch": 0.00181,
+      "grad_norm": 0.3351408541202545,
+      "learning_rate": 0.000543,
+      "loss": 7.0613,
+      "step": 181
+    },
+    {
+      "epoch": 0.00182,
+      "grad_norm": 0.3589305281639099,
+      "learning_rate": 0.000546,
+      "loss": 7.0589,
+      "step": 182
+    },
+    {
+      "epoch": 0.00183,
+      "grad_norm": 0.38710907101631165,
+      "learning_rate": 0.000549,
+      "loss": 7.0486,
+      "step": 183
+    },
+    {
+      "epoch": 0.00184,
+      "grad_norm": 0.4560106098651886,
+      "learning_rate": 0.000552,
+      "loss": 7.0384,
+      "step": 184
+    },
+    {
+      "epoch": 0.00185,
+      "grad_norm": 0.8466277718544006,
+      "learning_rate": 0.000555,
+      "loss": 7.0199,
+      "step": 185
+    },
+    {
+      "epoch": 0.00186,
+      "grad_norm": 1.4107517004013062,
+      "learning_rate": 0.000558,
+      "loss": 7.0664,
+      "step": 186
+    },
+    {
+      "epoch": 0.00187,
+      "grad_norm": 0.5632089376449585,
+      "learning_rate": 0.000561,
+      "loss": 7.004,
+      "step": 187
+    },
+    {
+      "epoch": 0.00188,
+      "grad_norm": 1.07405686378479,
+      "learning_rate": 0.000564,
+      "loss": 6.9945,
+      "step": 188
+    },
+    {
+      "epoch": 0.00189,
+      "grad_norm": 1.1665420532226562,
+      "learning_rate": 0.000567,
+      "loss": 7.0095,
+      "step": 189
+    },
+    {
+      "epoch": 0.0019,
+      "grad_norm": 0.4235672950744629,
+      "learning_rate": 0.00057,
+      "loss": 6.962,
+      "step": 190
+    },
+    {
+      "epoch": 0.00191,
+      "grad_norm": 1.2953448295593262,
+      "learning_rate": 0.000573,
+      "loss": 6.9853,
+      "step": 191
+    },
+    {
+      "epoch": 0.00192,
+      "grad_norm": 0.5110867023468018,
+      "learning_rate": 0.000576,
+      "loss": 6.9512,
+      "step": 192
+    },
+    {
+      "epoch": 0.00193,
+      "grad_norm": 0.7966066002845764,
+      "learning_rate": 0.000579,
+      "loss": 6.9656,
+      "step": 193
+    },
+    {
+      "epoch": 0.00194,
+      "grad_norm": 0.5008851289749146,
+      "learning_rate": 0.000582,
+      "loss": 6.9458,
+      "step": 194
+    },
+    {
+      "epoch": 0.00195,
+      "grad_norm": 0.714582085609436,
+      "learning_rate": 0.000585,
+      "loss": 6.9325,
+      "step": 195
+    },
+    {
+      "epoch": 0.00196,
+      "grad_norm": 0.48010018467903137,
+      "learning_rate": 0.000588,
+      "loss": 6.917,
+      "step": 196
+    },
+    {
+      "epoch": 0.00197,
+      "grad_norm": 0.5283955335617065,
+      "learning_rate": 0.000591,
+      "loss": 6.9149,
+      "step": 197
+    },
+    {
+      "epoch": 0.00198,
+      "grad_norm": 0.5033705830574036,
+      "learning_rate": 0.000594,
+      "loss": 6.9041,
+      "step": 198
+    },
+    {
+      "epoch": 0.00199,
+      "grad_norm": 0.40711161494255066,
+      "learning_rate": 0.0005970000000000001,
+      "loss": 6.8819,
+      "step": 199
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.4253259003162384,
+      "learning_rate": 0.0006000000000000001,
+      "loss": 6.8842,
+      "step": 200
+    },
+    {
+      "epoch": 0.00201,
+      "grad_norm": 0.347766637802124,
+      "learning_rate": 0.000603,
+      "loss": 6.8566,
+      "step": 201
+    },
+    {
+      "epoch": 0.00202,
+      "grad_norm": 0.4044833779335022,
+      "learning_rate": 0.0006060000000000001,
+      "loss": 6.8448,
+      "step": 202
+    },
+    {
+      "epoch": 0.00203,
+      "grad_norm": 0.3598291575908661,
+      "learning_rate": 0.0006090000000000001,
+      "loss": 6.8489,
+      "step": 203
+    },
+    {
+      "epoch": 0.00204,
+      "grad_norm": 0.35803648829460144,
+      "learning_rate": 0.000612,
+      "loss": 6.8337,
+      "step": 204
+    },
+    {
+      "epoch": 0.00205,
+      "grad_norm": 0.3630695044994354,
+      "learning_rate": 0.000615,
+      "loss": 6.82,
+      "step": 205
+    },
+    {
+      "epoch": 0.00206,
+      "grad_norm": 0.3439967930316925,
+      "learning_rate": 0.000618,
+      "loss": 6.8125,
+      "step": 206
+    },
+    {
+      "epoch": 0.00207,
+      "grad_norm": 0.3219742476940155,
+      "learning_rate": 0.000621,
+      "loss": 6.8026,
+      "step": 207
+    },
+    {
+      "epoch": 0.00208,
+      "grad_norm": 0.42236459255218506,
+      "learning_rate": 0.000624,
+      "loss": 6.8006,
+      "step": 208
+    },
+    {
+      "epoch": 0.00209,
+      "grad_norm": 0.44003602862358093,
+      "learning_rate": 0.000627,
+      "loss": 6.7802,
+      "step": 209
+    },
+    {
+      "epoch": 0.0021,
+      "grad_norm": 0.6629877686500549,
+      "learning_rate": 0.00063,
+      "loss": 6.7737,
+      "step": 210
+    },
+    {
+      "epoch": 0.00211,
+      "grad_norm": 0.9569016695022583,
+      "learning_rate": 0.000633,
+      "loss": 6.7754,
+      "step": 211
+    },
+    {
+      "epoch": 0.00212,
+      "grad_norm": 1.1293443441390991,
+      "learning_rate": 0.000636,
+      "loss": 6.7746,
+      "step": 212
+    },
+    {
+      "epoch": 0.00213,
+      "grad_norm": 0.43011096119880676,
+      "learning_rate": 0.000639,
+      "loss": 6.7385,
+      "step": 213
+    },
+    {
+      "epoch": 0.00214,
+      "grad_norm": 0.6478229761123657,
+      "learning_rate": 0.000642,
+      "loss": 6.7415,
+      "step": 214
+    },
+    {
+      "epoch": 0.00215,
+      "grad_norm": 0.6323032975196838,
+      "learning_rate": 0.000645,
+      "loss": 6.7384,
+      "step": 215
+    },
+    {
+      "epoch": 0.00216,
+      "grad_norm": 0.441693514585495,
+      "learning_rate": 0.000648,
+      "loss": 6.7285,
+      "step": 216
+    },
+    {
+      "epoch": 0.00217,
+      "grad_norm": 0.5594473481178284,
+      "learning_rate": 0.000651,
+      "loss": 6.7033,
+      "step": 217
+    },
+    {
+      "epoch": 0.00218,
+      "grad_norm": 0.5135915279388428,
+      "learning_rate": 0.000654,
+      "loss": 6.7073,
+      "step": 218
+    },
+    {
+      "epoch": 0.00219,
+      "grad_norm": 0.4307027757167816,
+      "learning_rate": 0.000657,
+      "loss": 6.6782,
+      "step": 219
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.45137229561805725,
+      "learning_rate": 0.00066,
+      "loss": 6.6953,
+      "step": 220
+    },
+    {
+      "epoch": 0.00221,
+      "grad_norm": 0.4729914963245392,
+      "learning_rate": 0.0006630000000000001,
+      "loss": 6.6737,
+      "step": 221
+    },
+    {
+      "epoch": 0.00222,
+      "grad_norm": 0.47246506810188293,
+      "learning_rate": 0.000666,
+      "loss": 6.6615,
+      "step": 222
+    },
+    {
+      "epoch": 0.00223,
+      "grad_norm": 0.36316192150115967,
+      "learning_rate": 0.000669,
+      "loss": 6.6543,
+      "step": 223
+    },
+    {
+      "epoch": 0.00224,
+      "grad_norm": 0.4332623779773712,
+      "learning_rate": 0.0006720000000000001,
+      "loss": 6.6525,
+      "step": 224
+    },
+    {
+      "epoch": 0.00225,
+      "grad_norm": 0.41814228892326355,
+      "learning_rate": 0.000675,
+      "loss": 6.639,
+      "step": 225
+    },
+    {
+      "epoch": 0.00226,
+      "grad_norm": 0.42956992983818054,
+      "learning_rate": 0.000678,
+      "loss": 6.634,
+      "step": 226
+    },
+    {
+      "epoch": 0.00227,
+      "grad_norm": 0.40267884731292725,
+      "learning_rate": 0.0006810000000000001,
+      "loss": 6.6072,
+      "step": 227
+    },
+    {
+      "epoch": 0.00228,
+      "grad_norm": 0.4361991882324219,
+      "learning_rate": 0.000684,
+      "loss": 6.6099,
+      "step": 228
+    },
+    {
+      "epoch": 0.00229,
+      "grad_norm": 0.47655290365219116,
+      "learning_rate": 0.000687,
+      "loss": 6.6086,
+      "step": 229
+    },
+    {
+      "epoch": 0.0023,
+      "grad_norm": 0.5011177659034729,
+      "learning_rate": 0.0006900000000000001,
+      "loss": 6.5846,
+      "step": 230
+    },
+    {
+      "epoch": 0.00231,
+      "grad_norm": 0.5389447212219238,
+      "learning_rate": 0.000693,
+      "loss": 6.5719,
+      "step": 231
+    },
+    {
+      "epoch": 0.00232,
+      "grad_norm": 0.5394959449768066,
+      "learning_rate": 0.000696,
+      "loss": 6.5809,
+      "step": 232
+    },
+    {
+      "epoch": 0.00233,
+      "grad_norm": 0.49784839153289795,
+      "learning_rate": 0.0006990000000000001,
+      "loss": 6.5675,
+      "step": 233
+    },
+    {
+      "epoch": 0.00234,
+      "grad_norm": 0.42049404978752136,
+      "learning_rate": 0.000702,
+      "loss": 6.5581,
+      "step": 234
+    },
+    {
+      "epoch": 0.00235,
+      "grad_norm": 0.5810425281524658,
+      "learning_rate": 0.000705,
+      "loss": 6.5463,
+      "step": 235
+    },
+    {
+      "epoch": 0.00236,
+      "grad_norm": 0.6429721117019653,
+      "learning_rate": 0.000708,
+      "loss": 6.5306,
+      "step": 236
+    },
+    {
+      "epoch": 0.00237,
+      "grad_norm": 0.6626091003417969,
+      "learning_rate": 0.0007109999999999999,
+      "loss": 6.5402,
+      "step": 237
+    },
+    {
+      "epoch": 0.00238,
+      "grad_norm": 0.5873957872390747,
+      "learning_rate": 0.000714,
+      "loss": 6.5299,
+      "step": 238
+    },
+    {
+      "epoch": 0.00239,
+      "grad_norm": 0.4890768527984619,
+      "learning_rate": 0.000717,
+      "loss": 6.5085,
+      "step": 239
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.6101468801498413,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 6.5266,
+      "step": 240
+    },
+    {
+      "epoch": 0.00241,
+      "grad_norm": 0.5837545394897461,
+      "learning_rate": 0.000723,
+      "loss": 6.5125,
+      "step": 241
+    },
+    {
+      "epoch": 0.00242,
+      "grad_norm": 0.38101619482040405,
+      "learning_rate": 0.000726,
+      "loss": 6.481,
+      "step": 242
+    },
+    {
+      "epoch": 0.00243,
+      "grad_norm": 0.5516716837882996,
+      "learning_rate": 0.000729,
+      "loss": 6.4712,
+      "step": 243
+    },
+    {
+      "epoch": 0.00244,
+      "grad_norm": 0.6402163505554199,
+      "learning_rate": 0.000732,
+      "loss": 6.4787,
+      "step": 244
+    },
+    {
+      "epoch": 0.00245,
+      "grad_norm": 0.661472737789154,
+      "learning_rate": 0.000735,
+      "loss": 6.4666,
+      "step": 245
+    },
+    {
+      "epoch": 0.00246,
+      "grad_norm": 0.8242950439453125,
+      "learning_rate": 0.000738,
+      "loss": 6.457,
+      "step": 246
+    },
+    {
+      "epoch": 0.00247,
+      "grad_norm": 0.8979260921478271,
+      "learning_rate": 0.000741,
+      "loss": 6.4532,
+      "step": 247
+    },
+    {
+      "epoch": 0.00248,
+      "grad_norm": 0.7822521924972534,
+      "learning_rate": 0.000744,
+      "loss": 6.4595,
+      "step": 248
+    },
+    {
+      "epoch": 0.00249,
+      "grad_norm": 0.5830389261245728,
+      "learning_rate": 0.000747,
+      "loss": 6.4336,
+      "step": 249
+    },
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.5053834319114685,
+      "learning_rate": 0.00075,
+      "loss": 6.436,
+      "step": 250
+    },
+    {
+      "epoch": 0.00251,
+      "grad_norm": 0.6169653534889221,
+      "learning_rate": 0.000753,
+      "loss": 6.4018,
+      "step": 251
+    },
+    {
+      "epoch": 0.00252,
+      "grad_norm": 0.693821370601654,
+      "learning_rate": 0.000756,
+      "loss": 6.411,
+      "step": 252
+    },
+    {
+      "epoch": 0.00253,
+      "grad_norm": 0.7529676556587219,
+      "learning_rate": 0.000759,
+      "loss": 6.4053,
+      "step": 253
+    },
+    {
+      "epoch": 0.00254,
+      "grad_norm": 0.7882714867591858,
+      "learning_rate": 0.000762,
+      "loss": 6.3993,
+      "step": 254
+    },
+    {
+      "epoch": 0.00255,
+      "grad_norm": 0.8540387153625488,
+      "learning_rate": 0.0007650000000000001,
+      "loss": 6.399,
+      "step": 255
+    },
+    {
+      "epoch": 0.00256,
+      "grad_norm": 0.8460838198661804,
+      "learning_rate": 0.000768,
+      "loss": 6.3805,
+      "step": 256
+    },
+    {
+      "epoch": 0.00257,
+      "grad_norm": 0.7798411250114441,
+      "learning_rate": 0.000771,
+      "loss": 6.383,
+      "step": 257
+    },
+    {
+      "epoch": 0.00258,
+      "grad_norm": 0.7431406378746033,
+      "learning_rate": 0.0007740000000000001,
+      "loss": 6.362,
+      "step": 258
+    },
+    {
+      "epoch": 0.00259,
+      "grad_norm": 0.9195736646652222,
+      "learning_rate": 0.000777,
+      "loss": 6.3634,
+      "step": 259
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.9989089965820312,
+      "learning_rate": 0.0007800000000000001,
+      "loss": 6.3605,
+      "step": 260
+    },
+    {
+      "epoch": 0.00261,
+      "grad_norm": 0.8553241491317749,
+      "learning_rate": 0.0007830000000000001,
+      "loss": 6.3627,
+      "step": 261
+    },
+    {
+      "epoch": 0.00262,
+      "grad_norm": 0.8126974701881409,
+      "learning_rate": 0.000786,
+      "loss": 6.3448,
+      "step": 262
+    },
+    {
+      "epoch": 0.00263,
+      "grad_norm": 0.5453911423683167,
+      "learning_rate": 0.0007890000000000001,
+      "loss": 6.3416,
+      "step": 263
+    },
+    {
+      "epoch": 0.00264,
+      "grad_norm": 0.5339396595954895,
+      "learning_rate": 0.0007920000000000001,
+      "loss": 6.3252,
+      "step": 264
+    },
+    {
+      "epoch": 0.00265,
+      "grad_norm": 0.561769425868988,
+      "learning_rate": 0.000795,
+      "loss": 6.3181,
+      "step": 265
+    },
+    {
+      "epoch": 0.00266,
+      "grad_norm": 0.47888532280921936,
+      "learning_rate": 0.0007980000000000001,
+      "loss": 6.3105,
+      "step": 266
+    },
+    {
+      "epoch": 0.00267,
+      "grad_norm": 0.4935484528541565,
+      "learning_rate": 0.0008010000000000001,
+      "loss": 6.3049,
+      "step": 267
+    },
+    {
+      "epoch": 0.00268,
+      "grad_norm": 0.4065157175064087,
+      "learning_rate": 0.000804,
+      "loss": 6.2969,
+      "step": 268
+    },
+    {
+      "epoch": 0.00269,
+      "grad_norm": 0.5361817479133606,
+      "learning_rate": 0.0008070000000000001,
+      "loss": 6.2818,
+      "step": 269
+    },
+    {
+      "epoch": 0.0027,
+      "grad_norm": 0.6360214352607727,
+      "learning_rate": 0.0008100000000000001,
+      "loss": 6.2858,
+      "step": 270
+    },
+    {
+      "epoch": 0.00271,
+      "grad_norm": 0.6580653190612793,
+      "learning_rate": 0.000813,
+      "loss": 6.2898,
+      "step": 271
+    },
+    {
+      "epoch": 0.00272,
+      "grad_norm": 0.719866931438446,
+      "learning_rate": 0.0008160000000000001,
+      "loss": 6.2764,
+      "step": 272
+    },
+    {
+      "epoch": 0.00273,
+      "grad_norm": 0.726635754108429,
+      "learning_rate": 0.0008190000000000001,
+      "loss": 6.2654,
+      "step": 273
+    },
+    {
+      "epoch": 0.00274,
+      "grad_norm": 0.5728192925453186,
+      "learning_rate": 0.000822,
+      "loss": 6.2437,
+      "step": 274
+    },
+    {
+      "epoch": 0.00275,
+      "grad_norm": 0.469969242811203,
+      "learning_rate": 0.0008250000000000001,
+      "loss": 6.2475,
+      "step": 275
+    },
+    {
+      "epoch": 0.00276,
+      "grad_norm": 0.5783148407936096,
+      "learning_rate": 0.0008280000000000001,
+      "loss": 6.2405,
+      "step": 276
+    },
+    {
+      "epoch": 0.00277,
+      "grad_norm": 0.5995691418647766,
+      "learning_rate": 0.0008310000000000001,
+      "loss": 6.2356,
+      "step": 277
+    },
+    {
+      "epoch": 0.00278,
+      "grad_norm": 0.5173709988594055,
+      "learning_rate": 0.0008340000000000001,
+      "loss": 6.2181,
+      "step": 278
+    },
+    {
+      "epoch": 0.00279,
+      "grad_norm": 0.47013920545578003,
+      "learning_rate": 0.0008370000000000001,
+      "loss": 6.2175,
+      "step": 279
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.38072702288627625,
+      "learning_rate": 0.0008400000000000001,
+      "loss": 6.1988,
+      "step": 280
+    },
+    {
+      "epoch": 0.00281,
+      "grad_norm": 0.44907790422439575,
+      "learning_rate": 0.0008430000000000001,
+      "loss": 6.1893,
+      "step": 281
+    },
+    {
+      "epoch": 0.00282,
+      "grad_norm": 0.40965142846107483,
+      "learning_rate": 0.000846,
+      "loss": 6.1869,
+      "step": 282
+    },
+    {
+      "epoch": 0.00283,
+      "grad_norm": 0.48822489380836487,
+      "learning_rate": 0.0008489999999999999,
+      "loss": 6.1783,
+      "step": 283
+    },
+    {
+      "epoch": 0.00284,
+      "grad_norm": 0.726660966873169,
+      "learning_rate": 0.0008519999999999999,
+      "loss": 6.1771,
+      "step": 284
+    },
+    {
+      "epoch": 0.00285,
+      "grad_norm": 1.0991517305374146,
+      "learning_rate": 0.000855,
+      "loss": 6.1879,
+      "step": 285
+    },
+    {
+      "epoch": 0.00286,
+      "grad_norm": 0.9898068904876709,
+      "learning_rate": 0.0008579999999999999,
+      "loss": 6.1694,
+      "step": 286
+    },
+    {
+      "epoch": 0.00287,
+      "grad_norm": 0.9177366495132446,
+      "learning_rate": 0.000861,
+      "loss": 6.1687,
+      "step": 287
+    },
+    {
+      "epoch": 0.00288,
+      "grad_norm": 1.3172835111618042,
+      "learning_rate": 0.000864,
+      "loss": 6.175,
+      "step": 288
+    },
+    {
+      "epoch": 0.00289,
+      "grad_norm": 1.0531185865402222,
+      "learning_rate": 0.0008669999999999999,
+      "loss": 6.1733,
+      "step": 289
+    },
+    {
+      "epoch": 0.0029,
+      "grad_norm": 0.9814063310623169,
+      "learning_rate": 0.00087,
+      "loss": 6.1665,
+      "step": 290
+    },
+    {
+      "epoch": 0.00291,
+      "grad_norm": 0.7696391344070435,
+      "learning_rate": 0.000873,
+      "loss": 6.1455,
+      "step": 291
+    },
+    {
+      "epoch": 0.00292,
+      "grad_norm": 0.8531065583229065,
+      "learning_rate": 0.0008759999999999999,
+      "loss": 6.15,
+      "step": 292
+    },
+    {
+      "epoch": 0.00293,
+      "grad_norm": 0.7760049700737,
+      "learning_rate": 0.000879,
+      "loss": 6.1273,
+      "step": 293
+    },
+    {
+      "epoch": 0.00294,
+      "grad_norm": 0.7517282366752625,
+      "learning_rate": 0.000882,
+      "loss": 6.1432,
+      "step": 294
+    },
+    {
+      "epoch": 0.00295,
+      "grad_norm": 0.5758442282676697,
+      "learning_rate": 0.0008849999999999999,
+      "loss": 6.106,
+      "step": 295
+    },
+    {
+      "epoch": 0.00296,
+      "grad_norm": 0.5470280647277832,
+      "learning_rate": 0.000888,
+      "loss": 6.1152,
+      "step": 296
+    },
+    {
+      "epoch": 0.00297,
+      "grad_norm": 0.46315857768058777,
+      "learning_rate": 0.000891,
+      "loss": 6.0848,
+      "step": 297
+    },
+    {
+      "epoch": 0.00298,
+      "grad_norm": 0.52577805519104,
+      "learning_rate": 0.0008939999999999999,
+      "loss": 6.0996,
+      "step": 298
+    },
+    {
+      "epoch": 0.00299,
+      "grad_norm": 0.5289214253425598,
+      "learning_rate": 0.000897,
+      "loss": 6.0753,
+      "step": 299
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.39721718430519104,
+      "learning_rate": 0.0009,
+      "loss": 6.0729,
+      "step": 300
+    },
+    {
+      "epoch": 0.00301,
+      "grad_norm": 0.42188870906829834,
+      "learning_rate": 0.0009029999999999999,
+      "loss": 6.0649,
+      "step": 301
+    },
+    {
+      "epoch": 0.00302,
+      "grad_norm": 0.49844104051589966,
+      "learning_rate": 0.000906,
+      "loss": 6.0504,
+      "step": 302
+    },
+    {
+      "epoch": 0.00303,
+      "grad_norm": 0.5113502144813538,
+      "learning_rate": 0.000909,
+      "loss": 6.0629,
+      "step": 303
+    },
+    {
+      "epoch": 0.00304,
+      "grad_norm": 0.6390882730484009,
+      "learning_rate": 0.000912,
+      "loss": 6.0593,
+      "step": 304
+    },
+    {
+      "epoch": 0.00305,
+      "grad_norm": 0.8851528763771057,
+      "learning_rate": 0.000915,
+      "loss": 6.0668,
+      "step": 305
+    },
+    {
+      "epoch": 0.00306,
+      "grad_norm": 0.9017530083656311,
+      "learning_rate": 0.000918,
+      "loss": 6.0552,
+      "step": 306
+    },
+    {
+      "epoch": 0.00307,
+      "grad_norm": 0.563444197177887,
+      "learning_rate": 0.000921,
+      "loss": 6.0386,
+      "step": 307
+    },
+    {
+      "epoch": 0.00308,
+      "grad_norm": 0.6002116203308105,
+      "learning_rate": 0.000924,
+      "loss": 6.0248,
+      "step": 308
+    },
+    {
+      "epoch": 0.00309,
+      "grad_norm": 0.7035393118858337,
+      "learning_rate": 0.000927,
+      "loss": 6.0412,
+      "step": 309
+    },
+    {
+      "epoch": 0.0031,
+      "grad_norm": 1.01050865650177,
+      "learning_rate": 0.00093,
+      "loss": 6.0314,
+      "step": 310
+    },
+    {
+      "epoch": 0.00311,
+      "grad_norm": 1.16908860206604,
+      "learning_rate": 0.000933,
+      "loss": 6.0397,
+      "step": 311
+    },
+    {
+      "epoch": 0.00312,
+      "grad_norm": 0.6785458922386169,
+      "learning_rate": 0.000936,
+      "loss": 6.0006,
+      "step": 312
+    },
+    {
+      "epoch": 0.00313,
+      "grad_norm": 0.7975029349327087,
+      "learning_rate": 0.0009390000000000001,
+      "loss": 6.0267,
+      "step": 313
+    },
+    {
+      "epoch": 0.00314,
+      "grad_norm": 0.6784241795539856,
+      "learning_rate": 0.000942,
+      "loss": 6.0253,
+      "step": 314
+    },
+    {
+      "epoch": 0.00315,
+      "grad_norm": 0.5287242531776428,
+      "learning_rate": 0.000945,
+      "loss": 5.9989,
+      "step": 315
+    },
+    {
+      "epoch": 0.00316,
+      "grad_norm": 0.5889810919761658,
+      "learning_rate": 0.0009480000000000001,
+      "loss": 5.9738,
+      "step": 316
+    },
+    {
+      "epoch": 0.00317,
+      "grad_norm": 0.8596201539039612,
+      "learning_rate": 0.000951,
+      "loss": 5.9963,
+      "step": 317
+    },
+    {
+      "epoch": 0.00318,
+      "grad_norm": 1.220719575881958,
+      "learning_rate": 0.000954,
+      "loss": 6.0052,
+      "step": 318
+    },
+    {
+      "epoch": 0.00319,
+      "grad_norm": 0.7490801215171814,
+      "learning_rate": 0.0009570000000000001,
+      "loss": 5.9868,
+      "step": 319
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.6210083365440369,
+      "learning_rate": 0.00096,
+      "loss": 5.9801,
+      "step": 320
+    },
+    {
+      "epoch": 0.00321,
+      "grad_norm": 0.5945920944213867,
+      "learning_rate": 0.000963,
+      "loss": 5.97,
+      "step": 321
+    },
+    {
+      "epoch": 0.00322,
+      "grad_norm": 0.6791667342185974,
+      "learning_rate": 0.0009660000000000001,
+      "loss": 5.9544,
+      "step": 322
+    },
+    {
+      "epoch": 0.00323,
+      "grad_norm": 0.9637515544891357,
+      "learning_rate": 0.000969,
+      "loss": 5.9627,
+      "step": 323
+    },
+    {
+      "epoch": 0.00324,
+      "grad_norm": 1.1402119398117065,
+      "learning_rate": 0.0009720000000000001,
+      "loss": 5.972,
+      "step": 324
+    },
+    {
+      "epoch": 0.00325,
+      "grad_norm": 1.0057023763656616,
+      "learning_rate": 0.0009750000000000001,
+      "loss": 5.9423,
+      "step": 325
+    },
+    {
+      "epoch": 0.00326,
+      "grad_norm": 0.5953328609466553,
+      "learning_rate": 0.0009780000000000001,
+      "loss": 5.942,
+      "step": 326
+    },
+    {
+      "epoch": 0.00327,
+      "grad_norm": 0.7124008536338806,
+      "learning_rate": 0.000981,
+      "loss": 5.9465,
+      "step": 327
+    },
+    {
+      "epoch": 0.00328,
+      "grad_norm": 0.7318410277366638,
+      "learning_rate": 0.000984,
+      "loss": 5.9241,
+      "step": 328
+    },
+    {
+      "epoch": 0.00329,
+      "grad_norm": 0.6503687500953674,
+      "learning_rate": 0.000987,
+      "loss": 5.922,
+      "step": 329
+    },
+    {
+      "epoch": 0.0033,
+      "grad_norm": 0.6151977181434631,
+      "learning_rate": 0.00099,
+      "loss": 5.9159,
+      "step": 330
+    },
+    {
+      "epoch": 0.00331,
+      "grad_norm": 0.49422070384025574,
+      "learning_rate": 0.0009930000000000002,
+      "loss": 5.9144,
+      "step": 331
+    },
+    {
+      "epoch": 0.00332,
+      "grad_norm": 0.563934326171875,
+      "learning_rate": 0.0009960000000000001,
+      "loss": 5.9008,
+      "step": 332
+    },
+    {
+      "epoch": 0.00333,
+      "grad_norm": 0.5146680474281311,
+      "learning_rate": 0.000999,
+      "loss": 5.8812,
+      "step": 333
+    },
+    {
+      "epoch": 0.00334,
+      "grad_norm": 0.5699781775474548,
+      "learning_rate": 0.001002,
+      "loss": 5.8922,
+      "step": 334
+    },
+    {
+      "epoch": 0.00335,
+      "grad_norm": 0.628279983997345,
+      "learning_rate": 0.001005,
+      "loss": 5.8934,
+      "step": 335
+    },
+    {
+      "epoch": 0.00336,
+      "grad_norm": 0.638155996799469,
+      "learning_rate": 0.001008,
+      "loss": 5.8854,
+      "step": 336
+    },
+    {
+      "epoch": 0.00337,
+      "grad_norm": 0.5850276947021484,
+      "learning_rate": 0.0010110000000000002,
+      "loss": 5.8631,
+      "step": 337
+    },
+    {
+      "epoch": 0.00338,
+      "grad_norm": 0.5985286831855774,
+      "learning_rate": 0.0010140000000000001,
+      "loss": 5.879,
+      "step": 338
+    },
+    {
+      "epoch": 0.00339,
+      "grad_norm": 0.9502546787261963,
+      "learning_rate": 0.0010170000000000001,
+      "loss": 5.8893,
+      "step": 339
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 1.3471951484680176,
+      "learning_rate": 0.00102,
+      "loss": 5.8789,
+      "step": 340
+    },
+    {
+      "epoch": 0.00341,
+      "grad_norm": 0.6621291041374207,
+      "learning_rate": 0.001023,
+      "loss": 5.8449,
+      "step": 341
+    },
+    {
+      "epoch": 0.00342,
+      "grad_norm": 0.8053567409515381,
+      "learning_rate": 0.001026,
+      "loss": 5.8551,
+      "step": 342
+    },
+    {
+      "epoch": 0.00343,
+      "grad_norm": 1.0873545408248901,
+      "learning_rate": 0.0010290000000000002,
+      "loss": 5.8656,
+      "step": 343
+    },
+    {
+      "epoch": 0.00344,
+      "grad_norm": 1.1315717697143555,
+      "learning_rate": 0.001032,
+      "loss": 5.8613,
+      "step": 344
+    },
+    {
+      "epoch": 0.00345,
+      "grad_norm": 0.7909812927246094,
+      "learning_rate": 0.001035,
+      "loss": 5.8449,
+      "step": 345
+    },
+    {
+      "epoch": 0.00346,
+      "grad_norm": 0.8379076719284058,
+      "learning_rate": 0.0010379999999999999,
+      "loss": 5.8518,
+      "step": 346
+    },
+    {
+      "epoch": 0.00347,
+      "grad_norm": 0.5857303738594055,
+      "learning_rate": 0.001041,
+      "loss": 5.8133,
+      "step": 347
+    },
+    {
+      "epoch": 0.00348,
+      "grad_norm": 0.6455392241477966,
+      "learning_rate": 0.001044,
+      "loss": 5.8262,
+      "step": 348
+    },
+    {
+      "epoch": 0.00349,
+      "grad_norm": 0.6315843462944031,
+      "learning_rate": 0.001047,
+      "loss": 5.8046,
+      "step": 349
+    },
+    {
+      "epoch": 0.0035,
+      "grad_norm": 0.6185011863708496,
+      "learning_rate": 0.00105,
+      "loss": 5.8282,
+      "step": 350
+    },
+    {
+      "epoch": 0.00351,
+      "grad_norm": 0.49840763211250305,
+      "learning_rate": 0.001053,
+      "loss": 5.8234,
+      "step": 351
+    },
+    {
+      "epoch": 0.00352,
+      "grad_norm": 0.43951740860939026,
+      "learning_rate": 0.0010559999999999999,
+      "loss": 5.7984,
+      "step": 352
+    },
+    {
+      "epoch": 0.00353,
+      "grad_norm": 0.4452185034751892,
+      "learning_rate": 0.001059,
+      "loss": 5.7993,
+      "step": 353
+    },
+    {
+      "epoch": 0.00354,
+      "grad_norm": 0.43185603618621826,
+      "learning_rate": 0.001062,
+      "loss": 5.7703,
+      "step": 354
+    },
+    {
+      "epoch": 0.00355,
+      "grad_norm": 0.4053448736667633,
+      "learning_rate": 0.001065,
+      "loss": 5.7848,
+      "step": 355
+    },
+    {
+      "epoch": 0.00356,
+      "grad_norm": 0.48363247513771057,
+      "learning_rate": 0.001068,
+      "loss": 5.777,
+      "step": 356
+    },
+    {
+      "epoch": 0.00357,
+      "grad_norm": 0.5601730346679688,
+      "learning_rate": 0.001071,
+      "loss": 5.7693,
+      "step": 357
+    },
+    {
+      "epoch": 0.00358,
+      "grad_norm": 0.7239671349525452,
+      "learning_rate": 0.001074,
+      "loss": 5.7638,
+      "step": 358
+    },
+    {
+      "epoch": 0.00359,
+      "grad_norm": 0.8531132936477661,
+      "learning_rate": 0.001077,
+      "loss": 5.76,
+      "step": 359
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 1.0332695245742798,
+      "learning_rate": 0.00108,
+      "loss": 5.7701,
+      "step": 360
+    },
+    {
+      "epoch": 0.00361,
+      "grad_norm": 1.3098387718200684,
+      "learning_rate": 0.001083,
+      "loss": 5.7755,
+      "step": 361
+    },
+    {
+      "epoch": 0.00362,
+      "grad_norm": 0.7546947598457336,
+      "learning_rate": 0.001086,
+      "loss": 5.7474,
+      "step": 362
+    },
+    {
+      "epoch": 0.00363,
+      "grad_norm": 0.6924042701721191,
+      "learning_rate": 0.001089,
+      "loss": 5.7394,
+      "step": 363
+    },
+    {
+      "epoch": 0.00364,
+      "grad_norm": 0.87959223985672,
+      "learning_rate": 0.001092,
+      "loss": 5.7603,
+      "step": 364
+    },
+    {
+      "epoch": 0.00365,
+      "grad_norm": 1.037275791168213,
+      "learning_rate": 0.001095,
+      "loss": 5.7581,
+      "step": 365
+    },
+    {
+      "epoch": 0.00366,
+      "grad_norm": 1.056171178817749,
+      "learning_rate": 0.001098,
+      "loss": 5.7464,
+      "step": 366
+    },
+    {
+      "epoch": 0.00367,
+      "grad_norm": 1.0635496377944946,
+      "learning_rate": 0.001101,
+      "loss": 5.7478,
+      "step": 367
+    },
+    {
+      "epoch": 0.00368,
+      "grad_norm": 0.9701796174049377,
+      "learning_rate": 0.001104,
+      "loss": 5.7478,
+      "step": 368
+    },
+    {
+      "epoch": 0.00369,
+      "grad_norm": 0.7430213689804077,
+      "learning_rate": 0.001107,
+      "loss": 5.745,
+      "step": 369
+    },
+    {
+      "epoch": 0.0037,
+      "grad_norm": 0.7348084449768066,
+      "learning_rate": 0.00111,
+      "loss": 5.7264,
+      "step": 370
+    },
+    {
+      "epoch": 0.00371,
+      "grad_norm": 0.8778790831565857,
+      "learning_rate": 0.001113,
+      "loss": 5.6986,
+      "step": 371
+    },
+    {
+      "epoch": 0.00372,
+      "grad_norm": 1.160132884979248,
+      "learning_rate": 0.001116,
+      "loss": 5.7343,
+      "step": 372
+    },
+    {
+      "epoch": 0.00373,
+      "grad_norm": 0.8288450241088867,
+      "learning_rate": 0.001119,
+      "loss": 5.7145,
+      "step": 373
+    },
+    {
+      "epoch": 0.00374,
+      "grad_norm": 0.7645081281661987,
+      "learning_rate": 0.001122,
+      "loss": 5.7142,
+      "step": 374
+    },
+    {
+      "epoch": 0.00375,
+      "grad_norm": 0.8342962265014648,
+      "learning_rate": 0.0011250000000000001,
+      "loss": 5.7215,
+      "step": 375
+    },
+    {
+      "epoch": 0.00376,
+      "grad_norm": 0.8966416716575623,
+      "learning_rate": 0.001128,
+      "loss": 5.7142,
+      "step": 376
+    },
+    {
+      "epoch": 0.00377,
+      "grad_norm": 1.1411352157592773,
+      "learning_rate": 0.001131,
+      "loss": 5.7202,
+      "step": 377
+    },
+    {
+      "epoch": 0.00378,
+      "grad_norm": 1.0639731884002686,
+      "learning_rate": 0.001134,
+      "loss": 5.7166,
+      "step": 378
+    },
+    {
+      "epoch": 0.00379,
+      "grad_norm": 1.0386251211166382,
+      "learning_rate": 0.001137,
+      "loss": 5.701,
+      "step": 379
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.8551567196846008,
+      "learning_rate": 0.00114,
+      "loss": 5.7166,
+      "step": 380
+    },
+    {
+      "epoch": 0.00381,
+      "grad_norm": 1.171457290649414,
+      "learning_rate": 0.0011430000000000001,
+      "loss": 5.693,
+      "step": 381
+    },
+    {
+      "epoch": 0.00382,
+      "grad_norm": 0.9382472634315491,
+      "learning_rate": 0.001146,
+      "loss": 5.6768,
+      "step": 382
+    },
+    {
+      "epoch": 0.00383,
+      "grad_norm": 1.011130452156067,
+      "learning_rate": 0.001149,
+      "loss": 5.6945,
+      "step": 383
+    },
+    {
+      "epoch": 0.00384,
+      "grad_norm": 0.7897657155990601,
+      "learning_rate": 0.001152,
+      "loss": 5.6883,
+      "step": 384
+    },
+    {
+      "epoch": 0.00385,
+      "grad_norm": 0.6210044622421265,
+      "learning_rate": 0.001155,
+      "loss": 5.673,
+      "step": 385
+    },
+    {
+      "epoch": 0.00386,
+      "grad_norm": 0.5515205264091492,
+      "learning_rate": 0.001158,
+      "loss": 5.6723,
+      "step": 386
+    },
+    {
+      "epoch": 0.00387,
+      "grad_norm": 0.5881383419036865,
+      "learning_rate": 0.0011610000000000001,
+      "loss": 5.6493,
+      "step": 387
+    },
+    {
+      "epoch": 0.00388,
+      "grad_norm": 0.4862520396709442,
+      "learning_rate": 0.001164,
+      "loss": 5.6435,
+      "step": 388
+    },
+    {
+      "epoch": 0.00389,
+      "grad_norm": 0.4490566551685333,
+      "learning_rate": 0.001167,
+      "loss": 5.649,
+      "step": 389
+    },
+    {
+      "epoch": 0.0039,
+      "grad_norm": 0.4499252140522003,
+      "learning_rate": 0.00117,
+      "loss": 5.6299,
+      "step": 390
+    },
+    {
+      "epoch": 0.00391,
+      "grad_norm": 0.3959794044494629,
+      "learning_rate": 0.001173,
+      "loss": 5.6342,
+      "step": 391
+    },
+    {
+      "epoch": 0.00392,
+      "grad_norm": 0.41570809483528137,
+      "learning_rate": 0.001176,
+      "loss": 5.6441,
+      "step": 392
+    },
+    {
+      "epoch": 0.00393,
+      "grad_norm": 0.4579019248485565,
+      "learning_rate": 0.0011790000000000001,
+      "loss": 5.615,
+      "step": 393
+    },
+    {
+      "epoch": 0.00394,
+      "grad_norm": 0.5718971490859985,
+      "learning_rate": 0.001182,
+      "loss": 5.6031,
+      "step": 394
+    },
+    {
+      "epoch": 0.00395,
+      "grad_norm": 0.8492469191551208,
+      "learning_rate": 0.001185,
+      "loss": 5.5961,
+      "step": 395
+    },
+    {
+      "epoch": 0.00396,
+      "grad_norm": 1.173663854598999,
+      "learning_rate": 0.001188,
+      "loss": 5.6242,
+      "step": 396
+    },
+    {
+      "epoch": 0.00397,
+      "grad_norm": 0.828730046749115,
+      "learning_rate": 0.001191,
+      "loss": 5.6071,
+      "step": 397
+    },
+    {
+      "epoch": 0.00398,
+      "grad_norm": 1.2022807598114014,
+      "learning_rate": 0.0011940000000000002,
+      "loss": 5.6278,
+      "step": 398
+    },
+    {
+      "epoch": 0.00399,
+      "grad_norm": 0.9577529430389404,
+      "learning_rate": 0.0011970000000000001,
+      "loss": 5.6234,
+      "step": 399
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 1.0115303993225098,
+      "learning_rate": 0.0012000000000000001,
+      "loss": 5.6174,
+      "step": 400
+    },
+    {
+      "epoch": 0.00401,
+      "grad_norm": 0.9447324872016907,
+      "learning_rate": 0.001203,
+      "loss": 5.594,
+      "step": 401
+    },
+    {
+      "epoch": 0.00402,
+      "grad_norm": 0.8248692154884338,
+      "learning_rate": 0.001206,
+      "loss": 5.5987,
+      "step": 402
+    },
+    {
+      "epoch": 0.00403,
+      "grad_norm": 1.0206302404403687,
+      "learning_rate": 0.001209,
+      "loss": 5.6026,
+      "step": 403
+    },
+    {
+      "epoch": 0.00404,
+      "grad_norm": 1.090654730796814,
+      "learning_rate": 0.0012120000000000002,
+      "loss": 5.6036,
+      "step": 404
+    },
+    {
+      "epoch": 0.00405,
+      "grad_norm": 0.84739750623703,
+      "learning_rate": 0.0012150000000000002,
+      "loss": 5.5901,
+      "step": 405
+    },
+    {
+      "epoch": 0.00406,
+      "grad_norm": 0.9469770193099976,
+      "learning_rate": 0.0012180000000000001,
+      "loss": 5.5817,
+      "step": 406
+    },
+    {
+      "epoch": 0.00407,
+      "grad_norm": 0.9283969402313232,
+      "learning_rate": 0.0012209999999999999,
+      "loss": 5.5746,
+      "step": 407
+    },
+    {
+      "epoch": 0.00408,
+      "grad_norm": 0.8949937224388123,
+      "learning_rate": 0.001224,
+      "loss": 5.5962,
+      "step": 408
+    },
+    {
+      "epoch": 0.00409,
+      "grad_norm": 0.8844306468963623,
+      "learning_rate": 0.001227,
+      "loss": 5.573,
+      "step": 409
+    },
+    {
+      "epoch": 0.0041,
+      "grad_norm": 0.8062122464179993,
+      "learning_rate": 0.00123,
+      "loss": 5.5618,
+      "step": 410
+    },
+    {
+      "epoch": 0.00411,
+      "grad_norm": 0.8730618357658386,
+      "learning_rate": 0.001233,
+      "loss": 5.5872,
+      "step": 411
+    },
+    {
+      "epoch": 0.00412,
+      "grad_norm": 0.665547788143158,
+      "learning_rate": 0.001236,
+      "loss": 5.5644,
+      "step": 412
+    },
+    {
+      "epoch": 0.00413,
+      "grad_norm": 0.8426138758659363,
+      "learning_rate": 0.0012389999999999999,
+      "loss": 5.5674,
+      "step": 413
+    },
+    {
+      "epoch": 0.00414,
+      "grad_norm": 1.051952838897705,
+      "learning_rate": 0.001242,
+      "loss": 5.5591,
+      "step": 414
+    },
+    {
+      "epoch": 0.00415,
+      "grad_norm": 0.7785534858703613,
+      "learning_rate": 0.001245,
+      "loss": 5.5542,
+      "step": 415
+    },
+    {
+      "epoch": 0.00416,
+      "grad_norm": 0.5227160453796387,
+      "learning_rate": 0.001248,
+      "loss": 5.5279,
+      "step": 416
+    },
+    {
+      "epoch": 0.00417,
+      "grad_norm": 0.5770328044891357,
+      "learning_rate": 0.001251,
+      "loss": 5.5611,
+      "step": 417
+    },
+    {
+      "epoch": 0.00418,
+      "grad_norm": 0.4929839074611664,
+      "learning_rate": 0.001254,
+      "loss": 5.5305,
+      "step": 418
+    },
+    {
+      "epoch": 0.00419,
+      "grad_norm": 0.4660792648792267,
+      "learning_rate": 0.0012569999999999999,
+      "loss": 5.5168,
+      "step": 419
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.5160586833953857,
+      "learning_rate": 0.00126,
+      "loss": 5.5326,
+      "step": 420
+    },
+    {
+      "epoch": 0.00421,
+      "grad_norm": 0.5846797823905945,
+      "learning_rate": 0.001263,
+      "loss": 5.5249,
+      "step": 421
+    },
+    {
+      "epoch": 0.00422,
+      "grad_norm": 0.6270997524261475,
+      "learning_rate": 0.001266,
+      "loss": 5.5159,
+      "step": 422
+    },
+    {
+      "epoch": 0.00423,
+      "grad_norm": 0.6081735491752625,
+      "learning_rate": 0.001269,
+      "loss": 5.5118,
+      "step": 423
+    },
+    {
+      "epoch": 0.00424,
+      "grad_norm": 0.557420551776886,
+      "learning_rate": 0.001272,
+      "loss": 5.5105,
+      "step": 424
+    },
+    {
+      "epoch": 0.00425,
+      "grad_norm": 0.821638286113739,
+      "learning_rate": 0.001275,
+      "loss": 5.5176,
+      "step": 425
+    },
+    {
+      "epoch": 0.00426,
+      "grad_norm": 1.0497279167175293,
+      "learning_rate": 0.001278,
+      "loss": 5.5294,
+      "step": 426
+    },
+    {
+      "epoch": 0.00427,
+      "grad_norm": 0.8568355441093445,
+      "learning_rate": 0.001281,
+      "loss": 5.5072,
+      "step": 427
+    },
+    {
+      "epoch": 0.00428,
+      "grad_norm": 0.9392327070236206,
+      "learning_rate": 0.001284,
+      "loss": 5.488,
+      "step": 428
+    },
+    {
+      "epoch": 0.00429,
+      "grad_norm": 0.8972091674804688,
+      "learning_rate": 0.001287,
+      "loss": 5.5041,
+      "step": 429
+    },
+    {
+      "epoch": 0.0043,
+      "grad_norm": 0.8478754162788391,
+      "learning_rate": 0.00129,
+      "loss": 5.4993,
+      "step": 430
+    },
+    {
+      "epoch": 0.00431,
+      "grad_norm": 1.0406945943832397,
+      "learning_rate": 0.001293,
+      "loss": 5.4959,
+      "step": 431
+    },
+    {
+      "epoch": 0.00432,
+      "grad_norm": 0.8937470316886902,
+      "learning_rate": 0.001296,
+      "loss": 5.5023,
+      "step": 432
+    },
+    {
+      "epoch": 0.00433,
+      "grad_norm": 0.7159745693206787,
+      "learning_rate": 0.001299,
+      "loss": 5.4892,
+      "step": 433
+    },
+    {
+      "epoch": 0.00434,
+      "grad_norm": 0.6872638463973999,
+      "learning_rate": 0.001302,
+      "loss": 5.4897,
+      "step": 434
+    },
+    {
+      "epoch": 0.00435,
+      "grad_norm": 0.7336323857307434,
+      "learning_rate": 0.001305,
+      "loss": 5.4832,
+      "step": 435
+    },
+    {
+      "epoch": 0.00436,
+      "grad_norm": 0.727497935295105,
+      "learning_rate": 0.001308,
+      "loss": 5.4615,
+      "step": 436
+    },
+    {
+      "epoch": 0.00437,
+      "grad_norm": 0.8729015588760376,
+      "learning_rate": 0.001311,
+      "loss": 5.4821,
+      "step": 437
+    },
+    {
+      "epoch": 0.00438,
+      "grad_norm": 1.0639538764953613,
+      "learning_rate": 0.001314,
+      "loss": 5.4733,
+      "step": 438
+    },
+    {
+      "epoch": 0.00439,
+      "grad_norm": 0.9613514542579651,
+      "learning_rate": 0.001317,
+      "loss": 5.5005,
+      "step": 439
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.7993902564048767,
+      "learning_rate": 0.00132,
+      "loss": 5.4683,
+      "step": 440
+    },
+    {
+      "epoch": 0.00441,
+      "grad_norm": 0.6981948018074036,
+      "learning_rate": 0.001323,
+      "loss": 5.4617,
+      "step": 441
+    },
+    {
+      "epoch": 0.00442,
+      "grad_norm": 0.5610657334327698,
+      "learning_rate": 0.0013260000000000001,
+      "loss": 5.4493,
+      "step": 442
+    },
+    {
+      "epoch": 0.00443,
+      "grad_norm": 0.6162020564079285,
+      "learning_rate": 0.001329,
+      "loss": 5.4545,
+      "step": 443
+    },
+    {
+      "epoch": 0.00444,
+      "grad_norm": 0.5797529816627502,
+      "learning_rate": 0.001332,
+      "loss": 5.4538,
+      "step": 444
+    },
+    {
+      "epoch": 0.00445,
+      "grad_norm": 0.551798939704895,
+      "learning_rate": 0.001335,
+      "loss": 5.4358,
+      "step": 445
+    },
+    {
+      "epoch": 0.00446,
+      "grad_norm": 0.48300743103027344,
+      "learning_rate": 0.001338,
+      "loss": 5.4369,
+      "step": 446
+    },
+    {
+      "epoch": 0.00447,
+      "grad_norm": 0.5713039040565491,
+      "learning_rate": 0.001341,
+      "loss": 5.4366,
+      "step": 447
+    },
+    {
+      "epoch": 0.00448,
+      "grad_norm": 0.7566826939582825,
+      "learning_rate": 0.0013440000000000001,
+      "loss": 5.4234,
+      "step": 448
+    },
+    {
+      "epoch": 0.00449,
+      "grad_norm": 1.1563501358032227,
+      "learning_rate": 0.001347,
+      "loss": 5.4418,
+      "step": 449
+    },
+    {
+      "epoch": 0.0045,
+      "grad_norm": 1.132352590560913,
+      "learning_rate": 0.00135,
+      "loss": 5.4433,
+      "step": 450
+    },
+    {
+      "epoch": 0.00451,
+      "grad_norm": 1.0986182689666748,
+      "learning_rate": 0.001353,
+      "loss": 5.4341,
+      "step": 451
+    },
+    {
+      "epoch": 0.00452,
+      "grad_norm": 1.066072702407837,
+      "learning_rate": 0.001356,
+      "loss": 5.4212,
+      "step": 452
+    },
+    {
+      "epoch": 0.00453,
+      "grad_norm": 0.9297358989715576,
+      "learning_rate": 0.001359,
+      "loss": 5.4103,
+      "step": 453
+    },
+    {
+      "epoch": 0.00454,
+      "grad_norm": 0.9204379320144653,
+      "learning_rate": 0.0013620000000000001,
+      "loss": 5.4294,
+      "step": 454
+    },
+    {
+      "epoch": 0.00455,
+      "grad_norm": 1.0156644582748413,
+      "learning_rate": 0.0013650000000000001,
+      "loss": 5.4283,
+      "step": 455
+    },
+    {
+      "epoch": 0.00456,
+      "grad_norm": 1.070080041885376,
+      "learning_rate": 0.001368,
+      "loss": 5.4307,
+      "step": 456
+    },
+    {
+      "epoch": 0.00457,
+      "grad_norm": 0.7985509634017944,
+      "learning_rate": 0.001371,
+      "loss": 5.4131,
+      "step": 457
+    },
+    {
+      "epoch": 0.00458,
+      "grad_norm": 0.8899184465408325,
+      "learning_rate": 0.001374,
+      "loss": 5.4253,
+      "step": 458
+    },
+    {
+      "epoch": 0.00459,
+      "grad_norm": 1.0247424840927124,
+      "learning_rate": 0.0013770000000000002,
+      "loss": 5.4196,
+      "step": 459
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.8818691968917847,
+      "learning_rate": 0.0013800000000000002,
+      "loss": 5.4023,
+      "step": 460
+    },
+    {
+      "epoch": 0.00461,
+      "grad_norm": 0.9398977160453796,
+      "learning_rate": 0.0013830000000000001,
+      "loss": 5.4028,
+      "step": 461
+    },
+    {
+      "epoch": 0.00462,
+      "grad_norm": 0.9095609784126282,
+      "learning_rate": 0.001386,
+      "loss": 5.4141,
+      "step": 462
+    },
+    {
+      "epoch": 0.00463,
+      "grad_norm": 0.7167875170707703,
+      "learning_rate": 0.001389,
+      "loss": 5.3816,
+      "step": 463
+    },
+    {
+      "epoch": 0.00464,
+      "grad_norm": 0.8864797353744507,
+      "learning_rate": 0.001392,
+      "loss": 5.4115,
+      "step": 464
+    },
+    {
+      "epoch": 0.00465,
+      "grad_norm": 0.8739372491836548,
+      "learning_rate": 0.0013950000000000002,
+      "loss": 5.3819,
+      "step": 465
+    },
+    {
+      "epoch": 0.00466,
+      "grad_norm": 0.808113157749176,
+      "learning_rate": 0.0013980000000000002,
+      "loss": 5.3947,
+      "step": 466
+    },
+    {
+      "epoch": 0.00467,
+      "grad_norm": 0.878140389919281,
+      "learning_rate": 0.0014010000000000001,
+      "loss": 5.3861,
+      "step": 467
+    },
+    {
+      "epoch": 0.00468,
+      "grad_norm": 1.0618577003479004,
+      "learning_rate": 0.001404,
+      "loss": 5.3874,
+      "step": 468
+    },
+    {
+      "epoch": 0.00469,
+      "grad_norm": 0.8678603172302246,
+      "learning_rate": 0.001407,
+      "loss": 5.3793,
+      "step": 469
+    },
+    {
+      "epoch": 0.0047,
+      "grad_norm": 1.0598393678665161,
+      "learning_rate": 0.00141,
+      "loss": 5.397,
+      "step": 470
+    },
+    {
+      "epoch": 0.00471,
+      "grad_norm": 0.9507164359092712,
+      "learning_rate": 0.001413,
+      "loss": 5.3744,
+      "step": 471
+    },
+    {
+      "epoch": 0.00472,
+      "grad_norm": 0.891309916973114,
+      "learning_rate": 0.001416,
+      "loss": 5.3876,
+      "step": 472
+    },
+    {
+      "epoch": 0.00473,
+      "grad_norm": 0.9032427072525024,
+      "learning_rate": 0.001419,
+      "loss": 5.3933,
+      "step": 473
+    },
+    {
+      "epoch": 0.00474,
+      "grad_norm": 1.2588310241699219,
+      "learning_rate": 0.0014219999999999999,
+      "loss": 5.3882,
+      "step": 474
+    },
+    {
+      "epoch": 0.00475,
+      "grad_norm": 0.8014562129974365,
+      "learning_rate": 0.001425,
+      "loss": 5.367,
+      "step": 475
+    },
+    {
+      "epoch": 0.00476,
+      "grad_norm": 0.7612058520317078,
+      "learning_rate": 0.001428,
+      "loss": 5.3988,
+      "step": 476
+    },
+    {
+      "epoch": 0.00477,
+      "grad_norm": 0.6699860095977783,
+      "learning_rate": 0.001431,
+      "loss": 5.3462,
+      "step": 477
+    },
+    {
+      "epoch": 0.00478,
+      "grad_norm": 0.7476372718811035,
+      "learning_rate": 0.001434,
+      "loss": 5.3761,
+      "step": 478
+    },
+    {
+      "epoch": 0.00479,
+      "grad_norm": 0.7134982347488403,
+      "learning_rate": 0.001437,
+      "loss": 5.3522,
+      "step": 479
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.5948371887207031,
+      "learning_rate": 0.0014399999999999999,
+      "loss": 5.343,
+      "step": 480
+    },
+    {
+      "epoch": 0.00481,
+      "grad_norm": 0.4980184733867645,
+      "learning_rate": 0.001443,
+      "loss": 5.3421,
+      "step": 481
+    },
+    {
+      "epoch": 0.00482,
+      "grad_norm": 0.49235713481903076,
+      "learning_rate": 0.001446,
+      "loss": 5.3164,
+      "step": 482
+    },
+    {
+      "epoch": 0.00483,
+      "grad_norm": 0.5064442157745361,
+      "learning_rate": 0.001449,
+      "loss": 5.327,
+      "step": 483
+    },
+    {
+      "epoch": 0.00484,
+      "grad_norm": 0.506096601486206,
+      "learning_rate": 0.001452,
+      "loss": 5.3265,
+      "step": 484
+    },
+    {
+      "epoch": 0.00485,
+      "grad_norm": 0.5634677410125732,
+      "learning_rate": 0.001455,
+      "loss": 5.3337,
+      "step": 485
+    },
+    {
+      "epoch": 0.00486,
+      "grad_norm": 0.5971659421920776,
+      "learning_rate": 0.001458,
+      "loss": 5.306,
+      "step": 486
+    },
+    {
+      "epoch": 0.00487,
+      "grad_norm": 0.5582761168479919,
+      "learning_rate": 0.001461,
+      "loss": 5.3211,
+      "step": 487
+    },
+    {
+      "epoch": 0.00488,
+      "grad_norm": 0.5640081763267517,
+      "learning_rate": 0.001464,
+      "loss": 5.2971,
+      "step": 488
+    },
+    {
+      "epoch": 0.00489,
+      "grad_norm": 0.6793756484985352,
+      "learning_rate": 0.001467,
+      "loss": 5.2905,
+      "step": 489
+    },
+    {
+      "epoch": 0.0049,
+      "grad_norm": 0.7497550249099731,
+      "learning_rate": 0.00147,
+      "loss": 5.3082,
+      "step": 490
+    },
+    {
+      "epoch": 0.00491,
+      "grad_norm": 0.7264507412910461,
+      "learning_rate": 0.001473,
+      "loss": 5.3062,
+      "step": 491
+    },
+    {
+      "epoch": 0.00492,
+      "grad_norm": 0.5965330004692078,
+      "learning_rate": 0.001476,
+      "loss": 5.3011,
+      "step": 492
+    },
+    {
+      "epoch": 0.00493,
+      "grad_norm": 0.8212659358978271,
+      "learning_rate": 0.001479,
+      "loss": 5.3006,
+      "step": 493
+    },
+    {
+      "epoch": 0.00494,
+      "grad_norm": 1.2706849575042725,
+      "learning_rate": 0.001482,
+      "loss": 5.3221,
+      "step": 494
+    },
+    {
+      "epoch": 0.00495,
+      "grad_norm": 0.9726585149765015,
+      "learning_rate": 0.001485,
+      "loss": 5.3189,
+      "step": 495
+    },
+    {
+      "epoch": 0.00496,
+      "grad_norm": 1.0229647159576416,
+      "learning_rate": 0.001488,
+      "loss": 5.3095,
+      "step": 496
+    },
+    {
+      "epoch": 0.00497,
+      "grad_norm": 0.9751450419425964,
+      "learning_rate": 0.001491,
+      "loss": 5.294,
+      "step": 497
+    },
+    {
+      "epoch": 0.00498,
+      "grad_norm": 0.9788212776184082,
+      "learning_rate": 0.001494,
+      "loss": 5.3219,
+      "step": 498
+    },
+    {
+      "epoch": 0.00499,
+      "grad_norm": 0.897365391254425,
+      "learning_rate": 0.001497,
+      "loss": 5.2943,
+      "step": 499
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.8972038626670837,
+      "learning_rate": 0.0015,
+      "loss": 5.2998,
+      "step": 500
+    },
+    {
+      "epoch": 0.00501,
+      "grad_norm": 1.123322606086731,
+      "learning_rate": 0.001503,
+      "loss": 5.3107,
+      "step": 501
+    },
+    {
+      "epoch": 0.00502,
+      "grad_norm": 1.085119366645813,
+      "learning_rate": 0.001506,
+      "loss": 5.3147,
+      "step": 502
+    },
+    {
+      "epoch": 0.00503,
+      "grad_norm": 0.9612423181533813,
+      "learning_rate": 0.0015090000000000001,
+      "loss": 5.3162,
+      "step": 503
+    },
+    {
+      "epoch": 0.00504,
+      "grad_norm": 1.0992624759674072,
+      "learning_rate": 0.001512,
+      "loss": 5.3083,
+      "step": 504
+    },
+    {
+      "epoch": 0.00505,
+      "grad_norm": 0.9857581257820129,
+      "learning_rate": 0.001515,
+      "loss": 5.2946,
+      "step": 505
+    },
+    {
+      "epoch": 0.00506,
+      "grad_norm": 1.1049542427062988,
+      "learning_rate": 0.001518,
+      "loss": 5.3003,
+      "step": 506
+    },
+    {
+      "epoch": 0.00507,
+      "grad_norm": 0.8998914957046509,
+      "learning_rate": 0.001521,
+      "loss": 5.298,
+      "step": 507
+    },
+    {
+      "epoch": 0.00508,
+      "grad_norm": 0.6991334557533264,
+      "learning_rate": 0.001524,
+      "loss": 5.2862,
+      "step": 508
+    },
+    {
+      "epoch": 0.00509,
+      "grad_norm": 0.7653549313545227,
+      "learning_rate": 0.0015270000000000001,
+      "loss": 5.2723,
+      "step": 509
+    },
+    {
+      "epoch": 0.0051,
+      "grad_norm": 0.7315691113471985,
+      "learning_rate": 0.0015300000000000001,
+      "loss": 5.288,
+      "step": 510
+    },
+    {
+      "epoch": 0.00511,
+      "grad_norm": 0.7975103855133057,
+      "learning_rate": 0.001533,
+      "loss": 5.2648,
+      "step": 511
+    },
+    {
+      "epoch": 0.00512,
+      "grad_norm": 0.9781049489974976,
+      "learning_rate": 0.001536,
+      "loss": 5.2672,
+      "step": 512
+    },
+    {
+      "epoch": 0.00513,
+      "grad_norm": 1.084666132926941,
+      "learning_rate": 0.001539,
+      "loss": 5.2896,
+      "step": 513
+    },
+    {
+      "epoch": 0.00514,
+      "grad_norm": 0.9010921120643616,
+      "learning_rate": 0.001542,
+      "loss": 5.2706,
+      "step": 514
+    },
+    {
+      "epoch": 0.00515,
+      "grad_norm": 0.96586012840271,
+      "learning_rate": 0.0015450000000000001,
+      "loss": 5.2764,
+      "step": 515
+    },
+    {
+      "epoch": 0.00516,
+      "grad_norm": 0.9655681848526001,
+      "learning_rate": 0.0015480000000000001,
+      "loss": 5.2769,
+      "step": 516
+    },
+    {
+      "epoch": 0.00517,
+      "grad_norm": 0.8448560833930969,
+      "learning_rate": 0.001551,
+      "loss": 5.2634,
+      "step": 517
+    },
+    {
+      "epoch": 0.00518,
+      "grad_norm": 0.7809770703315735,
+      "learning_rate": 0.001554,
+      "loss": 5.2515,
+      "step": 518
+    },
+    {
+      "epoch": 0.00519,
+      "grad_norm": 0.913107693195343,
+      "learning_rate": 0.001557,
+      "loss": 5.2572,
+      "step": 519
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.9221185445785522,
+      "learning_rate": 0.0015600000000000002,
+      "loss": 5.2658,
+      "step": 520
+    },
+    {
+      "epoch": 0.00521,
+      "grad_norm": 0.8511962294578552,
+      "learning_rate": 0.0015630000000000002,
+      "loss": 5.2423,
+      "step": 521
+    },
+    {
+      "epoch": 0.00522,
+      "grad_norm": 0.7266805171966553,
+      "learning_rate": 0.0015660000000000001,
+      "loss": 5.2383,
+      "step": 522
+    },
+    {
+      "epoch": 0.00523,
+      "grad_norm": 0.8215247392654419,
+      "learning_rate": 0.001569,
+      "loss": 5.2547,
+      "step": 523
+    },
+    {
+      "epoch": 0.00524,
+      "grad_norm": 0.8742693066596985,
+      "learning_rate": 0.001572,
+      "loss": 5.2653,
+      "step": 524
+    },
+    {
+      "epoch": 0.00525,
+      "grad_norm": 0.6882407069206238,
+      "learning_rate": 0.001575,
+      "loss": 5.2501,
+      "step": 525
+    },
+    {
+      "epoch": 0.00526,
+      "grad_norm": 0.7224147319793701,
+      "learning_rate": 0.0015780000000000002,
+      "loss": 5.2254,
+      "step": 526
+    },
+    {
+      "epoch": 0.00527,
+      "grad_norm": 0.6562958359718323,
+      "learning_rate": 0.0015810000000000002,
+      "loss": 5.2385,
+      "step": 527
+    },
+    {
+      "epoch": 0.00528,
+      "grad_norm": 0.6051112413406372,
+      "learning_rate": 0.0015840000000000001,
+      "loss": 5.2165,
+      "step": 528
+    },
+    {
+      "epoch": 0.00529,
+      "grad_norm": 0.647803008556366,
+      "learning_rate": 0.001587,
+      "loss": 5.2244,
+      "step": 529
+    },
+    {
+      "epoch": 0.0053,
+      "grad_norm": 0.6608071327209473,
+      "learning_rate": 0.00159,
+      "loss": 5.2339,
+      "step": 530
+    },
+    {
+      "epoch": 0.00531,
+      "grad_norm": 0.6765715479850769,
+      "learning_rate": 0.001593,
+      "loss": 5.2181,
+      "step": 531
+    },
+    {
+      "epoch": 0.00532,
+      "grad_norm": 0.7380223870277405,
+      "learning_rate": 0.0015960000000000002,
+      "loss": 5.2256,
+      "step": 532
+    },
+    {
+      "epoch": 0.00533,
+      "grad_norm": 0.7506837248802185,
+      "learning_rate": 0.0015990000000000002,
+      "loss": 5.2236,
+      "step": 533
+    },
+    {
+      "epoch": 0.00534,
+      "grad_norm": 0.577497661113739,
+      "learning_rate": 0.0016020000000000001,
+      "loss": 5.2066,
+      "step": 534
+    },
+    {
+      "epoch": 0.00535,
+      "grad_norm": 0.4974853992462158,
+      "learning_rate": 0.001605,
+      "loss": 5.2096,
+      "step": 535
+    },
+    {
+      "epoch": 0.00536,
+      "grad_norm": 0.5754765272140503,
+      "learning_rate": 0.001608,
+      "loss": 5.2121,
+      "step": 536
+    },
+    {
+      "epoch": 0.00537,
+      "grad_norm": 0.6681102514266968,
+      "learning_rate": 0.0016110000000000002,
+      "loss": 5.2067,
+      "step": 537
+    },
+    {
+      "epoch": 0.00538,
+      "grad_norm": 0.8286970257759094,
+      "learning_rate": 0.0016140000000000002,
+      "loss": 5.1882,
+      "step": 538
+    },
+    {
+      "epoch": 0.00539,
+      "grad_norm": 1.0212045907974243,
+      "learning_rate": 0.0016170000000000002,
+      "loss": 5.193,
+      "step": 539
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 1.0495171546936035,
+      "learning_rate": 0.0016200000000000001,
+      "loss": 5.2061,
+      "step": 540
+    },
+    {
+      "epoch": 0.00541,
+      "grad_norm": 0.9756328463554382,
+      "learning_rate": 0.001623,
+      "loss": 5.1953,
+      "step": 541
+    },
+    {
+      "epoch": 0.00542,
+      "grad_norm": 1.024538278579712,
+      "learning_rate": 0.001626,
+      "loss": 5.2247,
+      "step": 542
+    },
+    {
+      "epoch": 0.00543,
+      "grad_norm": 1.005081057548523,
+      "learning_rate": 0.0016290000000000002,
+      "loss": 5.1883,
+      "step": 543
+    },
+    {
+      "epoch": 0.00544,
+      "grad_norm": 1.1540062427520752,
+      "learning_rate": 0.0016320000000000002,
+      "loss": 5.2159,
+      "step": 544
+    },
+    {
+      "epoch": 0.00545,
+      "grad_norm": 1.0779460668563843,
+      "learning_rate": 0.0016350000000000002,
+      "loss": 5.2271,
+      "step": 545
+    },
+    {
+      "epoch": 0.00546,
+      "grad_norm": 0.9231882691383362,
+      "learning_rate": 0.0016380000000000001,
+      "loss": 5.1917,
+      "step": 546
+    },
+    {
+      "epoch": 0.00547,
+      "grad_norm": 0.9463688731193542,
+      "learning_rate": 0.001641,
+      "loss": 5.1943,
+      "step": 547
+    },
+    {
+      "epoch": 0.00548,
+      "grad_norm": 0.8151195645332336,
+      "learning_rate": 0.001644,
+      "loss": 5.1898,
+      "step": 548
+    },
+    {
+      "epoch": 0.00549,
+      "grad_norm": 0.6324855089187622,
+      "learning_rate": 0.0016470000000000002,
+      "loss": 5.1802,
+      "step": 549
+    },
+    {
+      "epoch": 0.0055,
+      "grad_norm": 0.7012510299682617,
+      "learning_rate": 0.0016500000000000002,
+      "loss": 5.1963,
+      "step": 550
+    },
+    {
+      "epoch": 0.00551,
+      "grad_norm": 0.6119561195373535,
+      "learning_rate": 0.0016530000000000002,
+      "loss": 5.1867,
+      "step": 551
+    },
+    {
+      "epoch": 0.00552,
+      "grad_norm": 0.6565516591072083,
+      "learning_rate": 0.0016560000000000001,
+      "loss": 5.1814,
+      "step": 552
+    },
+    {
+      "epoch": 0.00553,
+      "grad_norm": 0.60141921043396,
+      "learning_rate": 0.001659,
+      "loss": 5.1716,
+      "step": 553
+    },
+    {
+      "epoch": 0.00554,
+      "grad_norm": 0.5632038116455078,
+      "learning_rate": 0.0016620000000000003,
+      "loss": 5.1664,
+      "step": 554
+    },
+    {
+      "epoch": 0.00555,
+      "grad_norm": 0.7644810080528259,
+      "learning_rate": 0.0016650000000000002,
+      "loss": 5.1782,
+      "step": 555
+    },
+    {
+      "epoch": 0.00556,
+      "grad_norm": 0.8742493987083435,
+      "learning_rate": 0.0016680000000000002,
+      "loss": 5.1659,
+      "step": 556
+    },
+    {
+      "epoch": 0.00557,
+      "grad_norm": 0.8544741272926331,
+      "learning_rate": 0.0016710000000000002,
+      "loss": 5.1502,
+      "step": 557
+    },
+    {
+      "epoch": 0.00558,
+      "grad_norm": 0.9127110838890076,
+      "learning_rate": 0.0016740000000000001,
+      "loss": 5.182,
+      "step": 558
+    },
+    {
+      "epoch": 0.00559,
+      "grad_norm": 0.9648676514625549,
+      "learning_rate": 0.001677,
+      "loss": 5.1678,
+      "step": 559
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.8821786642074585,
+      "learning_rate": 0.0016800000000000003,
+      "loss": 5.1427,
+      "step": 560
+    },
+    {
+      "epoch": 0.00561,
+      "grad_norm": 0.891691267490387,
+      "learning_rate": 0.0016830000000000003,
+      "loss": 5.1733,
+      "step": 561
+    },
+    {
+      "epoch": 0.00562,
+      "grad_norm": 0.9146907925605774,
+      "learning_rate": 0.0016860000000000002,
+      "loss": 5.1636,
+      "step": 562
+    },
+    {
+      "epoch": 0.00563,
+      "grad_norm": 0.9244825839996338,
+      "learning_rate": 0.001689,
+      "loss": 5.1722,
+      "step": 563
+    },
+    {
+      "epoch": 0.00564,
+      "grad_norm": 0.7839826345443726,
+      "learning_rate": 0.001692,
+      "loss": 5.1431,
+      "step": 564
+    },
+    {
+      "epoch": 0.00565,
+      "grad_norm": 1.062433123588562,
+      "learning_rate": 0.001695,
+      "loss": 5.1564,
+      "step": 565
+    },
+    {
+      "epoch": 0.00566,
+      "grad_norm": 0.8471325635910034,
+      "learning_rate": 0.0016979999999999999,
+      "loss": 5.1512,
+      "step": 566
+    },
+    {
+      "epoch": 0.00567,
+      "grad_norm": 0.8534058928489685,
+      "learning_rate": 0.0017009999999999998,
+      "loss": 5.1473,
+      "step": 567
+    },
+    {
+      "epoch": 0.00568,
+      "grad_norm": 0.8670461177825928,
+      "learning_rate": 0.0017039999999999998,
+      "loss": 5.1567,
+      "step": 568
+    },
+    {
+      "epoch": 0.00569,
+      "grad_norm": 0.9165869951248169,
+      "learning_rate": 0.001707,
+      "loss": 5.1375,
+      "step": 569
+    },
+    {
+      "epoch": 0.0057,
+      "grad_norm": 1.1377770900726318,
+      "learning_rate": 0.00171,
+      "loss": 5.1664,
+      "step": 570
+    },
+    {
+      "epoch": 0.00571,
+      "grad_norm": 0.8107508420944214,
+      "learning_rate": 0.001713,
+      "loss": 5.1589,
+      "step": 571
+    },
+    {
+      "epoch": 0.00572,
+      "grad_norm": 0.7913006544113159,
+      "learning_rate": 0.0017159999999999999,
+      "loss": 5.1345,
+      "step": 572
+    },
+    {
+      "epoch": 0.00573,
+      "grad_norm": 0.7625595927238464,
+      "learning_rate": 0.0017189999999999998,
+      "loss": 5.1388,
+      "step": 573
+    },
+    {
+      "epoch": 0.00574,
+      "grad_norm": 0.8708691596984863,
+      "learning_rate": 0.001722,
+      "loss": 5.1477,
+      "step": 574
+    },
+    {
+      "epoch": 0.00575,
+      "grad_norm": 0.8358116149902344,
+      "learning_rate": 0.001725,
+      "loss": 5.126,
+      "step": 575
+    },
+    {
+      "epoch": 0.00576,
+      "grad_norm": 0.7503964900970459,
+      "learning_rate": 0.001728,
+      "loss": 5.1165,
+      "step": 576
+    },
+    {
+      "epoch": 0.00577,
+      "grad_norm": 0.743698000907898,
+      "learning_rate": 0.001731,
+      "loss": 5.1434,
+      "step": 577
+    },
+    {
+      "epoch": 0.00578,
+      "grad_norm": 0.7937496900558472,
+      "learning_rate": 0.0017339999999999999,
+      "loss": 5.1222,
+      "step": 578
+    },
+    {
+      "epoch": 0.00579,
+      "grad_norm": 0.7887423038482666,
+      "learning_rate": 0.0017369999999999998,
+      "loss": 5.1084,
+      "step": 579
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.7772188186645508,
+      "learning_rate": 0.00174,
+      "loss": 5.1272,
+      "step": 580
+    },
+    {
+      "epoch": 0.00581,
+      "grad_norm": 0.8483501076698303,
+      "learning_rate": 0.001743,
+      "loss": 5.1251,
+      "step": 581
+    },
+    {
+      "epoch": 0.00582,
+      "grad_norm": 1.0748672485351562,
+      "learning_rate": 0.001746,
+      "loss": 5.1133,
+      "step": 582
+    },
+    {
+      "epoch": 0.00583,
+      "grad_norm": 0.9845912456512451,
+      "learning_rate": 0.001749,
+      "loss": 5.1338,
+      "step": 583
+    },
+    {
+      "epoch": 0.00584,
+      "grad_norm": 1.0171496868133545,
+      "learning_rate": 0.0017519999999999999,
+      "loss": 5.1328,
+      "step": 584
+    },
+    {
+      "epoch": 0.00585,
+      "grad_norm": 0.932063102722168,
+      "learning_rate": 0.0017549999999999998,
+      "loss": 5.1125,
+      "step": 585
+    },
+    {
+      "epoch": 0.00586,
+      "grad_norm": 1.0053131580352783,
+      "learning_rate": 0.001758,
+      "loss": 5.1277,
+      "step": 586
+    },
+    {
+      "epoch": 0.00587,
+      "grad_norm": 1.0553542375564575,
+      "learning_rate": 0.001761,
+      "loss": 5.1113,
+      "step": 587
+    },
+    {
+      "epoch": 0.00588,
+      "grad_norm": 0.9641870260238647,
+      "learning_rate": 0.001764,
+      "loss": 5.1209,
+      "step": 588
+    },
+    {
+      "epoch": 0.00589,
+      "grad_norm": 0.7223602533340454,
+      "learning_rate": 0.001767,
+      "loss": 5.1139,
+      "step": 589
+    },
+    {
+      "epoch": 0.0059,
+      "grad_norm": 0.7580032348632812,
+      "learning_rate": 0.0017699999999999999,
+      "loss": 5.1016,
+      "step": 590
+    },
+    {
+      "epoch": 0.00591,
+      "grad_norm": 0.7553709149360657,
+      "learning_rate": 0.001773,
+      "loss": 5.097,
+      "step": 591
+    },
+    {
+      "epoch": 0.00592,
+      "grad_norm": 0.7395292520523071,
+      "learning_rate": 0.001776,
+      "loss": 5.122,
+      "step": 592
+    },
+    {
+      "epoch": 0.00593,
+      "grad_norm": 0.6007040739059448,
+      "learning_rate": 0.001779,
+      "loss": 5.1118,
+      "step": 593
+    },
+    {
+      "epoch": 0.00594,
+      "grad_norm": 0.6126047372817993,
+      "learning_rate": 0.001782,
+      "loss": 5.1081,
+      "step": 594
+    },
+    {
+      "epoch": 0.00595,
+      "grad_norm": 0.607721745967865,
+      "learning_rate": 0.001785,
+      "loss": 5.0957,
+      "step": 595
+    },
+    {
+      "epoch": 0.00596,
+      "grad_norm": 0.6556451916694641,
+      "learning_rate": 0.0017879999999999999,
+      "loss": 5.0948,
+      "step": 596
+    },
+    {
+      "epoch": 0.00597,
+      "grad_norm": 0.7172878384590149,
+      "learning_rate": 0.001791,
+      "loss": 5.0729,
+      "step": 597
+    },
+    {
+      "epoch": 0.00598,
+      "grad_norm": 0.6043835878372192,
+      "learning_rate": 0.001794,
+      "loss": 5.0805,
+      "step": 598
+    },
+    {
+      "epoch": 0.00599,
+      "grad_norm": 0.5666232109069824,
+      "learning_rate": 0.001797,
+      "loss": 5.0796,
+      "step": 599
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.5673431754112244,
+      "learning_rate": 0.0018,
+      "loss": 5.0764,
+      "step": 600
+    },
+    {
+      "epoch": 0.00601,
+      "grad_norm": 0.5798671841621399,
+      "learning_rate": 0.001803,
+      "loss": 5.0708,
+      "step": 601
+    },
+    {
+      "epoch": 0.00602,
+      "grad_norm": 0.520574152469635,
+      "learning_rate": 0.0018059999999999999,
+      "loss": 5.0544,
+      "step": 602
+    },
+    {
+      "epoch": 0.00603,
+      "grad_norm": 0.5210540294647217,
+      "learning_rate": 0.001809,
+      "loss": 5.0732,
+      "step": 603
+    },
+    {
+      "epoch": 0.00604,
+      "grad_norm": 0.6759857535362244,
+      "learning_rate": 0.001812,
+      "loss": 5.0576,
+      "step": 604
+    },
+    {
+      "epoch": 0.00605,
+      "grad_norm": 0.7568657994270325,
+      "learning_rate": 0.001815,
+      "loss": 5.0701,
+      "step": 605
+    },
+    {
+      "epoch": 0.00606,
+      "grad_norm": 0.7632762789726257,
+      "learning_rate": 0.001818,
+      "loss": 5.0711,
+      "step": 606
+    },
+    {
+      "epoch": 0.00607,
+      "grad_norm": 0.788451611995697,
+      "learning_rate": 0.001821,
+      "loss": 5.073,
+      "step": 607
+    },
+    {
+      "epoch": 0.00608,
+      "grad_norm": 0.763842761516571,
+      "learning_rate": 0.001824,
+      "loss": 5.0575,
+      "step": 608
+    },
+    {
+      "epoch": 0.00609,
+      "grad_norm": 0.825861930847168,
+      "learning_rate": 0.001827,
+      "loss": 5.0665,
+      "step": 609
+    },
+    {
+      "epoch": 0.0061,
+      "grad_norm": 1.0143935680389404,
+      "learning_rate": 0.00183,
+      "loss": 5.0488,
+      "step": 610
+    },
+    {
+      "epoch": 0.00611,
+      "grad_norm": 1.1116124391555786,
+      "learning_rate": 0.001833,
+      "loss": 5.0562,
+      "step": 611
+    },
+    {
+      "epoch": 0.00612,
+      "grad_norm": 1.0579830408096313,
+      "learning_rate": 0.001836,
+      "loss": 5.0621,
+      "step": 612
+    },
+    {
+      "epoch": 0.00613,
+      "grad_norm": 1.2180272340774536,
+      "learning_rate": 0.001839,
+      "loss": 5.069,
+      "step": 613
+    },
+    {
+      "epoch": 0.00614,
+      "grad_norm": 0.7525346875190735,
+      "learning_rate": 0.001842,
+      "loss": 5.0757,
+      "step": 614
+    },
+    {
+      "epoch": 0.00615,
+      "grad_norm": 0.833108127117157,
+      "learning_rate": 0.001845,
+      "loss": 5.0659,
+      "step": 615
+    },
+    {
+      "epoch": 0.00616,
+      "grad_norm": 0.7170072793960571,
+      "learning_rate": 0.001848,
+      "loss": 5.0473,
+      "step": 616
+    },
+    {
+      "epoch": 0.00617,
+      "grad_norm": 0.718910276889801,
+      "learning_rate": 0.001851,
+      "loss": 5.0608,
+      "step": 617
+    },
+    {
+      "epoch": 0.00618,
+      "grad_norm": 0.6572015881538391,
+      "learning_rate": 0.001854,
+      "loss": 5.0425,
+      "step": 618
+    },
+    {
+      "epoch": 0.00619,
+      "grad_norm": 0.7236103415489197,
+      "learning_rate": 0.001857,
+      "loss": 5.0504,
+      "step": 619
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.8058017492294312,
+      "learning_rate": 0.00186,
+      "loss": 5.0386,
+      "step": 620
+    },
+    {
+      "epoch": 0.00621,
+      "grad_norm": 0.861880898475647,
+      "learning_rate": 0.001863,
+      "loss": 5.0409,
+      "step": 621
+    },
+    {
+      "epoch": 0.00622,
+      "grad_norm": 1.0328248739242554,
+      "learning_rate": 0.001866,
+      "loss": 5.0489,
+      "step": 622
+    },
+    {
+      "epoch": 0.00623,
+      "grad_norm": 0.8494102358818054,
+      "learning_rate": 0.001869,
+      "loss": 5.0318,
+      "step": 623
+    },
+    {
+      "epoch": 0.00624,
+      "grad_norm": 0.9827755093574524,
+      "learning_rate": 0.001872,
+      "loss": 5.0476,
+      "step": 624
+    },
+    {
+      "epoch": 0.00625,
+      "grad_norm": 1.0741342306137085,
+      "learning_rate": 0.001875,
+      "loss": 5.0686,
+      "step": 625
+    },
+    {
+      "epoch": 0.00626,
+      "grad_norm": 0.7305473685264587,
+      "learning_rate": 0.0018780000000000001,
+      "loss": 5.0304,
+      "step": 626
+    },
+    {
+      "epoch": 0.00627,
+      "grad_norm": 0.8084409832954407,
+      "learning_rate": 0.001881,
+      "loss": 5.0255,
+      "step": 627
+    },
+    {
+      "epoch": 0.00628,
+      "grad_norm": 1.0902513265609741,
+      "learning_rate": 0.001884,
+      "loss": 5.0666,
+      "step": 628
+    },
+    {
+      "epoch": 0.00629,
+      "grad_norm": 1.036152958869934,
+      "learning_rate": 0.001887,
+      "loss": 5.0439,
+      "step": 629
+    },
+    {
+      "epoch": 0.0063,
+      "grad_norm": 1.0308260917663574,
+      "learning_rate": 0.00189,
+      "loss": 5.0356,
+      "step": 630
+    },
+    {
+      "epoch": 0.00631,
+      "grad_norm": 0.984308123588562,
+      "learning_rate": 0.0018930000000000002,
+      "loss": 5.0356,
+      "step": 631
+    },
+    {
+      "epoch": 0.00632,
+      "grad_norm": 0.8649469017982483,
+      "learning_rate": 0.0018960000000000001,
+      "loss": 5.0479,
+      "step": 632
+    },
+    {
+      "epoch": 0.00633,
+      "grad_norm": 0.8776420950889587,
+      "learning_rate": 0.001899,
+      "loss": 5.0459,
+      "step": 633
+    },
+    {
+      "epoch": 0.00634,
+      "grad_norm": 0.8304409980773926,
+      "learning_rate": 0.001902,
+      "loss": 5.037,
+      "step": 634
+    },
+    {
+      "epoch": 0.00635,
+      "grad_norm": 0.7966147065162659,
+      "learning_rate": 0.001905,
+      "loss": 5.0227,
+      "step": 635
+    },
+    {
+      "epoch": 0.00636,
+      "grad_norm": 0.9172542095184326,
+      "learning_rate": 0.001908,
+      "loss": 5.0337,
+      "step": 636
+    },
+    {
+      "epoch": 0.00637,
+      "grad_norm": 1.3219475746154785,
+      "learning_rate": 0.0019110000000000002,
+      "loss": 5.0715,
+      "step": 637
+    },
+    {
+      "epoch": 0.00638,
+      "grad_norm": 0.7924789190292358,
+      "learning_rate": 0.0019140000000000001,
+      "loss": 5.0347,
+      "step": 638
+    },
+    {
+      "epoch": 0.00639,
+      "grad_norm": 0.8469759225845337,
+      "learning_rate": 0.001917,
+      "loss": 5.0139,
+      "step": 639
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.9544380307197571,
+      "learning_rate": 0.00192,
+      "loss": 5.0191,
+      "step": 640
+    },
+    {
+      "epoch": 0.00641,
+      "grad_norm": 1.0876184701919556,
+      "learning_rate": 0.001923,
+      "loss": 5.0379,
+      "step": 641
+    },
+    {
+      "epoch": 0.00642,
+      "grad_norm": 0.8299298286437988,
+      "learning_rate": 0.001926,
+      "loss": 5.0146,
+      "step": 642
+    },
+    {
+      "epoch": 0.00643,
+      "grad_norm": 0.9603999257087708,
+      "learning_rate": 0.0019290000000000002,
+      "loss": 5.0399,
+      "step": 643
+    },
+    {
+      "epoch": 0.00644,
+      "grad_norm": 0.7977001070976257,
+      "learning_rate": 0.0019320000000000001,
+      "loss": 5.0094,
+      "step": 644
+    },
+    {
+      "epoch": 0.00645,
+      "grad_norm": 0.7244200706481934,
+      "learning_rate": 0.001935,
+      "loss": 5.0161,
+      "step": 645
+    },
+    {
+      "epoch": 0.00646,
+      "grad_norm": 0.7832402586936951,
+      "learning_rate": 0.001938,
+      "loss": 5.0193,
+      "step": 646
+    },
+    {
+      "epoch": 0.00647,
+      "grad_norm": 0.8586620092391968,
+      "learning_rate": 0.001941,
+      "loss": 5.0299,
+      "step": 647
+    },
+    {
+      "epoch": 0.00648,
+      "grad_norm": 0.8153418302536011,
+      "learning_rate": 0.0019440000000000002,
+      "loss": 5.0163,
+      "step": 648
+    },
+    {
+      "epoch": 0.00649,
+      "grad_norm": 0.766000509262085,
+      "learning_rate": 0.0019470000000000002,
+      "loss": 4.9982,
+      "step": 649
+    },
+    {
+      "epoch": 0.0065,
+      "grad_norm": 0.7875446677207947,
+      "learning_rate": 0.0019500000000000001,
+      "loss": 5.0238,
+      "step": 650
+    },
+    {
+      "epoch": 0.00651,
+      "grad_norm": 0.7245673537254333,
+      "learning_rate": 0.001953,
+      "loss": 5.0037,
+      "step": 651
+    },
+    {
+      "epoch": 0.00652,
+      "grad_norm": 0.7840576767921448,
+      "learning_rate": 0.0019560000000000003,
+      "loss": 4.9987,
+      "step": 652
+    },
+    {
+      "epoch": 0.00653,
+      "grad_norm": 0.8112754225730896,
+      "learning_rate": 0.0019590000000000002,
+      "loss": 4.9969,
+      "step": 653
+    },
+    {
+      "epoch": 0.00654,
+      "grad_norm": 0.6785946488380432,
+      "learning_rate": 0.001962,
+      "loss": 4.9786,
+      "step": 654
+    },
+    {
+      "epoch": 0.00655,
+      "grad_norm": 0.6141355633735657,
+      "learning_rate": 0.001965,
+      "loss": 4.9897,
+      "step": 655
+    },
+    {
+      "epoch": 0.00656,
+      "grad_norm": 0.647098958492279,
+      "learning_rate": 0.001968,
+      "loss": 4.982,
+      "step": 656
+    },
+    {
+      "epoch": 0.00657,
+      "grad_norm": 0.6601396203041077,
+      "learning_rate": 0.001971,
+      "loss": 4.9729,
+      "step": 657
+    },
+    {
+      "epoch": 0.00658,
+      "grad_norm": 0.5589995384216309,
+      "learning_rate": 0.001974,
+      "loss": 4.9927,
+      "step": 658
+    },
+    {
+      "epoch": 0.00659,
+      "grad_norm": 0.551213264465332,
+      "learning_rate": 0.001977,
+      "loss": 4.972,
+      "step": 659
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.5106261968612671,
+      "learning_rate": 0.00198,
+      "loss": 4.963,
+      "step": 660
+    },
+    {
+      "epoch": 0.00661,
+      "grad_norm": 0.49552980065345764,
+      "learning_rate": 0.001983,
+      "loss": 4.9449,
+      "step": 661
+    },
+    {
+      "epoch": 0.00662,
+      "grad_norm": 0.4502437114715576,
+      "learning_rate": 0.0019860000000000004,
+      "loss": 4.9438,
+      "step": 662
+    },
+    {
+      "epoch": 0.00663,
+      "grad_norm": 0.4717095196247101,
+      "learning_rate": 0.0019890000000000003,
+      "loss": 4.9599,
+      "step": 663
+    },
+    {
+      "epoch": 0.00664,
+      "grad_norm": 0.5179165005683899,
+      "learning_rate": 0.0019920000000000003,
+      "loss": 4.9431,
+      "step": 664
+    },
+    {
+      "epoch": 0.00665,
+      "grad_norm": 0.6156288385391235,
+      "learning_rate": 0.0019950000000000002,
+      "loss": 4.9222,
+      "step": 665
+    },
+    {
+      "epoch": 0.00666,
+      "grad_norm": 0.6403276920318604,
+      "learning_rate": 0.001998,
+      "loss": 4.9456,
+      "step": 666
+    },
+    {
+      "epoch": 0.00667,
+      "grad_norm": 0.6612551808357239,
+      "learning_rate": 0.002001,
+      "loss": 4.9436,
+      "step": 667
+    },
+    {
+      "epoch": 0.00668,
+      "grad_norm": 0.6370317339897156,
+      "learning_rate": 0.002004,
+      "loss": 4.9618,
+      "step": 668
+    },
+    {
+      "epoch": 0.00669,
+      "grad_norm": 0.6632084250450134,
+      "learning_rate": 0.002007,
+      "loss": 4.9618,
+      "step": 669
+    },
+    {
+      "epoch": 0.0067,
+      "grad_norm": 0.6647160649299622,
+      "learning_rate": 0.00201,
+      "loss": 4.9534,
+      "step": 670
+    },
+    {
+      "epoch": 0.00671,
+      "grad_norm": 0.6171524524688721,
+      "learning_rate": 0.002013,
+      "loss": 4.9269,
+      "step": 671
+    },
+    {
+      "epoch": 0.00672,
+      "grad_norm": 0.6804357171058655,
+      "learning_rate": 0.002016,
+      "loss": 4.9362,
+      "step": 672
+    },
+    {
+      "epoch": 0.00673,
+      "grad_norm": 0.7436751127243042,
+      "learning_rate": 0.002019,
+      "loss": 4.9351,
+      "step": 673
+    },
+    {
+      "epoch": 0.00674,
+      "grad_norm": 0.8610292077064514,
+      "learning_rate": 0.0020220000000000004,
+      "loss": 4.9433,
+      "step": 674
+    },
+    {
+      "epoch": 0.00675,
+      "grad_norm": 1.046964168548584,
+      "learning_rate": 0.0020250000000000003,
+      "loss": 4.9648,
+      "step": 675
+    },
+    {
+      "epoch": 0.00676,
+      "grad_norm": 0.9578864574432373,
+      "learning_rate": 0.0020280000000000003,
+      "loss": 4.9492,
+      "step": 676
+    },
+    {
+      "epoch": 0.00677,
+      "grad_norm": 1.1337146759033203,
+      "learning_rate": 0.0020310000000000003,
+      "loss": 4.9695,
+      "step": 677
+    },
+    {
+      "epoch": 0.00678,
+      "grad_norm": 1.0358091592788696,
+      "learning_rate": 0.0020340000000000002,
+      "loss": 4.9763,
+      "step": 678
+    },
+    {
+      "epoch": 0.00679,
+      "grad_norm": 1.230987787246704,
+      "learning_rate": 0.002037,
+      "loss": 4.9735,
+      "step": 679
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.9104715585708618,
+      "learning_rate": 0.00204,
+      "loss": 4.9643,
+      "step": 680
+    },
+    {
+      "epoch": 0.00681,
+      "grad_norm": 1.1940183639526367,
+      "learning_rate": 0.002043,
+      "loss": 4.9659,
+      "step": 681
+    },
+    {
+      "epoch": 0.00682,
+      "grad_norm": 1.0067143440246582,
+      "learning_rate": 0.002046,
+      "loss": 4.9735,
+      "step": 682
+    },
+    {
+      "epoch": 0.00683,
+      "grad_norm": 1.224305510520935,
+      "learning_rate": 0.002049,
+      "loss": 4.9612,
+      "step": 683
+    },
+    {
+      "epoch": 0.00684,
+      "grad_norm": 0.9917755126953125,
+      "learning_rate": 0.002052,
+      "loss": 4.9694,
+      "step": 684
+    },
+    {
+      "epoch": 0.00685,
+      "grad_norm": 0.9187195897102356,
+      "learning_rate": 0.0020550000000000004,
+      "loss": 4.964,
+      "step": 685
+    },
+    {
+      "epoch": 0.00686,
+      "grad_norm": 1.035937786102295,
+      "learning_rate": 0.0020580000000000004,
+      "loss": 4.9577,
+      "step": 686
+    },
+    {
+      "epoch": 0.00687,
+      "grad_norm": 1.0923206806182861,
+      "learning_rate": 0.0020610000000000003,
+      "loss": 4.9579,
+      "step": 687
+    },
+    {
+      "epoch": 0.00688,
+      "grad_norm": 0.8355166912078857,
+      "learning_rate": 0.002064,
+      "loss": 4.9524,
+      "step": 688
+    },
+    {
+      "epoch": 0.00689,
+      "grad_norm": 0.7577447891235352,
+      "learning_rate": 0.002067,
+      "loss": 4.9417,
+      "step": 689
+    },
+    {
+      "epoch": 0.0069,
+      "grad_norm": 0.7621678709983826,
+      "learning_rate": 0.00207,
+      "loss": 4.9421,
+      "step": 690
+    },
+    {
+      "epoch": 0.00691,
+      "grad_norm": 0.6061983108520508,
+      "learning_rate": 0.0020729999999999998,
+      "loss": 4.9082,
+      "step": 691
+    },
+    {
+      "epoch": 0.00692,
+      "grad_norm": 0.591027021408081,
+      "learning_rate": 0.0020759999999999997,
+      "loss": 4.9292,
+      "step": 692
+    },
+    {
+      "epoch": 0.00693,
+      "grad_norm": 0.5834758281707764,
+      "learning_rate": 0.0020789999999999997,
+      "loss": 4.9222,
+      "step": 693
+    },
+    {
+      "epoch": 0.00694,
+      "grad_norm": 0.7208871841430664,
+      "learning_rate": 0.002082,
+      "loss": 4.9071,
+      "step": 694
+    },
+    {
+      "epoch": 0.00695,
+      "grad_norm": 0.8771729469299316,
+      "learning_rate": 0.002085,
+      "loss": 4.9171,
+      "step": 695
+    },
+    {
+      "epoch": 0.00696,
+      "grad_norm": 0.9051836133003235,
+      "learning_rate": 0.002088,
+      "loss": 4.9182,
+      "step": 696
+    },
+    {
+      "epoch": 0.00697,
+      "grad_norm": 1.1665294170379639,
+      "learning_rate": 0.002091,
+      "loss": 4.9339,
+      "step": 697
+    },
+    {
+      "epoch": 0.00698,
+      "grad_norm": 1.092050313949585,
+      "learning_rate": 0.002094,
+      "loss": 4.9261,
+      "step": 698
+    },
+    {
+      "epoch": 0.00699,
+      "grad_norm": 0.8412545323371887,
+      "learning_rate": 0.002097,
+      "loss": 4.9128,
+      "step": 699
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.6720849871635437,
+      "learning_rate": 0.0021,
+      "loss": 4.9176,
+      "step": 700
+    },
+    {
+      "epoch": 0.00701,
+      "grad_norm": 0.563408613204956,
+      "learning_rate": 0.002103,
+      "loss": 4.8944,
+      "step": 701
+    },
+    {
+      "epoch": 0.00702,
+      "grad_norm": 0.5357261300086975,
+      "learning_rate": 0.002106,
+      "loss": 4.893,
+      "step": 702
+    },
+    {
+      "epoch": 0.00703,
+      "grad_norm": 0.5667074918746948,
+      "learning_rate": 0.0021089999999999998,
+      "loss": 4.8766,
+      "step": 703
+    },
+    {
+      "epoch": 0.00704,
+      "grad_norm": 0.597253143787384,
+      "learning_rate": 0.0021119999999999997,
+      "loss": 4.8825,
+      "step": 704
+    },
+    {
+      "epoch": 0.00705,
+      "grad_norm": 0.6211616396903992,
+      "learning_rate": 0.002115,
+      "loss": 4.8877,
+      "step": 705
+    },
+    {
+      "epoch": 0.00706,
+      "grad_norm": 0.687544047832489,
+      "learning_rate": 0.002118,
+      "loss": 4.8929,
+      "step": 706
+    },
+    {
+      "epoch": 0.00707,
+      "grad_norm": 0.7621776461601257,
+      "learning_rate": 0.002121,
+      "loss": 4.8688,
+      "step": 707
+    },
+    {
+      "epoch": 0.00708,
+      "grad_norm": 0.9027195572853088,
+      "learning_rate": 0.002124,
+      "loss": 4.8567,
+      "step": 708
+    },
+    {
+      "epoch": 0.00709,
+      "grad_norm": 0.9446965456008911,
+      "learning_rate": 0.002127,
+      "loss": 4.8935,
+      "step": 709
+    },
+    {
+      "epoch": 0.0071,
+      "grad_norm": 0.8206554055213928,
+      "learning_rate": 0.00213,
+      "loss": 4.8706,
+      "step": 710
+    },
+    {
+      "epoch": 0.00711,
+      "grad_norm": 0.8629757165908813,
+      "learning_rate": 0.002133,
+      "loss": 4.8565,
+      "step": 711
+    },
+    {
+      "epoch": 0.00712,
+      "grad_norm": 0.7438434362411499,
+      "learning_rate": 0.002136,
+      "loss": 4.869,
+      "step": 712
+    },
+    {
+      "epoch": 0.00713,
+      "grad_norm": 0.7951372861862183,
+      "learning_rate": 0.002139,
+      "loss": 4.8846,
+      "step": 713
+    },
+    {
+      "epoch": 0.00714,
+      "grad_norm": 0.9020676016807556,
+      "learning_rate": 0.002142,
+      "loss": 4.8773,
+      "step": 714
+    },
+    {
+      "epoch": 0.00715,
+      "grad_norm": 1.064352035522461,
+      "learning_rate": 0.0021449999999999998,
+      "loss": 4.8624,
+      "step": 715
+    },
+    {
+      "epoch": 0.00716,
+      "grad_norm": 0.7318432927131653,
+      "learning_rate": 0.002148,
+      "loss": 4.8409,
+      "step": 716
+    },
+    {
+      "epoch": 0.00717,
+      "grad_norm": 0.886417031288147,
+      "learning_rate": 0.002151,
+      "loss": 4.8705,
+      "step": 717
+    },
+    {
+      "epoch": 0.00718,
+      "grad_norm": 0.8509985208511353,
+      "learning_rate": 0.002154,
+      "loss": 4.8473,
+      "step": 718
+    },
+    {
+      "epoch": 0.00719,
+      "grad_norm": 0.8979188203811646,
+      "learning_rate": 0.002157,
+      "loss": 4.8656,
+      "step": 719
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 1.0766488313674927,
+      "learning_rate": 0.00216,
+      "loss": 4.8622,
+      "step": 720
+    },
+    {
+      "epoch": 0.00721,
+      "grad_norm": 1.0330792665481567,
+      "learning_rate": 0.002163,
+      "loss": 4.8716,
+      "step": 721
+    },
+    {
+      "epoch": 0.00722,
+      "grad_norm": 1.097432255744934,
+      "learning_rate": 0.002166,
+      "loss": 4.8541,
+      "step": 722
+    },
+    {
+      "epoch": 0.00723,
+      "grad_norm": 1.0046334266662598,
+      "learning_rate": 0.002169,
+      "loss": 4.8596,
+      "step": 723
+    },
+    {
+      "epoch": 0.00724,
+      "grad_norm": 1.0316451787948608,
+      "learning_rate": 0.002172,
+      "loss": 4.8595,
+      "step": 724
+    },
+    {
+      "epoch": 0.00725,
+      "grad_norm": 0.7518482804298401,
+      "learning_rate": 0.002175,
+      "loss": 4.8464,
+      "step": 725
+    },
+    {
+      "epoch": 0.00726,
+      "grad_norm": 0.8229779005050659,
+      "learning_rate": 0.002178,
+      "loss": 4.8512,
+      "step": 726
+    },
+    {
+      "epoch": 0.00727,
+      "grad_norm": 0.9577723741531372,
+      "learning_rate": 0.0021809999999999998,
+      "loss": 4.841,
+      "step": 727
+    },
+    {
+      "epoch": 0.00728,
+      "grad_norm": 0.9698926210403442,
+      "learning_rate": 0.002184,
+      "loss": 4.8594,
+      "step": 728
+    },
+    {
+      "epoch": 0.00729,
+      "grad_norm": 1.1789532899856567,
+      "learning_rate": 0.002187,
+      "loss": 4.8599,
+      "step": 729
+    },
+    {
+      "epoch": 0.0073,
+      "grad_norm": 1.0923309326171875,
+      "learning_rate": 0.00219,
+      "loss": 4.8656,
+      "step": 730
+    },
+    {
+      "epoch": 0.00731,
+      "grad_norm": 1.0404242277145386,
+      "learning_rate": 0.002193,
+      "loss": 4.8544,
+      "step": 731
+    },
+    {
+      "epoch": 0.00732,
+      "grad_norm": 0.8924814462661743,
+      "learning_rate": 0.002196,
+      "loss": 4.8537,
+      "step": 732
+    },
+    {
+      "epoch": 0.00733,
+      "grad_norm": 0.7918877005577087,
+      "learning_rate": 0.002199,
+      "loss": 4.8326,
+      "step": 733
+    },
+    {
+      "epoch": 0.00734,
+      "grad_norm": 0.8231402635574341,
+      "learning_rate": 0.002202,
+      "loss": 4.8485,
+      "step": 734
+    },
+    {
+      "epoch": 0.00735,
+      "grad_norm": 1.1126084327697754,
+      "learning_rate": 0.002205,
+      "loss": 4.8661,
+      "step": 735
+    },
+    {
+      "epoch": 0.00736,
+      "grad_norm": 1.1928813457489014,
+      "learning_rate": 0.002208,
+      "loss": 4.857,
+      "step": 736
+    },
+    {
+      "epoch": 0.00737,
+      "grad_norm": 0.9305274486541748,
+      "learning_rate": 0.002211,
+      "loss": 4.8183,
+      "step": 737
+    },
+    {
+      "epoch": 0.00738,
+      "grad_norm": 1.2922184467315674,
+      "learning_rate": 0.002214,
+      "loss": 4.8313,
+      "step": 738
+    },
+    {
+      "epoch": 0.00739,
+      "grad_norm": 1.0224477052688599,
+      "learning_rate": 0.0022170000000000002,
+      "loss": 4.8302,
+      "step": 739
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.8254541158676147,
+      "learning_rate": 0.00222,
+      "loss": 4.8183,
+      "step": 740
+    },
+    {
+      "epoch": 0.00741,
+      "grad_norm": 0.8494399785995483,
+      "learning_rate": 0.002223,
+      "loss": 4.7936,
+      "step": 741
+    },
+    {
+      "epoch": 0.00742,
+      "grad_norm": 0.8097528219223022,
+      "learning_rate": 0.002226,
+      "loss": 4.8203,
+      "step": 742
+    },
+    {
+      "epoch": 0.00743,
+      "grad_norm": 0.7318201065063477,
+      "learning_rate": 0.002229,
+      "loss": 4.782,
+      "step": 743
+    },
+    {
+      "epoch": 0.00744,
+      "grad_norm": 0.770041286945343,
+      "learning_rate": 0.002232,
+      "loss": 4.7897,
+      "step": 744
+    },
+    {
+      "epoch": 0.00745,
+      "grad_norm": 0.8400176167488098,
+      "learning_rate": 0.002235,
+      "loss": 4.793,
+      "step": 745
+    },
+    {
+      "epoch": 0.00746,
+      "grad_norm": 0.8187500834465027,
+      "learning_rate": 0.002238,
+      "loss": 4.7927,
+      "step": 746
+    },
+    {
+      "epoch": 0.00747,
+      "grad_norm": 0.8396742343902588,
+      "learning_rate": 0.002241,
+      "loss": 4.7802,
+      "step": 747
+    },
+    {
+      "epoch": 0.00748,
+      "grad_norm": 0.8425725698471069,
+      "learning_rate": 0.002244,
+      "loss": 4.7863,
+      "step": 748
+    },
+    {
+      "epoch": 0.00749,
+      "grad_norm": 0.9793819189071655,
+      "learning_rate": 0.002247,
+      "loss": 4.771,
+      "step": 749
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.9097406268119812,
+      "learning_rate": 0.0022500000000000003,
+      "loss": 4.8036,
+      "step": 750
+    },
+    {
+      "epoch": 0.00751,
+      "grad_norm": 1.1564972400665283,
+      "learning_rate": 0.0022530000000000002,
+      "loss": 4.8008,
+      "step": 751
+    },
+    {
+      "epoch": 0.00752,
+      "grad_norm": 1.1269639730453491,
+      "learning_rate": 0.002256,
+      "loss": 4.7666,
+      "step": 752
+    },
+    {
+      "epoch": 0.00753,
+      "grad_norm": 0.7554891705513,
+      "learning_rate": 0.002259,
+      "loss": 4.7895,
+      "step": 753
+    },
+    {
+      "epoch": 0.00754,
+      "grad_norm": 0.6434245705604553,
+      "learning_rate": 0.002262,
+      "loss": 4.7936,
+      "step": 754
+    },
+    {
+      "epoch": 0.00755,
+      "grad_norm": 0.6667845249176025,
+      "learning_rate": 0.002265,
+      "loss": 4.7629,
+      "step": 755
+    },
+    {
+      "epoch": 0.00756,
+      "grad_norm": 0.6454316973686218,
+      "learning_rate": 0.002268,
+      "loss": 4.7431,
+      "step": 756
+    },
+    {
+      "epoch": 0.00757,
+      "grad_norm": 0.5992982387542725,
+      "learning_rate": 0.002271,
+      "loss": 4.7549,
+      "step": 757
+    },
+    {
+      "epoch": 0.00758,
+      "grad_norm": 0.5403345227241516,
+      "learning_rate": 0.002274,
+      "loss": 4.7325,
+      "step": 758
+    },
+    {
+      "epoch": 0.00759,
+      "grad_norm": 0.6695131063461304,
+      "learning_rate": 0.002277,
+      "loss": 4.754,
+      "step": 759
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.8114665150642395,
+      "learning_rate": 0.00228,
+      "loss": 4.7449,
+      "step": 760
+    },
+    {
+      "epoch": 0.00761,
+      "grad_norm": 0.8682258129119873,
+      "learning_rate": 0.002283,
+      "loss": 4.7184,
+      "step": 761
+    },
+    {
+      "epoch": 0.00762,
+      "grad_norm": 0.9619300961494446,
+      "learning_rate": 0.0022860000000000003,
+      "loss": 4.7088,
+      "step": 762
+    },
+    {
+      "epoch": 0.00763,
+      "grad_norm": 1.0561281442642212,
+      "learning_rate": 0.0022890000000000002,
+      "loss": 4.7527,
+      "step": 763
+    },
+    {
+      "epoch": 0.00764,
+      "grad_norm": 1.0605075359344482,
+      "learning_rate": 0.002292,
+      "loss": 4.7609,
+      "step": 764
+    },
+    {
+      "epoch": 0.00765,
+      "grad_norm": 0.8591569066047668,
+      "learning_rate": 0.002295,
+      "loss": 4.7468,
+      "step": 765
+    },
+    {
+      "epoch": 0.00766,
+      "grad_norm": 0.8702619075775146,
+      "learning_rate": 0.002298,
+      "loss": 4.7378,
+      "step": 766
+    },
+    {
+      "epoch": 0.00767,
+      "grad_norm": 0.9817199110984802,
+      "learning_rate": 0.002301,
+      "loss": 4.7481,
+      "step": 767
+    },
+    {
+      "epoch": 0.00768,
+      "grad_norm": 1.0741162300109863,
+      "learning_rate": 0.002304,
+      "loss": 4.7612,
+      "step": 768
+    },
+    {
+      "epoch": 0.00769,
+      "grad_norm": 0.8188871145248413,
+      "learning_rate": 0.002307,
+      "loss": 4.7592,
+      "step": 769
+    },
+    {
+      "epoch": 0.0077,
+      "grad_norm": 0.7658451795578003,
+      "learning_rate": 0.00231,
+      "loss": 4.7148,
+      "step": 770
+    },
+    {
+      "epoch": 0.00771,
+      "grad_norm": 0.8762615323066711,
+      "learning_rate": 0.002313,
+      "loss": 4.754,
+      "step": 771
+    },
+    {
+      "epoch": 0.00772,
+      "grad_norm": 0.9279188513755798,
+      "learning_rate": 0.002316,
+      "loss": 4.7389,
+      "step": 772
+    },
+    {
+      "epoch": 0.00773,
+      "grad_norm": 0.8405673503875732,
+      "learning_rate": 0.0023190000000000003,
+      "loss": 4.737,
+      "step": 773
+    },
+    {
+      "epoch": 0.00774,
+      "grad_norm": 0.777439296245575,
+      "learning_rate": 0.0023220000000000003,
+      "loss": 4.7452,
+      "step": 774
+    },
+    {
+      "epoch": 0.00775,
+      "grad_norm": 0.8980410099029541,
+      "learning_rate": 0.0023250000000000002,
+      "loss": 4.7436,
+      "step": 775
+    },
+    {
+      "epoch": 0.00776,
+      "grad_norm": 1.0412962436676025,
+      "learning_rate": 0.002328,
+      "loss": 4.7539,
+      "step": 776
+    },
+    {
+      "epoch": 0.00777,
+      "grad_norm": 0.9281516075134277,
+      "learning_rate": 0.002331,
+      "loss": 4.7268,
+      "step": 777
+    },
+    {
+      "epoch": 0.00778,
+      "grad_norm": 0.7459467649459839,
+      "learning_rate": 0.002334,
+      "loss": 4.7271,
+      "step": 778
+    },
+    {
+      "epoch": 0.00779,
+      "grad_norm": 0.7607200145721436,
+      "learning_rate": 0.002337,
+      "loss": 4.689,
+      "step": 779
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 0.7529038190841675,
+      "learning_rate": 0.00234,
+      "loss": 4.706,
+      "step": 780
+    },
+    {
+      "epoch": 0.00781,
+      "grad_norm": 0.7776694297790527,
+      "learning_rate": 0.002343,
+      "loss": 4.7072,
+      "step": 781
+    },
+    {
+      "epoch": 0.00782,
+      "grad_norm": 0.7648219466209412,
+      "learning_rate": 0.002346,
+      "loss": 4.6998,
+      "step": 782
+    },
+    {
+      "epoch": 0.00783,
+      "grad_norm": 0.8795627355575562,
+      "learning_rate": 0.002349,
+      "loss": 4.6893,
+      "step": 783
+    },
+    {
+      "epoch": 0.00784,
+      "grad_norm": 0.9756646156311035,
+      "learning_rate": 0.002352,
+      "loss": 4.706,
+      "step": 784
+    },
+    {
+      "epoch": 0.00785,
+      "grad_norm": 1.03944993019104,
+      "learning_rate": 0.0023550000000000003,
+      "loss": 4.7094,
+      "step": 785
+    },
+    {
+      "epoch": 0.00786,
+      "grad_norm": 0.9842208027839661,
+      "learning_rate": 0.0023580000000000003,
+      "loss": 4.7094,
+      "step": 786
+    },
+    {
+      "epoch": 0.00787,
+      "grad_norm": 0.9913274049758911,
+      "learning_rate": 0.0023610000000000003,
+      "loss": 4.7124,
+      "step": 787
+    },
+    {
+      "epoch": 0.00788,
+      "grad_norm": 1.0203496217727661,
+      "learning_rate": 0.002364,
+      "loss": 4.711,
+      "step": 788
+    },
+    {
+      "epoch": 0.00789,
+      "grad_norm": 1.0524691343307495,
+      "learning_rate": 0.002367,
+      "loss": 4.7436,
+      "step": 789
+    },
+    {
+      "epoch": 0.0079,
+      "grad_norm": 0.8448042273521423,
+      "learning_rate": 0.00237,
+      "loss": 4.7139,
+      "step": 790
+    },
+    {
+      "epoch": 0.00791,
+      "grad_norm": 0.737777054309845,
+      "learning_rate": 0.002373,
+      "loss": 4.6607,
+      "step": 791
+    },
+    {
+      "epoch": 0.00792,
+      "grad_norm": 0.8730551600456238,
+      "learning_rate": 0.002376,
+      "loss": 4.6988,
+      "step": 792
+    },
+    {
+      "epoch": 0.00793,
+      "grad_norm": 1.0680596828460693,
+      "learning_rate": 0.002379,
+      "loss": 4.6938,
+      "step": 793
+    },
+    {
+      "epoch": 0.00794,
+      "grad_norm": 0.876390814781189,
+      "learning_rate": 0.002382,
+      "loss": 4.7016,
+      "step": 794
+    },
+    {
+      "epoch": 0.00795,
+      "grad_norm": 0.8010908961296082,
+      "learning_rate": 0.002385,
+      "loss": 4.6743,
+      "step": 795
+    },
+    {
+      "epoch": 0.00796,
+      "grad_norm": 0.8452677130699158,
+      "learning_rate": 0.0023880000000000004,
+      "loss": 4.6712,
+      "step": 796
+    },
+    {
+      "epoch": 0.00797,
+      "grad_norm": 0.7743445038795471,
+      "learning_rate": 0.0023910000000000003,
+      "loss": 4.7081,
+      "step": 797
+    },
+    {
+      "epoch": 0.00798,
+      "grad_norm": 0.7820720076560974,
+      "learning_rate": 0.0023940000000000003,
+      "loss": 4.6744,
+      "step": 798
+    },
+    {
+      "epoch": 0.00799,
+      "grad_norm": 0.8602663278579712,
+      "learning_rate": 0.0023970000000000003,
+      "loss": 4.703,
+      "step": 799
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.83580082654953,
+      "learning_rate": 0.0024000000000000002,
+      "loss": 4.6615,
+      "step": 800
+    },
+    {
+      "epoch": 0.00801,
+      "grad_norm": 0.8171262145042419,
+      "learning_rate": 0.002403,
+      "loss": 4.6877,
+      "step": 801
+    },
+    {
+      "epoch": 0.00802,
+      "grad_norm": 0.8578851819038391,
+      "learning_rate": 0.002406,
+      "loss": 4.6772,
+      "step": 802
+    },
+    {
+      "epoch": 0.00803,
+      "grad_norm": 0.9747350215911865,
+      "learning_rate": 0.002409,
+      "loss": 4.662,
+      "step": 803
+    },
+    {
+      "epoch": 0.00804,
+      "grad_norm": 0.9363577961921692,
+      "learning_rate": 0.002412,
+      "loss": 4.6595,
+      "step": 804
+    },
+    {
+      "epoch": 0.00805,
+      "grad_norm": 0.9166631102561951,
+      "learning_rate": 0.002415,
+      "loss": 4.6601,
+      "step": 805
+    },
+    {
+      "epoch": 0.00806,
+      "grad_norm": 0.860556960105896,
+      "learning_rate": 0.002418,
+      "loss": 4.681,
+      "step": 806
+    },
+    {
+      "epoch": 0.00807,
+      "grad_norm": 0.8737242817878723,
+      "learning_rate": 0.0024210000000000004,
+      "loss": 4.6779,
+      "step": 807
+    },
+    {
+      "epoch": 0.00808,
+      "grad_norm": 0.8125926852226257,
+      "learning_rate": 0.0024240000000000004,
+      "loss": 4.6555,
+      "step": 808
+    },
+    {
+      "epoch": 0.00809,
+      "grad_norm": 0.9335473775863647,
+      "learning_rate": 0.0024270000000000003,
+      "loss": 4.6575,
+      "step": 809
+    },
+    {
+      "epoch": 0.0081,
+      "grad_norm": 0.7656557559967041,
+      "learning_rate": 0.0024300000000000003,
+      "loss": 4.6752,
+      "step": 810
+    },
+    {
+      "epoch": 0.00811,
+      "grad_norm": 0.8129197359085083,
+      "learning_rate": 0.0024330000000000003,
+      "loss": 4.6545,
+      "step": 811
+    },
+    {
+      "epoch": 0.00812,
+      "grad_norm": 0.7713532447814941,
+      "learning_rate": 0.0024360000000000002,
+      "loss": 4.6279,
+      "step": 812
+    },
+    {
+      "epoch": 0.00813,
+      "grad_norm": 0.8149270415306091,
+      "learning_rate": 0.0024389999999999998,
+      "loss": 4.6613,
+      "step": 813
+    },
+    {
+      "epoch": 0.00814,
+      "grad_norm": 0.866010308265686,
+      "learning_rate": 0.0024419999999999997,
+      "loss": 4.6696,
+      "step": 814
+    },
+    {
+      "epoch": 0.00815,
+      "grad_norm": 0.7402296662330627,
+      "learning_rate": 0.0024449999999999997,
+      "loss": 4.6746,
+      "step": 815
+    },
+    {
+      "epoch": 0.00816,
+      "grad_norm": 0.6451212167739868,
+      "learning_rate": 0.002448,
+      "loss": 4.6403,
+      "step": 816
+    },
+    {
+      "epoch": 0.00817,
+      "grad_norm": 0.7935466170310974,
+      "learning_rate": 0.002451,
+      "loss": 4.6538,
+      "step": 817
+    },
+    {
+      "epoch": 0.00818,
+      "grad_norm": 1.0368677377700806,
+      "learning_rate": 0.002454,
+      "loss": 4.6444,
+      "step": 818
+    },
+    {
+      "epoch": 0.00819,
+      "grad_norm": 1.1921635866165161,
+      "learning_rate": 0.002457,
+      "loss": 4.6695,
+      "step": 819
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 0.9146779179573059,
+      "learning_rate": 0.00246,
+      "loss": 4.6473,
+      "step": 820
+    },
+    {
+      "epoch": 0.00821,
+      "grad_norm": 0.8097479939460754,
+      "learning_rate": 0.002463,
+      "loss": 4.6216,
+      "step": 821
+    },
+    {
+      "epoch": 0.00822,
+      "grad_norm": 0.8611756563186646,
+      "learning_rate": 0.002466,
+      "loss": 4.6241,
+      "step": 822
+    },
+    {
+      "epoch": 0.00823,
+      "grad_norm": 0.9131811857223511,
+      "learning_rate": 0.002469,
+      "loss": 4.6307,
+      "step": 823
+    },
+    {
+      "epoch": 0.00824,
+      "grad_norm": 0.9472024440765381,
+      "learning_rate": 0.002472,
+      "loss": 4.6551,
+      "step": 824
+    },
+    {
+      "epoch": 0.00825,
+      "grad_norm": 1.1296700239181519,
+      "learning_rate": 0.0024749999999999998,
+      "loss": 4.6738,
+      "step": 825
+    },
+    {
+      "epoch": 0.00826,
+      "grad_norm": 0.9912372827529907,
+      "learning_rate": 0.0024779999999999997,
+      "loss": 4.6298,
+      "step": 826
+    },
+    {
+      "epoch": 0.00827,
+      "grad_norm": 0.9669742584228516,
+      "learning_rate": 0.002481,
+      "loss": 4.6558,
+      "step": 827
+    },
+    {
+      "epoch": 0.00828,
+      "grad_norm": 0.7501344680786133,
+      "learning_rate": 0.002484,
+      "loss": 4.6425,
+      "step": 828
+    },
+    {
+      "epoch": 0.00829,
+      "grad_norm": 0.6985933184623718,
+      "learning_rate": 0.002487,
+      "loss": 4.6351,
+      "step": 829
+    },
+    {
+      "epoch": 0.0083,
+      "grad_norm": 0.6681656837463379,
+      "learning_rate": 0.00249,
+      "loss": 4.6406,
+      "step": 830
+    },
+    {
+      "epoch": 0.00831,
+      "grad_norm": 0.7142512202262878,
+      "learning_rate": 0.002493,
+      "loss": 4.6218,
+      "step": 831
+    },
+    {
+      "epoch": 0.00832,
+      "grad_norm": 0.7265256643295288,
+      "learning_rate": 0.002496,
+      "loss": 4.6212,
+      "step": 832
+    },
+    {
+      "epoch": 0.00833,
+      "grad_norm": 0.7133427262306213,
+      "learning_rate": 0.002499,
+      "loss": 4.6125,
+      "step": 833
+    },
+    {
+      "epoch": 0.00834,
+      "grad_norm": 0.7037473917007446,
+      "learning_rate": 0.002502,
+      "loss": 4.6151,
+      "step": 834
+    },
+    {
+      "epoch": 0.00835,
+      "grad_norm": 0.9004167914390564,
+      "learning_rate": 0.002505,
+      "loss": 4.6154,
+      "step": 835
+    },
+    {
+      "epoch": 0.00836,
+      "grad_norm": 0.9459953904151917,
+      "learning_rate": 0.002508,
+      "loss": 4.6497,
+      "step": 836
+    },
+    {
+      "epoch": 0.00837,
+      "grad_norm": 0.9802148342132568,
+      "learning_rate": 0.0025109999999999998,
+      "loss": 4.5975,
+      "step": 837
+    },
+    {
+      "epoch": 0.00838,
+      "grad_norm": 0.8809778690338135,
+      "learning_rate": 0.0025139999999999997,
+      "loss": 4.6166,
+      "step": 838
+    },
+    {
+      "epoch": 0.00839,
+      "grad_norm": 0.792102038860321,
+      "learning_rate": 0.002517,
+      "loss": 4.6155,
+      "step": 839
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.7744638323783875,
+      "learning_rate": 0.00252,
+      "loss": 4.5999,
+      "step": 840
+    },
+    {
+      "epoch": 0.00841,
+      "grad_norm": 0.791641116142273,
+      "learning_rate": 0.002523,
+      "loss": 4.5943,
+      "step": 841
+    },
+    {
+      "epoch": 0.00842,
+      "grad_norm": 0.8285142183303833,
+      "learning_rate": 0.002526,
+      "loss": 4.5937,
+      "step": 842
+    },
+    {
+      "epoch": 0.00843,
+      "grad_norm": 0.9900093078613281,
+      "learning_rate": 0.002529,
+      "loss": 4.6057,
+      "step": 843
+    },
+    {
+      "epoch": 0.00844,
+      "grad_norm": 0.984833300113678,
+      "learning_rate": 0.002532,
+      "loss": 4.6269,
+      "step": 844
+    },
+    {
+      "epoch": 0.00845,
+      "grad_norm": 0.8697680234909058,
+      "learning_rate": 0.002535,
+      "loss": 4.5878,
+      "step": 845
+    },
+    {
+      "epoch": 0.00846,
+      "grad_norm": 0.8182123899459839,
+      "learning_rate": 0.002538,
+      "loss": 4.6097,
+      "step": 846
+    },
+    {
+      "epoch": 0.00847,
+      "grad_norm": 0.7307525277137756,
+      "learning_rate": 0.002541,
+      "loss": 4.5888,
+      "step": 847
+    },
+    {
+      "epoch": 0.00848,
+      "grad_norm": 0.7021674513816833,
+      "learning_rate": 0.002544,
+      "loss": 4.5844,
+      "step": 848
+    },
+    {
+      "epoch": 0.00849,
+      "grad_norm": 0.6287952661514282,
+      "learning_rate": 0.002547,
+      "loss": 4.5759,
+      "step": 849
+    },
+    {
+      "epoch": 0.0085,
+      "grad_norm": 0.5557106733322144,
+      "learning_rate": 0.00255,
+      "loss": 4.5834,
+      "step": 850
+    },
+    {
+      "epoch": 0.00851,
+      "grad_norm": 0.5636370778083801,
+      "learning_rate": 0.002553,
+      "loss": 4.5977,
+      "step": 851
+    },
+    {
+      "epoch": 0.00852,
+      "grad_norm": 0.6509172320365906,
+      "learning_rate": 0.002556,
+      "loss": 4.5892,
+      "step": 852
+    },
+    {
+      "epoch": 0.00853,
+      "grad_norm": 0.6835383176803589,
+      "learning_rate": 0.002559,
+      "loss": 4.5743,
+      "step": 853
+    },
+    {
+      "epoch": 0.00854,
+      "grad_norm": 0.6499077081680298,
+      "learning_rate": 0.002562,
+      "loss": 4.5629,
+      "step": 854
+    },
+    {
+      "epoch": 0.00855,
+      "grad_norm": 0.5777466297149658,
+      "learning_rate": 0.002565,
+      "loss": 4.5388,
+      "step": 855
+    },
+    {
+      "epoch": 0.00856,
+      "grad_norm": 0.6060221195220947,
+      "learning_rate": 0.002568,
+      "loss": 4.5724,
+      "step": 856
+    },
+    {
+      "epoch": 0.00857,
+      "grad_norm": 0.6906097531318665,
+      "learning_rate": 0.002571,
+      "loss": 4.5615,
+      "step": 857
+    },
+    {
+      "epoch": 0.00858,
+      "grad_norm": 0.7318242788314819,
+      "learning_rate": 0.002574,
+      "loss": 4.5575,
+      "step": 858
+    },
+    {
+      "epoch": 0.00859,
+      "grad_norm": 0.6517086029052734,
+      "learning_rate": 0.002577,
+      "loss": 4.5498,
+      "step": 859
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 0.6514325737953186,
+      "learning_rate": 0.00258,
+      "loss": 4.5756,
+      "step": 860
+    },
+    {
+      "epoch": 0.00861,
+      "grad_norm": 0.7899180054664612,
+      "learning_rate": 0.0025830000000000002,
+      "loss": 4.594,
+      "step": 861
+    },
+    {
+      "epoch": 0.00862,
+      "grad_norm": 0.874433696269989,
+      "learning_rate": 0.002586,
+      "loss": 4.5519,
+      "step": 862
+    },
+    {
+      "epoch": 0.00863,
+      "grad_norm": 0.8605120182037354,
+      "learning_rate": 0.002589,
+      "loss": 4.5772,
+      "step": 863
+    },
+    {
+      "epoch": 0.00864,
+      "grad_norm": 0.9631415009498596,
+      "learning_rate": 0.002592,
+      "loss": 4.5403,
+      "step": 864
+    },
+    {
+      "epoch": 0.00865,
+      "grad_norm": 1.1293022632598877,
+      "learning_rate": 0.002595,
+      "loss": 4.5658,
+      "step": 865
+    },
+    {
+      "epoch": 0.00866,
+      "grad_norm": 1.0691903829574585,
+      "learning_rate": 0.002598,
+      "loss": 4.5962,
+      "step": 866
+    },
+    {
+      "epoch": 0.00867,
+      "grad_norm": 0.9885998368263245,
+      "learning_rate": 0.002601,
+      "loss": 4.5818,
+      "step": 867
+    },
+    {
+      "epoch": 0.00868,
+      "grad_norm": 1.3688061237335205,
+      "learning_rate": 0.002604,
+      "loss": 4.5938,
+      "step": 868
+    },
+    {
+      "epoch": 0.00869,
+      "grad_norm": 0.8156008124351501,
+      "learning_rate": 0.002607,
+      "loss": 4.5787,
+      "step": 869
+    },
+    {
+      "epoch": 0.0087,
+      "grad_norm": 0.8631764054298401,
+      "learning_rate": 0.00261,
+      "loss": 4.5676,
+      "step": 870
+    },
+    {
+      "epoch": 0.00871,
+      "grad_norm": 0.9996145963668823,
+      "learning_rate": 0.002613,
+      "loss": 4.5932,
+      "step": 871
+    },
+    {
+      "epoch": 0.00872,
+      "grad_norm": 0.972501814365387,
+      "learning_rate": 0.002616,
+      "loss": 4.5764,
+      "step": 872
+    },
+    {
+      "epoch": 0.00873,
+      "grad_norm": 1.0140340328216553,
+      "learning_rate": 0.0026190000000000002,
+      "loss": 4.5878,
+      "step": 873
+    },
+    {
+      "epoch": 0.00874,
+      "grad_norm": 0.9380632042884827,
+      "learning_rate": 0.002622,
+      "loss": 4.6105,
+      "step": 874
+    },
+    {
+      "epoch": 0.00875,
+      "grad_norm": 0.9407688975334167,
+      "learning_rate": 0.002625,
+      "loss": 4.5588,
+      "step": 875
+    },
+    {
+      "epoch": 0.00876,
+      "grad_norm": 1.0034210681915283,
+      "learning_rate": 0.002628,
+      "loss": 4.5949,
+      "step": 876
+    },
+    {
+      "epoch": 0.00877,
+      "grad_norm": 1.2963709831237793,
+      "learning_rate": 0.002631,
+      "loss": 4.6268,
+      "step": 877
+    },
+    {
+      "epoch": 0.00878,
+      "grad_norm": 0.7438150644302368,
+      "learning_rate": 0.002634,
+      "loss": 4.537,
+      "step": 878
+    },
+    {
+      "epoch": 0.00879,
+      "grad_norm": 0.8438600301742554,
+      "learning_rate": 0.002637,
+      "loss": 4.5729,
+      "step": 879
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.7277801036834717,
+      "learning_rate": 0.00264,
+      "loss": 4.5704,
+      "step": 880
+    },
+    {
+      "epoch": 0.00881,
+      "grad_norm": 0.9043455123901367,
+      "learning_rate": 0.002643,
+      "loss": 4.5794,
+      "step": 881
+    },
+    {
+      "epoch": 0.00882,
+      "grad_norm": 0.9441054463386536,
+      "learning_rate": 0.002646,
+      "loss": 4.5807,
+      "step": 882
+    },
+    {
+      "epoch": 0.00883,
+      "grad_norm": 0.8490539193153381,
+      "learning_rate": 0.002649,
+      "loss": 4.5704,
+      "step": 883
+    },
+    {
+      "epoch": 0.00884,
+      "grad_norm": 0.8639208078384399,
+      "learning_rate": 0.0026520000000000003,
+      "loss": 4.555,
+      "step": 884
+    },
+    {
+      "epoch": 0.00885,
+      "grad_norm": 0.8000319600105286,
+      "learning_rate": 0.0026550000000000002,
+      "loss": 4.5531,
+      "step": 885
+    },
+    {
+      "epoch": 0.00886,
+      "grad_norm": 0.8000409007072449,
+      "learning_rate": 0.002658,
+      "loss": 4.5702,
+      "step": 886
+    },
+    {
+      "epoch": 0.00887,
+      "grad_norm": 0.5944011807441711,
+      "learning_rate": 0.002661,
+      "loss": 4.5499,
+      "step": 887
+    },
+    {
+      "epoch": 0.00888,
+      "grad_norm": 0.5328640341758728,
+      "learning_rate": 0.002664,
+      "loss": 4.5494,
+      "step": 888
+    },
+    {
+      "epoch": 0.00889,
+      "grad_norm": 0.5338924527168274,
+      "learning_rate": 0.002667,
+      "loss": 4.5018,
+      "step": 889
+    },
+    {
+      "epoch": 0.0089,
+      "grad_norm": 0.4938536286354065,
+      "learning_rate": 0.00267,
+      "loss": 4.545,
+      "step": 890
+    },
+    {
+      "epoch": 0.00891,
+      "grad_norm": 0.4882456064224243,
+      "learning_rate": 0.002673,
+      "loss": 4.5301,
+      "step": 891
+    },
+    {
+      "epoch": 0.00892,
+      "grad_norm": 0.4257451891899109,
+      "learning_rate": 0.002676,
+      "loss": 4.5393,
+      "step": 892
+    },
+    {
+      "epoch": 0.00893,
+      "grad_norm": 0.5505130290985107,
+      "learning_rate": 0.002679,
+      "loss": 4.5171,
+      "step": 893
+    },
+    {
+      "epoch": 0.00894,
+      "grad_norm": 0.6718395352363586,
+      "learning_rate": 0.002682,
+      "loss": 4.5132,
+      "step": 894
+    },
+    {
+      "epoch": 0.00895,
+      "grad_norm": 0.7553327679634094,
+      "learning_rate": 0.0026850000000000003,
+      "loss": 4.5223,
+      "step": 895
+    },
+    {
+      "epoch": 0.00896,
+      "grad_norm": 0.7149863243103027,
+      "learning_rate": 0.0026880000000000003,
+      "loss": 4.5119,
+      "step": 896
+    },
+    {
+      "epoch": 0.00897,
+      "grad_norm": 0.8281179070472717,
+      "learning_rate": 0.0026910000000000002,
+      "loss": 4.5328,
+      "step": 897
+    },
+    {
+      "epoch": 0.00898,
+      "grad_norm": 0.9519450068473816,
+      "learning_rate": 0.002694,
+      "loss": 4.5342,
+      "step": 898
+    },
+    {
+      "epoch": 0.00899,
+      "grad_norm": 0.9227859377861023,
+      "learning_rate": 0.002697,
+      "loss": 4.5445,
+      "step": 899
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 1.040459156036377,
+      "learning_rate": 0.0027,
+      "loss": 4.5756,
+      "step": 900
+    },
+    {
+      "epoch": 0.00901,
+      "grad_norm": 0.8912736773490906,
+      "learning_rate": 0.002703,
+      "loss": 4.5234,
+      "step": 901
+    },
+    {
+      "epoch": 0.00902,
+      "grad_norm": 0.768046498298645,
+      "learning_rate": 0.002706,
+      "loss": 4.5317,
+      "step": 902
+    },
+    {
+      "epoch": 0.00903,
+      "grad_norm": 0.698384702205658,
+      "learning_rate": 0.002709,
+      "loss": 4.5227,
+      "step": 903
+    },
+    {
+      "epoch": 0.00904,
+      "grad_norm": 0.7553470134735107,
+      "learning_rate": 0.002712,
+      "loss": 4.5046,
+      "step": 904
+    },
+    {
+      "epoch": 0.00905,
+      "grad_norm": 0.9005333185195923,
+      "learning_rate": 0.002715,
+      "loss": 4.5273,
+      "step": 905
+    },
+    {
+      "epoch": 0.00906,
+      "grad_norm": 0.9150082468986511,
+      "learning_rate": 0.002718,
+      "loss": 4.54,
+      "step": 906
+    },
+    {
+      "epoch": 0.00907,
+      "grad_norm": 0.8349012732505798,
+      "learning_rate": 0.0027210000000000003,
+      "loss": 4.5543,
+      "step": 907
+    },
+    {
+      "epoch": 0.00908,
+      "grad_norm": 0.7856695055961609,
+      "learning_rate": 0.0027240000000000003,
+      "loss": 4.5219,
+      "step": 908
+    },
+    {
+      "epoch": 0.00909,
+      "grad_norm": 0.8433918952941895,
+      "learning_rate": 0.0027270000000000003,
+      "loss": 4.5423,
+      "step": 909
+    },
+    {
+      "epoch": 0.0091,
+      "grad_norm": 0.9650413393974304,
+      "learning_rate": 0.0027300000000000002,
+      "loss": 4.5577,
+      "step": 910
+    },
+    {
+      "epoch": 0.00911,
+      "grad_norm": 0.9894043803215027,
+      "learning_rate": 0.002733,
+      "loss": 4.514,
+      "step": 911
+    },
+    {
+      "epoch": 0.00912,
+      "grad_norm": 0.9117268919944763,
+      "learning_rate": 0.002736,
+      "loss": 4.5245,
+      "step": 912
+    },
+    {
+      "epoch": 0.00913,
+      "grad_norm": 1.020936369895935,
+      "learning_rate": 0.002739,
+      "loss": 4.5244,
+      "step": 913
+    },
+    {
+      "epoch": 0.00914,
+      "grad_norm": 1.1810083389282227,
+      "learning_rate": 0.002742,
+      "loss": 4.5723,
+      "step": 914
+    },
+    {
+      "epoch": 0.00915,
+      "grad_norm": 0.8899980187416077,
+      "learning_rate": 0.002745,
+      "loss": 4.5252,
+      "step": 915
+    },
+    {
+      "epoch": 0.00916,
+      "grad_norm": 0.9896888136863708,
+      "learning_rate": 0.002748,
+      "loss": 4.5028,
+      "step": 916
+    },
+    {
+      "epoch": 0.00917,
+      "grad_norm": 1.3935941457748413,
+      "learning_rate": 0.002751,
+      "loss": 4.5471,
+      "step": 917
+    },
+    {
+      "epoch": 0.00918,
+      "grad_norm": 1.1309375762939453,
+      "learning_rate": 0.0027540000000000004,
+      "loss": 4.562,
+      "step": 918
+    },
+    {
+      "epoch": 0.00919,
+      "grad_norm": 0.8358169198036194,
+      "learning_rate": 0.0027570000000000003,
+      "loss": 4.5255,
+      "step": 919
+    },
+    {
+      "epoch": 0.0092,
+      "grad_norm": 0.8860310316085815,
+      "learning_rate": 0.0027600000000000003,
+      "loss": 4.5423,
+      "step": 920
+    },
+    {
+      "epoch": 0.00921,
+      "grad_norm": 1.0510568618774414,
+      "learning_rate": 0.0027630000000000003,
+      "loss": 4.5336,
+      "step": 921
+    },
+    {
+      "epoch": 0.00922,
+      "grad_norm": 1.1311016082763672,
+      "learning_rate": 0.0027660000000000002,
+      "loss": 4.5483,
+      "step": 922
+    },
+    {
+      "epoch": 0.00923,
+      "grad_norm": 1.1111584901809692,
+      "learning_rate": 0.002769,
+      "loss": 4.5339,
+      "step": 923
+    },
+    {
+      "epoch": 0.00924,
+      "grad_norm": 0.9117261171340942,
+      "learning_rate": 0.002772,
+      "loss": 4.5412,
+      "step": 924
+    },
+    {
+      "epoch": 0.00925,
+      "grad_norm": 0.9757253527641296,
+      "learning_rate": 0.002775,
+      "loss": 4.5263,
+      "step": 925
+    },
+    {
+      "epoch": 0.00926,
+      "grad_norm": 0.907768189907074,
+      "learning_rate": 0.002778,
+      "loss": 4.5422,
+      "step": 926
+    },
+    {
+      "epoch": 0.00927,
+      "grad_norm": 0.8181371092796326,
+      "learning_rate": 0.002781,
+      "loss": 4.4999,
+      "step": 927
+    },
+    {
+      "epoch": 0.00928,
+      "grad_norm": 0.7728373408317566,
+      "learning_rate": 0.002784,
+      "loss": 4.5461,
+      "step": 928
+    },
+    {
+      "epoch": 0.00929,
+      "grad_norm": 0.624686598777771,
+      "learning_rate": 0.0027870000000000004,
+      "loss": 4.5192,
+      "step": 929
+    },
+    {
+      "epoch": 0.0093,
+      "grad_norm": 0.6143611073493958,
+      "learning_rate": 0.0027900000000000004,
+      "loss": 4.4968,
+      "step": 930
+    },
+    {
+      "epoch": 0.00931,
+      "grad_norm": 0.5353983044624329,
+      "learning_rate": 0.0027930000000000003,
+      "loss": 4.5041,
+      "step": 931
+    },
+    {
+      "epoch": 0.00932,
+      "grad_norm": 0.5034843683242798,
+      "learning_rate": 0.0027960000000000003,
+      "loss": 4.5081,
+      "step": 932
+    },
+    {
+      "epoch": 0.00933,
+      "grad_norm": 0.43557336926460266,
+      "learning_rate": 0.0027990000000000003,
+      "loss": 4.47,
+      "step": 933
+    },
+    {
+      "epoch": 0.00934,
+      "grad_norm": 0.42429375648498535,
+      "learning_rate": 0.0028020000000000002,
+      "loss": 4.4927,
+      "step": 934
+    },
+    {
+      "epoch": 0.00935,
+      "grad_norm": 0.4439206123352051,
+      "learning_rate": 0.002805,
+      "loss": 4.5015,
+      "step": 935
+    },
+    {
+      "epoch": 0.00936,
+      "grad_norm": 0.4363570511341095,
+      "learning_rate": 0.002808,
+      "loss": 4.4989,
+      "step": 936
+    },
+    {
+      "epoch": 0.00937,
+      "grad_norm": 0.4996969401836395,
+      "learning_rate": 0.002811,
+      "loss": 4.5097,
+      "step": 937
+    },
+    {
+      "epoch": 0.00938,
+      "grad_norm": 0.5577415227890015,
+      "learning_rate": 0.002814,
+      "loss": 4.4882,
+      "step": 938
+    },
+    {
+      "epoch": 0.00939,
+      "grad_norm": 0.6116971373558044,
+      "learning_rate": 0.002817,
+      "loss": 4.4823,
+      "step": 939
+    },
+    {
+      "epoch": 0.0094,
+      "grad_norm": 0.675699770450592,
+      "learning_rate": 0.00282,
+      "loss": 4.4838,
+      "step": 940
+    },
+    {
+      "epoch": 0.00941,
+      "grad_norm": 0.8027246594429016,
+      "learning_rate": 0.002823,
+      "loss": 4.4671,
+      "step": 941
+    },
+    {
+      "epoch": 0.00942,
+      "grad_norm": 0.9442744851112366,
+      "learning_rate": 0.002826,
+      "loss": 4.5129,
+      "step": 942
+    },
+    {
+      "epoch": 0.00943,
+      "grad_norm": 0.963239312171936,
+      "learning_rate": 0.002829,
+      "loss": 4.4975,
+      "step": 943
+    },
+    {
+      "epoch": 0.00944,
+      "grad_norm": 0.8721352815628052,
+      "learning_rate": 0.002832,
+      "loss": 4.493,
+      "step": 944
+    },
+    {
+      "epoch": 0.00945,
+      "grad_norm": 1.0316184759140015,
+      "learning_rate": 0.002835,
+      "loss": 4.5084,
+      "step": 945
+    },
+    {
+      "epoch": 0.00946,
+      "grad_norm": 0.7907041907310486,
+      "learning_rate": 0.002838,
+      "loss": 4.5093,
+      "step": 946
+    },
+    {
+      "epoch": 0.00947,
+      "grad_norm": 0.8508433699607849,
+      "learning_rate": 0.0028409999999999998,
+      "loss": 4.4859,
+      "step": 947
+    },
+    {
+      "epoch": 0.00948,
+      "grad_norm": 0.8942288756370544,
+      "learning_rate": 0.0028439999999999997,
+      "loss": 4.4805,
+      "step": 948
+    },
+    {
+      "epoch": 0.00949,
+      "grad_norm": 0.8804354071617126,
+      "learning_rate": 0.002847,
+      "loss": 4.5178,
+      "step": 949
+    },
+    {
+      "epoch": 0.0095,
+      "grad_norm": 0.9102524518966675,
+      "learning_rate": 0.00285,
+      "loss": 4.5104,
+      "step": 950
+    },
+    {
+      "epoch": 0.00951,
+      "grad_norm": 0.9140040278434753,
+      "learning_rate": 0.002853,
+      "loss": 4.4913,
+      "step": 951
+    },
+    {
+      "epoch": 0.00952,
+      "grad_norm": 0.996699869632721,
+      "learning_rate": 0.002856,
+      "loss": 4.5192,
+      "step": 952
+    },
+    {
+      "epoch": 0.00953,
+      "grad_norm": 1.0743249654769897,
+      "learning_rate": 0.002859,
+      "loss": 4.4742,
+      "step": 953
+    },
+    {
+      "epoch": 0.00954,
+      "grad_norm": 0.9619385004043579,
+      "learning_rate": 0.002862,
+      "loss": 4.532,
+      "step": 954
+    },
+    {
+      "epoch": 0.00955,
+      "grad_norm": 0.9820901155471802,
+      "learning_rate": 0.002865,
+      "loss": 4.5178,
+      "step": 955
+    },
+    {
+      "epoch": 0.00956,
+      "grad_norm": 1.0036413669586182,
+      "learning_rate": 0.002868,
+      "loss": 4.501,
+      "step": 956
+    },
+    {
+      "epoch": 0.00957,
+      "grad_norm": 0.8994410634040833,
+      "learning_rate": 0.002871,
+      "loss": 4.5266,
+      "step": 957
+    },
+    {
+      "epoch": 0.00958,
+      "grad_norm": 0.910679817199707,
+      "learning_rate": 0.002874,
+      "loss": 4.5103,
+      "step": 958
+    },
+    {
+      "epoch": 0.00959,
+      "grad_norm": 0.7567980885505676,
+      "learning_rate": 0.002877,
+      "loss": 4.4958,
+      "step": 959
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.7841255068778992,
+      "learning_rate": 0.0028799999999999997,
+      "loss": 4.4967,
+      "step": 960
+    },
+    {
+      "epoch": 0.00961,
+      "grad_norm": 0.6947323083877563,
+      "learning_rate": 0.002883,
+      "loss": 4.4665,
+      "step": 961
+    },
+    {
+      "epoch": 0.00962,
+      "grad_norm": 0.5831562280654907,
+      "learning_rate": 0.002886,
+      "loss": 4.4887,
+      "step": 962
+    },
+    {
+      "epoch": 0.00963,
+      "grad_norm": 0.5601492524147034,
+      "learning_rate": 0.002889,
+      "loss": 4.4846,
+      "step": 963
+    },
+    {
+      "epoch": 0.00964,
+      "grad_norm": 0.474755197763443,
+      "learning_rate": 0.002892,
+      "loss": 4.47,
+      "step": 964
+    },
+    {
+      "epoch": 0.00965,
+      "grad_norm": 0.5203716158866882,
+      "learning_rate": 0.002895,
+      "loss": 4.4451,
+      "step": 965
+    },
+    {
+      "epoch": 0.00966,
+      "grad_norm": 0.4714201092720032,
+      "learning_rate": 0.002898,
+      "loss": 4.4572,
+      "step": 966
+    },
+    {
+      "epoch": 0.00967,
+      "grad_norm": 0.4852457344532013,
+      "learning_rate": 0.002901,
+      "loss": 4.4924,
+      "step": 967
+    },
+    {
+      "epoch": 0.00968,
+      "grad_norm": 0.5026896595954895,
+      "learning_rate": 0.002904,
+      "loss": 4.4559,
+      "step": 968
+    },
+    {
+      "epoch": 0.00969,
+      "grad_norm": 0.5138400793075562,
+      "learning_rate": 0.002907,
+      "loss": 4.4843,
+      "step": 969
+    },
+    {
+      "epoch": 0.0097,
+      "grad_norm": 0.46154963970184326,
+      "learning_rate": 0.00291,
+      "loss": 4.4402,
+      "step": 970
+    },
+    {
+      "epoch": 0.00971,
+      "grad_norm": 0.4250973165035248,
+      "learning_rate": 0.002913,
+      "loss": 4.4437,
+      "step": 971
+    },
+    {
+      "epoch": 0.00972,
+      "grad_norm": 0.46983814239501953,
+      "learning_rate": 0.002916,
+      "loss": 4.45,
+      "step": 972
+    },
+    {
+      "epoch": 0.00973,
+      "grad_norm": 0.578143835067749,
+      "learning_rate": 0.002919,
+      "loss": 4.4478,
+      "step": 973
+    },
+    {
+      "epoch": 0.00974,
+      "grad_norm": 0.7184126377105713,
+      "learning_rate": 0.002922,
+      "loss": 4.4366,
+      "step": 974
+    },
+    {
+      "epoch": 0.00975,
+      "grad_norm": 0.8322773575782776,
+      "learning_rate": 0.002925,
+      "loss": 4.4683,
+      "step": 975
+    },
+    {
+      "epoch": 0.00976,
+      "grad_norm": 0.7910879850387573,
+      "learning_rate": 0.002928,
+      "loss": 4.4497,
+      "step": 976
+    },
+    {
+      "epoch": 0.00977,
+      "grad_norm": 0.7662490606307983,
+      "learning_rate": 0.002931,
+      "loss": 4.4868,
+      "step": 977
+    },
+    {
+      "epoch": 0.00978,
+      "grad_norm": 0.6879754066467285,
+      "learning_rate": 0.002934,
+      "loss": 4.4568,
+      "step": 978
+    },
+    {
+      "epoch": 0.00979,
+      "grad_norm": 0.7417387366294861,
+      "learning_rate": 0.002937,
+      "loss": 4.4491,
+      "step": 979
+    },
+    {
+      "epoch": 0.0098,
+      "grad_norm": 0.6087706089019775,
+      "learning_rate": 0.00294,
+      "loss": 4.4405,
+      "step": 980
+    },
+    {
+      "epoch": 0.00981,
+      "grad_norm": 0.5661059617996216,
+      "learning_rate": 0.002943,
+      "loss": 4.4516,
+      "step": 981
+    },
+    {
+      "epoch": 0.00982,
+      "grad_norm": 0.5970472097396851,
+      "learning_rate": 0.002946,
+      "loss": 4.4435,
+      "step": 982
+    },
+    {
+      "epoch": 0.00983,
+      "grad_norm": 0.6391454935073853,
+      "learning_rate": 0.0029490000000000002,
+      "loss": 4.4444,
+      "step": 983
+    },
+    {
+      "epoch": 0.00984,
+      "grad_norm": 0.5948253870010376,
+      "learning_rate": 0.002952,
+      "loss": 4.4303,
+      "step": 984
+    },
+    {
+      "epoch": 0.00985,
+      "grad_norm": 0.5908463597297668,
+      "learning_rate": 0.002955,
+      "loss": 4.4509,
+      "step": 985
+    },
+    {
+      "epoch": 0.00986,
+      "grad_norm": 0.6963019371032715,
+      "learning_rate": 0.002958,
+      "loss": 4.4177,
+      "step": 986
+    },
+    {
+      "epoch": 0.00987,
+      "grad_norm": 0.7601437568664551,
+      "learning_rate": 0.002961,
+      "loss": 4.4332,
+      "step": 987
+    },
+    {
+      "epoch": 0.00988,
+      "grad_norm": 0.8287732005119324,
+      "learning_rate": 0.002964,
+      "loss": 4.4707,
+      "step": 988
+    },
+    {
+      "epoch": 0.00989,
+      "grad_norm": 0.9172109961509705,
+      "learning_rate": 0.002967,
+      "loss": 4.4168,
+      "step": 989
+    },
+    {
+      "epoch": 0.0099,
+      "grad_norm": 0.9710732102394104,
+      "learning_rate": 0.00297,
+      "loss": 4.4601,
+      "step": 990
+    },
+    {
+      "epoch": 0.00991,
+      "grad_norm": 0.9308454990386963,
+      "learning_rate": 0.002973,
+      "loss": 4.4596,
+      "step": 991
+    },
+    {
+      "epoch": 0.00992,
+      "grad_norm": 0.8575865030288696,
+      "learning_rate": 0.002976,
+      "loss": 4.4567,
+      "step": 992
+    },
+    {
+      "epoch": 0.00993,
+      "grad_norm": 0.8846513032913208,
+      "learning_rate": 0.002979,
+      "loss": 4.4578,
+      "step": 993
+    },
+    {
+      "epoch": 0.00994,
+      "grad_norm": 1.0507467985153198,
+      "learning_rate": 0.002982,
+      "loss": 4.4695,
+      "step": 994
+    },
+    {
+      "epoch": 0.00995,
+      "grad_norm": 0.9225064516067505,
+      "learning_rate": 0.0029850000000000002,
+      "loss": 4.472,
+      "step": 995
+    },
+    {
+      "epoch": 0.00996,
+      "grad_norm": 1.0122137069702148,
+      "learning_rate": 0.002988,
+      "loss": 4.4787,
+      "step": 996
+    },
+    {
+      "epoch": 0.00997,
+      "grad_norm": 1.1055110692977905,
+      "learning_rate": 0.002991,
+      "loss": 4.4822,
+      "step": 997
+    },
+    {
+      "epoch": 0.00998,
+      "grad_norm": 1.0749611854553223,
+      "learning_rate": 0.002994,
+      "loss": 4.4866,
+      "step": 998
+    },
+    {
+      "epoch": 0.00999,
+      "grad_norm": 1.0997530221939087,
+      "learning_rate": 0.002997,
+      "loss": 4.469,
+      "step": 999
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.853782057762146,
+      "learning_rate": 0.003,
+      "loss": 4.4844,
+      "step": 1000
+    },
+    {
+      "epoch": 0.01001,
+      "grad_norm": 0.7132817506790161,
+      "learning_rate": 0.003,
+      "loss": 4.4776,
+      "step": 1001
+    },
+    {
+      "epoch": 0.01002,
+      "grad_norm": 0.8123744130134583,
+      "learning_rate": 0.003,
+      "loss": 4.4523,
+      "step": 1002
+    },
+    {
+      "epoch": 0.01003,
+      "grad_norm": 0.7343231439590454,
+      "learning_rate": 0.003,
+      "loss": 4.4688,
+      "step": 1003
+    },
+    {
+      "epoch": 0.01004,
+      "grad_norm": 0.7312922477722168,
+      "learning_rate": 0.003,
+      "loss": 4.4567,
+      "step": 1004
+    },
+    {
+      "epoch": 0.01005,
+      "grad_norm": 0.7205970287322998,
+      "learning_rate": 0.003,
+      "loss": 4.4254,
+      "step": 1005
+    },
+    {
+      "epoch": 0.01006,
+      "grad_norm": 0.8032294511795044,
+      "learning_rate": 0.003,
+      "loss": 4.4272,
+      "step": 1006
+    },
+    {
+      "epoch": 0.01007,
+      "grad_norm": 0.8357327580451965,
+      "learning_rate": 0.003,
+      "loss": 4.4775,
+      "step": 1007
+    },
+    {
+      "epoch": 0.01008,
+      "grad_norm": 0.8112847208976746,
+      "learning_rate": 0.003,
+      "loss": 4.4864,
+      "step": 1008
+    },
+    {
+      "epoch": 0.01009,
+      "grad_norm": 0.9153414964675903,
+      "learning_rate": 0.003,
+      "loss": 4.46,
+      "step": 1009
+    },
+    {
+      "epoch": 0.0101,
+      "grad_norm": 0.9501859545707703,
+      "learning_rate": 0.003,
+      "loss": 4.4762,
+      "step": 1010
+    },
+    {
+      "epoch": 0.01011,
+      "grad_norm": 0.9173474907875061,
+      "learning_rate": 0.003,
+      "loss": 4.4819,
+      "step": 1011
+    },
+    {
+      "epoch": 0.01012,
+      "grad_norm": 0.8518911600112915,
+      "learning_rate": 0.003,
+      "loss": 4.4635,
+      "step": 1012
+    },
+    {
+      "epoch": 0.01013,
+      "grad_norm": 0.8621218204498291,
+      "learning_rate": 0.003,
+      "loss": 4.4943,
+      "step": 1013
+    },
+    {
+      "epoch": 0.01014,
+      "grad_norm": 0.6589127779006958,
+      "learning_rate": 0.003,
+      "loss": 4.419,
+      "step": 1014
+    },
+    {
+      "epoch": 0.01015,
+      "grad_norm": 0.7594727277755737,
+      "learning_rate": 0.003,
+      "loss": 4.4653,
+      "step": 1015
+    },
+    {
+      "epoch": 0.01016,
+      "grad_norm": 0.800542414188385,
+      "learning_rate": 0.003,
+      "loss": 4.4749,
+      "step": 1016
+    },
+    {
+      "epoch": 0.01017,
+      "grad_norm": 0.7398179769515991,
+      "learning_rate": 0.003,
+      "loss": 4.4124,
+      "step": 1017
+    },
+    {
+      "epoch": 0.01018,
+      "grad_norm": 0.6518093347549438,
+      "learning_rate": 0.003,
+      "loss": 4.4645,
+      "step": 1018
+    },
+    {
+      "epoch": 0.01019,
+      "grad_norm": 0.6830618381500244,
+      "learning_rate": 0.003,
+      "loss": 4.4391,
+      "step": 1019
+    },
+    {
+      "epoch": 0.0102,
+      "grad_norm": 0.8045121431350708,
+      "learning_rate": 0.003,
+      "loss": 4.4426,
+      "step": 1020
+    },
+    {
+      "epoch": 0.01021,
+      "grad_norm": 0.6937596201896667,
+      "learning_rate": 0.003,
+      "loss": 4.4245,
+      "step": 1021
+    },
+    {
+      "epoch": 0.01022,
+      "grad_norm": 0.5872735977172852,
+      "learning_rate": 0.003,
+      "loss": 4.4273,
+      "step": 1022
+    },
+    {
+      "epoch": 0.01023,
+      "grad_norm": 0.63226717710495,
+      "learning_rate": 0.003,
+      "loss": 4.4308,
+      "step": 1023
+    },
+    {
+      "epoch": 0.01024,
+      "grad_norm": 0.7018114924430847,
+      "learning_rate": 0.003,
+      "loss": 4.3934,
+      "step": 1024
+    },
+    {
+      "epoch": 0.01025,
+      "grad_norm": 0.5812709927558899,
+      "learning_rate": 0.003,
+      "loss": 4.4216,
+      "step": 1025
+    },
+    {
+      "epoch": 0.01026,
+      "grad_norm": 0.4411616921424866,
+      "learning_rate": 0.003,
+      "loss": 4.4039,
+      "step": 1026
+    },
+    {
+      "epoch": 0.01027,
+      "grad_norm": 0.6123212575912476,
+      "learning_rate": 0.003,
+      "loss": 4.4333,
+      "step": 1027
+    },
+    {
+      "epoch": 0.01028,
+      "grad_norm": 0.6239144802093506,
+      "learning_rate": 0.003,
+      "loss": 4.4176,
+      "step": 1028
+    },
+    {
+      "epoch": 0.01029,
+      "grad_norm": 0.46944838762283325,
+      "learning_rate": 0.003,
+      "loss": 4.386,
+      "step": 1029
+    },
+    {
+      "epoch": 0.0103,
+      "grad_norm": 0.5337734818458557,
+      "learning_rate": 0.003,
+      "loss": 4.4157,
+      "step": 1030
+    },
+    {
+      "epoch": 0.01031,
+      "grad_norm": 0.6015392541885376,
+      "learning_rate": 0.003,
+      "loss": 4.4386,
+      "step": 1031
+    },
+    {
+      "epoch": 0.01032,
+      "grad_norm": 0.49139612913131714,
+      "learning_rate": 0.003,
+      "loss": 4.3989,
+      "step": 1032
+    },
+    {
+      "epoch": 0.01033,
+      "grad_norm": 0.6846477389335632,
+      "learning_rate": 0.003,
+      "loss": 4.444,
+      "step": 1033
+    },
+    {
+      "epoch": 0.01034,
+      "grad_norm": 0.619234025478363,
+      "learning_rate": 0.003,
+      "loss": 4.4088,
+      "step": 1034
+    },
+    {
+      "epoch": 0.01035,
+      "grad_norm": 0.567406952381134,
+      "learning_rate": 0.003,
+      "loss": 4.4003,
+      "step": 1035
+    },
+    {
+      "epoch": 0.01036,
+      "grad_norm": 0.7047313451766968,
+      "learning_rate": 0.003,
+      "loss": 4.4001,
+      "step": 1036
+    },
+    {
+      "epoch": 0.01037,
+      "grad_norm": 0.6020769476890564,
+      "learning_rate": 0.003,
+      "loss": 4.4235,
+      "step": 1037
+    },
+    {
+      "epoch": 0.01038,
+      "grad_norm": 0.498977929353714,
+      "learning_rate": 0.003,
+      "loss": 4.4148,
+      "step": 1038
+    },
+    {
+      "epoch": 0.01039,
+      "grad_norm": 0.5447039008140564,
+      "learning_rate": 0.003,
+      "loss": 4.4085,
+      "step": 1039
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.5632253289222717,
+      "learning_rate": 0.003,
+      "loss": 4.4204,
+      "step": 1040
+    },
+    {
+      "epoch": 0.01041,
+      "grad_norm": 0.5558527708053589,
+      "learning_rate": 0.003,
+      "loss": 4.4009,
+      "step": 1041
+    },
+    {
+      "epoch": 0.01042,
+      "grad_norm": 0.5142215490341187,
+      "learning_rate": 0.003,
+      "loss": 4.4233,
+      "step": 1042
+    },
+    {
+      "epoch": 0.01043,
+      "grad_norm": 0.4451257884502411,
+      "learning_rate": 0.003,
+      "loss": 4.3651,
+      "step": 1043
+    },
+    {
+      "epoch": 0.01044,
+      "grad_norm": 0.41156554222106934,
+      "learning_rate": 0.003,
+      "loss": 4.4008,
+      "step": 1044
+    },
+    {
+      "epoch": 0.01045,
+      "grad_norm": 0.42055779695510864,
+      "learning_rate": 0.003,
+      "loss": 4.4203,
+      "step": 1045
+    },
+    {
+      "epoch": 0.01046,
+      "grad_norm": 0.5353807806968689,
+      "learning_rate": 0.003,
+      "loss": 4.3928,
+      "step": 1046
+    },
+    {
+      "epoch": 0.01047,
+      "grad_norm": 0.6298825144767761,
+      "learning_rate": 0.003,
+      "loss": 4.4015,
+      "step": 1047
+    },
+    {
+      "epoch": 0.01048,
+      "grad_norm": 0.7191411256790161,
+      "learning_rate": 0.003,
+      "loss": 4.3988,
+      "step": 1048
+    },
+    {
+      "epoch": 0.01049,
+      "grad_norm": 0.7613799571990967,
+      "learning_rate": 0.003,
+      "loss": 4.3808,
+      "step": 1049
+    },
+    {
+      "epoch": 0.0105,
+      "grad_norm": 0.7375987768173218,
+      "learning_rate": 0.003,
+      "loss": 4.4222,
+      "step": 1050
+    },
+    {
+      "epoch": 0.01051,
+      "grad_norm": 0.6425654888153076,
+      "learning_rate": 0.003,
+      "loss": 4.3959,
+      "step": 1051
+    },
+    {
+      "epoch": 0.01052,
+      "grad_norm": 0.7071349620819092,
+      "learning_rate": 0.003,
+      "loss": 4.4073,
+      "step": 1052
+    },
+    {
+      "epoch": 0.01053,
+      "grad_norm": 0.838892936706543,
+      "learning_rate": 0.003,
+      "loss": 4.4246,
+      "step": 1053
+    },
+    {
+      "epoch": 0.01054,
+      "grad_norm": 0.8081632852554321,
+      "learning_rate": 0.003,
+      "loss": 4.4168,
+      "step": 1054
+    },
+    {
+      "epoch": 0.01055,
+      "grad_norm": 1.0391567945480347,
+      "learning_rate": 0.003,
+      "loss": 4.4056,
+      "step": 1055
+    },
+    {
+      "epoch": 0.01056,
+      "grad_norm": 0.9966610074043274,
+      "learning_rate": 0.003,
+      "loss": 4.4166,
+      "step": 1056
+    },
+    {
+      "epoch": 0.01057,
+      "grad_norm": 0.8638164401054382,
+      "learning_rate": 0.003,
+      "loss": 4.4191,
+      "step": 1057
+    },
+    {
+      "epoch": 0.01058,
+      "grad_norm": 0.8240249156951904,
+      "learning_rate": 0.003,
+      "loss": 4.4115,
+      "step": 1058
+    },
+    {
+      "epoch": 0.01059,
+      "grad_norm": 0.8680912256240845,
+      "learning_rate": 0.003,
+      "loss": 4.4127,
+      "step": 1059
+    },
+    {
+      "epoch": 0.0106,
+      "grad_norm": 0.8156179785728455,
+      "learning_rate": 0.003,
+      "loss": 4.4084,
+      "step": 1060
+    },
+    {
+      "epoch": 0.01061,
+      "grad_norm": 0.7857629656791687,
+      "learning_rate": 0.003,
+      "loss": 4.4047,
+      "step": 1061
+    },
+    {
+      "epoch": 0.01062,
+      "grad_norm": 0.8633149862289429,
+      "learning_rate": 0.003,
+      "loss": 4.398,
+      "step": 1062
+    },
+    {
+      "epoch": 0.01063,
+      "grad_norm": 0.9093345403671265,
+      "learning_rate": 0.003,
+      "loss": 4.4015,
+      "step": 1063
+    },
+    {
+      "epoch": 0.01064,
+      "grad_norm": 0.8469107151031494,
+      "learning_rate": 0.003,
+      "loss": 4.4193,
+      "step": 1064
+    },
+    {
+      "epoch": 0.01065,
+      "grad_norm": 0.8231899738311768,
+      "learning_rate": 0.003,
+      "loss": 4.4189,
+      "step": 1065
+    },
+    {
+      "epoch": 0.01066,
+      "grad_norm": 0.8297374248504639,
+      "learning_rate": 0.003,
+      "loss": 4.423,
+      "step": 1066
+    },
+    {
+      "epoch": 0.01067,
+      "grad_norm": 0.678811252117157,
+      "learning_rate": 0.003,
+      "loss": 4.4286,
+      "step": 1067
+    },
+    {
+      "epoch": 0.01068,
+      "grad_norm": 0.7318910360336304,
+      "learning_rate": 0.003,
+      "loss": 4.4099,
+      "step": 1068
+    },
+    {
+      "epoch": 0.01069,
+      "grad_norm": 0.7908743023872375,
+      "learning_rate": 0.003,
+      "loss": 4.4229,
+      "step": 1069
+    },
+    {
+      "epoch": 0.0107,
+      "grad_norm": 0.8041807413101196,
+      "learning_rate": 0.003,
+      "loss": 4.4771,
+      "step": 1070
+    },
+    {
+      "epoch": 0.01071,
+      "grad_norm": 0.6532490849494934,
+      "learning_rate": 0.003,
+      "loss": 4.3932,
+      "step": 1071
+    },
+    {
+      "epoch": 0.01072,
+      "grad_norm": 0.6556645035743713,
+      "learning_rate": 0.003,
+      "loss": 4.426,
+      "step": 1072
+    },
+    {
+      "epoch": 0.01073,
+      "grad_norm": 0.8516754508018494,
+      "learning_rate": 0.003,
+      "loss": 4.4268,
+      "step": 1073
+    },
+    {
+      "epoch": 0.01074,
+      "grad_norm": 1.0397266149520874,
+      "learning_rate": 0.003,
+      "loss": 4.4077,
+      "step": 1074
+    },
+    {
+      "epoch": 0.01075,
+      "grad_norm": 0.8689488172531128,
+      "learning_rate": 0.003,
+      "loss": 4.4416,
+      "step": 1075
+    },
+    {
+      "epoch": 0.01076,
+      "grad_norm": 0.9471074342727661,
+      "learning_rate": 0.003,
+      "loss": 4.4417,
+      "step": 1076
+    },
+    {
+      "epoch": 0.01077,
+      "grad_norm": 0.9231577515602112,
+      "learning_rate": 0.003,
+      "loss": 4.4357,
+      "step": 1077
+    },
+    {
+      "epoch": 0.01078,
+      "grad_norm": 0.8509425520896912,
+      "learning_rate": 0.003,
+      "loss": 4.3906,
+      "step": 1078
+    },
+    {
+      "epoch": 0.01079,
+      "grad_norm": 0.7320951819419861,
+      "learning_rate": 0.003,
+      "loss": 4.4158,
+      "step": 1079
+    },
+    {
+      "epoch": 0.0108,
+      "grad_norm": 0.6985662579536438,
+      "learning_rate": 0.003,
+      "loss": 4.3942,
+      "step": 1080
+    },
+    {
+      "epoch": 0.01081,
+      "grad_norm": 0.5917143225669861,
+      "learning_rate": 0.003,
+      "loss": 4.396,
+      "step": 1081
+    },
+    {
+      "epoch": 0.01082,
+      "grad_norm": 0.567608654499054,
+      "learning_rate": 0.003,
+      "loss": 4.4109,
+      "step": 1082
+    },
+    {
+      "epoch": 0.01083,
+      "grad_norm": 0.5785332322120667,
+      "learning_rate": 0.003,
+      "loss": 4.3961,
+      "step": 1083
+    },
+    {
+      "epoch": 0.01084,
+      "grad_norm": 0.5440964102745056,
+      "learning_rate": 0.003,
+      "loss": 4.3992,
+      "step": 1084
+    },
+    {
+      "epoch": 0.01085,
+      "grad_norm": 0.49031350016593933,
+      "learning_rate": 0.003,
+      "loss": 4.3852,
+      "step": 1085
+    },
+    {
+      "epoch": 0.01086,
+      "grad_norm": 0.5103932619094849,
+      "learning_rate": 0.003,
+      "loss": 4.3604,
+      "step": 1086
+    },
+    {
+      "epoch": 0.01087,
+      "grad_norm": 0.5229532718658447,
+      "learning_rate": 0.003,
+      "loss": 4.3777,
+      "step": 1087
+    },
+    {
+      "epoch": 0.01088,
+      "grad_norm": 0.6494709253311157,
+      "learning_rate": 0.003,
+      "loss": 4.4131,
+      "step": 1088
+    },
+    {
+      "epoch": 0.01089,
+      "grad_norm": 0.6600701808929443,
+      "learning_rate": 0.003,
+      "loss": 4.3871,
+      "step": 1089
+    },
+    {
+      "epoch": 0.0109,
+      "grad_norm": 0.4794403612613678,
+      "learning_rate": 0.003,
+      "loss": 4.4042,
+      "step": 1090
+    },
+    {
+      "epoch": 0.01091,
+      "grad_norm": 0.4167538285255432,
+      "learning_rate": 0.003,
+      "loss": 4.369,
+      "step": 1091
+    },
+    {
+      "epoch": 0.01092,
+      "grad_norm": 0.4487409293651581,
+      "learning_rate": 0.003,
+      "loss": 4.3678,
+      "step": 1092
+    },
+    {
+      "epoch": 0.01093,
+      "grad_norm": 0.39760273694992065,
+      "learning_rate": 0.003,
+      "loss": 4.3829,
+      "step": 1093
+    },
+    {
+      "epoch": 0.01094,
+      "grad_norm": 0.3819560706615448,
+      "learning_rate": 0.003,
+      "loss": 4.3691,
+      "step": 1094
+    },
+    {
+      "epoch": 0.01095,
+      "grad_norm": 0.4009959101676941,
+      "learning_rate": 0.003,
+      "loss": 4.3966,
+      "step": 1095
+    },
+    {
+      "epoch": 0.01096,
+      "grad_norm": 0.4385271668434143,
+      "learning_rate": 0.003,
+      "loss": 4.3677,
+      "step": 1096
+    },
+    {
+      "epoch": 0.01097,
+      "grad_norm": 0.5154523253440857,
+      "learning_rate": 0.003,
+      "loss": 4.3585,
+      "step": 1097
+    },
+    {
+      "epoch": 0.01098,
+      "grad_norm": 0.6741756200790405,
+      "learning_rate": 0.003,
+      "loss": 4.3816,
+      "step": 1098
+    },
+    {
+      "epoch": 0.01099,
+      "grad_norm": 0.782984733581543,
+      "learning_rate": 0.003,
+      "loss": 4.3628,
+      "step": 1099
+    },
+    {
+      "epoch": 0.011,
+      "grad_norm": 0.7672849297523499,
+      "learning_rate": 0.003,
+      "loss": 4.3633,
+      "step": 1100
+    },
+    {
+      "epoch": 0.01101,
+      "grad_norm": 0.7932085394859314,
+      "learning_rate": 0.003,
+      "loss": 4.377,
+      "step": 1101
+    },
+    {
+      "epoch": 0.01102,
+      "grad_norm": 0.8302851915359497,
+      "learning_rate": 0.003,
+      "loss": 4.3891,
+      "step": 1102
+    },
+    {
+      "epoch": 0.01103,
+      "grad_norm": 0.747067391872406,
+      "learning_rate": 0.003,
+      "loss": 4.3737,
+      "step": 1103
+    },
+    {
+      "epoch": 0.01104,
+      "grad_norm": 0.789983332157135,
+      "learning_rate": 0.003,
+      "loss": 4.3715,
+      "step": 1104
+    },
+    {
+      "epoch": 0.01105,
+      "grad_norm": 1.0034455060958862,
+      "learning_rate": 0.003,
+      "loss": 4.4029,
+      "step": 1105
+    },
+    {
+      "epoch": 0.01106,
+      "grad_norm": 1.1279704570770264,
+      "learning_rate": 0.003,
+      "loss": 4.3962,
+      "step": 1106
+    },
+    {
+      "epoch": 0.01107,
+      "grad_norm": 0.916431725025177,
+      "learning_rate": 0.003,
+      "loss": 4.3914,
+      "step": 1107
+    },
+    {
+      "epoch": 0.01108,
+      "grad_norm": 0.9773505330085754,
+      "learning_rate": 0.003,
+      "loss": 4.3731,
+      "step": 1108
+    },
+    {
+      "epoch": 0.01109,
+      "grad_norm": 0.8878449201583862,
+      "learning_rate": 0.003,
+      "loss": 4.3883,
+      "step": 1109
+    },
+    {
+      "epoch": 0.0111,
+      "grad_norm": 0.7733376622200012,
+      "learning_rate": 0.003,
+      "loss": 4.4082,
+      "step": 1110
+    },
+    {
+      "epoch": 0.01111,
+      "grad_norm": 0.7688936591148376,
+      "learning_rate": 0.003,
+      "loss": 4.3824,
+      "step": 1111
+    },
+    {
+      "epoch": 0.01112,
+      "grad_norm": 0.8010363578796387,
+      "learning_rate": 0.003,
+      "loss": 4.4064,
+      "step": 1112
+    },
+    {
+      "epoch": 0.01113,
+      "grad_norm": 0.7642234563827515,
+      "learning_rate": 0.003,
+      "loss": 4.3689,
+      "step": 1113
+    },
+    {
+      "epoch": 0.01114,
+      "grad_norm": 0.6956503987312317,
+      "learning_rate": 0.003,
+      "loss": 4.386,
+      "step": 1114
+    },
+    {
+      "epoch": 0.01115,
+      "grad_norm": 0.5809863805770874,
+      "learning_rate": 0.003,
+      "loss": 4.3985,
+      "step": 1115
+    },
+    {
+      "epoch": 0.01116,
+      "grad_norm": 0.5753639936447144,
+      "learning_rate": 0.003,
+      "loss": 4.3898,
+      "step": 1116
+    },
+    {
+      "epoch": 0.01117,
+      "grad_norm": 0.5193636417388916,
+      "learning_rate": 0.003,
+      "loss": 4.3734,
+      "step": 1117
+    },
+    {
+      "epoch": 0.01118,
+      "grad_norm": 0.5058582425117493,
+      "learning_rate": 0.003,
+      "loss": 4.3742,
+      "step": 1118
+    },
+    {
+      "epoch": 0.01119,
+      "grad_norm": 0.4714462161064148,
+      "learning_rate": 0.003,
+      "loss": 4.3802,
+      "step": 1119
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.5877813100814819,
+      "learning_rate": 0.003,
+      "loss": 4.3643,
+      "step": 1120
+    },
+    {
+      "epoch": 0.01121,
+      "grad_norm": 0.6811574697494507,
+      "learning_rate": 0.003,
+      "loss": 4.3858,
+      "step": 1121
+    },
+    {
+      "epoch": 0.01122,
+      "grad_norm": 0.7246198058128357,
+      "learning_rate": 0.003,
+      "loss": 4.3577,
+      "step": 1122
+    },
+    {
+      "epoch": 0.01123,
+      "grad_norm": 0.6985503435134888,
+      "learning_rate": 0.003,
+      "loss": 4.3459,
+      "step": 1123
+    },
+    {
+      "epoch": 0.01124,
+      "grad_norm": 0.6198428273200989,
+      "learning_rate": 0.003,
+      "loss": 4.3921,
+      "step": 1124
+    },
+    {
+      "epoch": 0.01125,
+      "grad_norm": 0.6425508856773376,
+      "learning_rate": 0.003,
+      "loss": 4.3601,
+      "step": 1125
+    },
+    {
+      "epoch": 0.01126,
+      "grad_norm": 0.7215856313705444,
+      "learning_rate": 0.003,
+      "loss": 4.3688,
+      "step": 1126
+    },
+    {
+      "epoch": 0.01127,
+      "grad_norm": 0.614211916923523,
+      "learning_rate": 0.003,
+      "loss": 4.3955,
+      "step": 1127
+    },
+    {
+      "epoch": 0.01128,
+      "grad_norm": 0.5611268877983093,
+      "learning_rate": 0.003,
+      "loss": 4.3725,
+      "step": 1128
+    },
+    {
+      "epoch": 0.01129,
+      "grad_norm": 0.5580106973648071,
+      "learning_rate": 0.003,
+      "loss": 4.3639,
+      "step": 1129
+    },
+    {
+      "epoch": 0.0113,
+      "grad_norm": 0.5985743403434753,
+      "learning_rate": 0.003,
+      "loss": 4.3456,
+      "step": 1130
+    },
+    {
+      "epoch": 0.01131,
+      "grad_norm": 0.6220662593841553,
+      "learning_rate": 0.003,
+      "loss": 4.3584,
+      "step": 1131
+    },
+    {
+      "epoch": 0.01132,
+      "grad_norm": 0.6651334166526794,
+      "learning_rate": 0.003,
+      "loss": 4.3835,
+      "step": 1132
+    },
+    {
+      "epoch": 0.01133,
+      "grad_norm": 0.6686156392097473,
+      "learning_rate": 0.003,
+      "loss": 4.3408,
+      "step": 1133
+    },
+    {
+      "epoch": 0.01134,
+      "grad_norm": 0.6207416653633118,
+      "learning_rate": 0.003,
+      "loss": 4.3549,
+      "step": 1134
+    },
+    {
+      "epoch": 0.01135,
+      "grad_norm": 0.4418866038322449,
+      "learning_rate": 0.003,
+      "loss": 4.3765,
+      "step": 1135
+    },
+    {
+      "epoch": 0.01136,
+      "grad_norm": 0.496660441160202,
+      "learning_rate": 0.003,
+      "loss": 4.3534,
+      "step": 1136
+    },
+    {
+      "epoch": 0.01137,
+      "grad_norm": 0.516294002532959,
+      "learning_rate": 0.003,
+      "loss": 4.3812,
+      "step": 1137
+    },
+    {
+      "epoch": 0.01138,
+      "grad_norm": 0.7070192098617554,
+      "learning_rate": 0.003,
+      "loss": 4.3409,
+      "step": 1138
+    },
+    {
+      "epoch": 0.01139,
+      "grad_norm": 0.9681088924407959,
+      "learning_rate": 0.003,
+      "loss": 4.3765,
+      "step": 1139
+    },
+    {
+      "epoch": 0.0114,
+      "grad_norm": 1.0279847383499146,
+      "learning_rate": 0.003,
+      "loss": 4.361,
+      "step": 1140
+    },
+    {
+      "epoch": 0.01141,
+      "grad_norm": 0.8630406260490417,
+      "learning_rate": 0.003,
+      "loss": 4.3777,
+      "step": 1141
+    },
+    {
+      "epoch": 0.01142,
+      "grad_norm": 0.8118616938591003,
+      "learning_rate": 0.003,
+      "loss": 4.3623,
+      "step": 1142
+    },
+    {
+      "epoch": 0.01143,
+      "grad_norm": 0.6431841254234314,
+      "learning_rate": 0.003,
+      "loss": 4.3605,
+      "step": 1143
+    },
+    {
+      "epoch": 0.01144,
+      "grad_norm": 0.8104698061943054,
+      "learning_rate": 0.003,
+      "loss": 4.3374,
+      "step": 1144
+    },
+    {
+      "epoch": 0.01145,
+      "grad_norm": 0.8287402987480164,
+      "learning_rate": 0.003,
+      "loss": 4.3806,
+      "step": 1145
+    },
+    {
+      "epoch": 0.01146,
+      "grad_norm": 0.6933982968330383,
+      "learning_rate": 0.003,
+      "loss": 4.358,
+      "step": 1146
+    },
+    {
+      "epoch": 0.01147,
+      "grad_norm": 0.7871519327163696,
+      "learning_rate": 0.003,
+      "loss": 4.3521,
+      "step": 1147
+    },
+    {
+      "epoch": 0.01148,
+      "grad_norm": 0.879456639289856,
+      "learning_rate": 0.003,
+      "loss": 4.3655,
+      "step": 1148
+    },
+    {
+      "epoch": 0.01149,
+      "grad_norm": 0.973317563533783,
+      "learning_rate": 0.003,
+      "loss": 4.3825,
+      "step": 1149
+    },
+    {
+      "epoch": 0.0115,
+      "grad_norm": 0.9118475914001465,
+      "learning_rate": 0.003,
+      "loss": 4.3787,
+      "step": 1150
+    },
+    {
+      "epoch": 0.01151,
+      "grad_norm": 0.8108662962913513,
+      "learning_rate": 0.003,
+      "loss": 4.3725,
+      "step": 1151
+    },
+    {
+      "epoch": 0.01152,
+      "grad_norm": 0.6577885150909424,
+      "learning_rate": 0.003,
+      "loss": 4.3738,
+      "step": 1152
+    },
+    {
+      "epoch": 0.01153,
+      "grad_norm": 0.7330801486968994,
+      "learning_rate": 0.003,
+      "loss": 4.3918,
+      "step": 1153
+    },
+    {
+      "epoch": 0.01154,
+      "grad_norm": 0.6542471051216125,
+      "learning_rate": 0.003,
+      "loss": 4.3669,
+      "step": 1154
+    },
+    {
+      "epoch": 0.01155,
+      "grad_norm": 0.6441750526428223,
+      "learning_rate": 0.003,
+      "loss": 4.3518,
+      "step": 1155
+    },
+    {
+      "epoch": 0.01156,
+      "grad_norm": 0.6419921517372131,
+      "learning_rate": 0.003,
+      "loss": 4.3556,
+      "step": 1156
+    },
+    {
+      "epoch": 0.01157,
+      "grad_norm": 0.6812348365783691,
+      "learning_rate": 0.003,
+      "loss": 4.3858,
+      "step": 1157
+    },
+    {
+      "epoch": 0.01158,
+      "grad_norm": 0.6114307045936584,
+      "learning_rate": 0.003,
+      "loss": 4.3497,
+      "step": 1158
+    },
+    {
+      "epoch": 0.01159,
+      "grad_norm": 0.5222328901290894,
+      "learning_rate": 0.003,
+      "loss": 4.3411,
+      "step": 1159
+    },
+    {
+      "epoch": 0.0116,
+      "grad_norm": 0.5184376239776611,
+      "learning_rate": 0.003,
+      "loss": 4.3556,
+      "step": 1160
+    },
+    {
+      "epoch": 0.01161,
+      "grad_norm": 0.48943445086479187,
+      "learning_rate": 0.003,
+      "loss": 4.3274,
+      "step": 1161
+    },
+    {
+      "epoch": 0.01162,
+      "grad_norm": 0.44027477502822876,
+      "learning_rate": 0.003,
+      "loss": 4.3376,
+      "step": 1162
+    },
+    {
+      "epoch": 0.01163,
+      "grad_norm": 0.40750086307525635,
+      "learning_rate": 0.003,
+      "loss": 4.3549,
+      "step": 1163
+    },
+    {
+      "epoch": 0.01164,
+      "grad_norm": 0.3570636808872223,
+      "learning_rate": 0.003,
+      "loss": 4.3421,
+      "step": 1164
+    },
+    {
+      "epoch": 0.01165,
+      "grad_norm": 0.37404096126556396,
+      "learning_rate": 0.003,
+      "loss": 4.3343,
+      "step": 1165
+    },
+    {
+      "epoch": 0.01166,
+      "grad_norm": 0.3607056736946106,
+      "learning_rate": 0.003,
+      "loss": 4.3358,
+      "step": 1166
+    },
+    {
+      "epoch": 0.01167,
+      "grad_norm": 0.4395747482776642,
+      "learning_rate": 0.003,
+      "loss": 4.3038,
+      "step": 1167
+    },
+    {
+      "epoch": 0.01168,
+      "grad_norm": 0.5243505239486694,
+      "learning_rate": 0.003,
+      "loss": 4.3153,
+      "step": 1168
+    },
+    {
+      "epoch": 0.01169,
+      "grad_norm": 0.8133231997489929,
+      "learning_rate": 0.003,
+      "loss": 4.3628,
+      "step": 1169
+    },
+    {
+      "epoch": 0.0117,
+      "grad_norm": 0.8480315208435059,
+      "learning_rate": 0.003,
+      "loss": 4.3697,
+      "step": 1170
+    },
+    {
+      "epoch": 0.01171,
+      "grad_norm": 0.6388808488845825,
+      "learning_rate": 0.003,
+      "loss": 4.3357,
+      "step": 1171
+    },
+    {
+      "epoch": 0.01172,
+      "grad_norm": 0.6913546919822693,
+      "learning_rate": 0.003,
+      "loss": 4.3289,
+      "step": 1172
+    },
+    {
+      "epoch": 0.01173,
+      "grad_norm": 0.656560480594635,
+      "learning_rate": 0.003,
+      "loss": 4.3667,
+      "step": 1173
+    },
+    {
+      "epoch": 0.01174,
+      "grad_norm": 0.5804395079612732,
+      "learning_rate": 0.003,
+      "loss": 4.3352,
+      "step": 1174
+    },
+    {
+      "epoch": 0.01175,
+      "grad_norm": 0.7265805602073669,
+      "learning_rate": 0.003,
+      "loss": 4.3269,
+      "step": 1175
+    },
+    {
+      "epoch": 0.01176,
+      "grad_norm": 0.753591001033783,
+      "learning_rate": 0.003,
+      "loss": 4.3628,
+      "step": 1176
+    },
+    {
+      "epoch": 0.01177,
+      "grad_norm": 0.7016688585281372,
+      "learning_rate": 0.003,
+      "loss": 4.3288,
+      "step": 1177
+    },
+    {
+      "epoch": 0.01178,
+      "grad_norm": 0.7714430689811707,
+      "learning_rate": 0.003,
+      "loss": 4.3516,
+      "step": 1178
+    },
+    {
+      "epoch": 0.01179,
+      "grad_norm": 0.7303088903427124,
+      "learning_rate": 0.003,
+      "loss": 4.3309,
+      "step": 1179
+    },
+    {
+      "epoch": 0.0118,
+      "grad_norm": 0.6078006029129028,
+      "learning_rate": 0.003,
+      "loss": 4.3881,
+      "step": 1180
+    },
+    {
+      "epoch": 0.01181,
+      "grad_norm": 0.6842349767684937,
+      "learning_rate": 0.003,
+      "loss": 4.3776,
+      "step": 1181
+    },
+    {
+      "epoch": 0.01182,
+      "grad_norm": 0.6943366527557373,
+      "learning_rate": 0.003,
+      "loss": 4.3406,
+      "step": 1182
+    },
+    {
+      "epoch": 0.01183,
+      "grad_norm": 0.8091237545013428,
+      "learning_rate": 0.003,
+      "loss": 4.3348,
+      "step": 1183
+    },
+    {
+      "epoch": 0.01184,
+      "grad_norm": 1.061568021774292,
+      "learning_rate": 0.003,
+      "loss": 4.3914,
+      "step": 1184
+    },
+    {
+      "epoch": 0.01185,
+      "grad_norm": 0.8616968989372253,
+      "learning_rate": 0.003,
+      "loss": 4.3565,
+      "step": 1185
+    },
+    {
+      "epoch": 0.01186,
+      "grad_norm": 0.9476875066757202,
+      "learning_rate": 0.003,
+      "loss": 4.3829,
+      "step": 1186
+    },
+    {
+      "epoch": 0.01187,
+      "grad_norm": 0.8803635239601135,
+      "learning_rate": 0.003,
+      "loss": 4.3482,
+      "step": 1187
+    },
+    {
+      "epoch": 0.01188,
+      "grad_norm": 0.8929046988487244,
+      "learning_rate": 0.003,
+      "loss": 4.3663,
+      "step": 1188
+    },
+    {
+      "epoch": 0.01189,
+      "grad_norm": 1.0426416397094727,
+      "learning_rate": 0.003,
+      "loss": 4.3669,
+      "step": 1189
+    },
+    {
+      "epoch": 0.0119,
+      "grad_norm": 0.8442360758781433,
+      "learning_rate": 0.003,
+      "loss": 4.3513,
+      "step": 1190
+    },
+    {
+      "epoch": 0.01191,
+      "grad_norm": 0.6745409965515137,
+      "learning_rate": 0.003,
+      "loss": 4.3234,
+      "step": 1191
+    },
+    {
+      "epoch": 0.01192,
+      "grad_norm": 0.6114994883537292,
+      "learning_rate": 0.003,
+      "loss": 4.3525,
+      "step": 1192
+    },
+    {
+      "epoch": 0.01193,
+      "grad_norm": 0.545036792755127,
+      "learning_rate": 0.003,
+      "loss": 4.3526,
+      "step": 1193
+    },
+    {
+      "epoch": 0.01194,
+      "grad_norm": 0.556999921798706,
+      "learning_rate": 0.003,
+      "loss": 4.3504,
+      "step": 1194
+    },
+    {
+      "epoch": 0.01195,
+      "grad_norm": 0.5564072728157043,
+      "learning_rate": 0.003,
+      "loss": 4.3299,
+      "step": 1195
+    },
+    {
+      "epoch": 0.01196,
+      "grad_norm": 0.5982546210289001,
+      "learning_rate": 0.003,
+      "loss": 4.337,
+      "step": 1196
+    },
+    {
+      "epoch": 0.01197,
+      "grad_norm": 0.6563606858253479,
+      "learning_rate": 0.003,
+      "loss": 4.3029,
+      "step": 1197
+    },
+    {
+      "epoch": 0.01198,
+      "grad_norm": 0.7455101609230042,
+      "learning_rate": 0.003,
+      "loss": 4.379,
+      "step": 1198
+    },
+    {
+      "epoch": 0.01199,
+      "grad_norm": 0.7808868885040283,
+      "learning_rate": 0.003,
+      "loss": 4.3568,
+      "step": 1199
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.6572416424751282,
+      "learning_rate": 0.003,
+      "loss": 4.3439,
+      "step": 1200
+    },
+    {
+      "epoch": 0.01201,
+      "grad_norm": 0.588565468788147,
+      "learning_rate": 0.003,
+      "loss": 4.3678,
+      "step": 1201
+    },
+    {
+      "epoch": 0.01202,
+      "grad_norm": 0.6117038726806641,
+      "learning_rate": 0.003,
+      "loss": 4.3358,
+      "step": 1202
+    },
+    {
+      "epoch": 0.01203,
+      "grad_norm": 0.5589160323143005,
+      "learning_rate": 0.003,
+      "loss": 4.343,
+      "step": 1203
+    },
+    {
+      "epoch": 0.01204,
+      "grad_norm": 0.537390410900116,
+      "learning_rate": 0.003,
+      "loss": 4.3383,
+      "step": 1204
+    },
+    {
+      "epoch": 0.01205,
+      "grad_norm": 0.6048082709312439,
+      "learning_rate": 0.003,
+      "loss": 4.3289,
+      "step": 1205
+    },
+    {
+      "epoch": 0.01206,
+      "grad_norm": 0.5843051671981812,
+      "learning_rate": 0.003,
+      "loss": 4.3719,
+      "step": 1206
+    },
+    {
+      "epoch": 0.01207,
+      "grad_norm": 0.5032393932342529,
+      "learning_rate": 0.003,
+      "loss": 4.2929,
+      "step": 1207
+    },
+    {
+      "epoch": 0.01208,
+      "grad_norm": 0.4879417419433594,
+      "learning_rate": 0.003,
+      "loss": 4.351,
+      "step": 1208
+    },
+    {
+      "epoch": 0.01209,
+      "grad_norm": 0.5636802911758423,
+      "learning_rate": 0.003,
+      "loss": 4.3346,
+      "step": 1209
+    },
+    {
+      "epoch": 0.0121,
+      "grad_norm": 0.6080469489097595,
+      "learning_rate": 0.003,
+      "loss": 4.3373,
+      "step": 1210
+    },
+    {
+      "epoch": 0.01211,
+      "grad_norm": 0.5971558094024658,
+      "learning_rate": 0.003,
+      "loss": 4.3219,
+      "step": 1211
+    },
+    {
+      "epoch": 0.01212,
+      "grad_norm": 0.5976467132568359,
+      "learning_rate": 0.003,
+      "loss": 4.3467,
+      "step": 1212
+    },
+    {
+      "epoch": 0.01213,
+      "grad_norm": 0.5944254994392395,
+      "learning_rate": 0.003,
+      "loss": 4.3244,
+      "step": 1213
+    },
+    {
+      "epoch": 0.01214,
+      "grad_norm": 0.5843701958656311,
+      "learning_rate": 0.003,
+      "loss": 4.306,
+      "step": 1214
+    },
+    {
+      "epoch": 0.01215,
+      "grad_norm": 0.6481133103370667,
+      "learning_rate": 0.003,
+      "loss": 4.3176,
+      "step": 1215
+    },
+    {
+      "epoch": 0.01216,
+      "grad_norm": 0.819490909576416,
+      "learning_rate": 0.003,
+      "loss": 4.3212,
+      "step": 1216
+    },
+    {
+      "epoch": 0.01217,
+      "grad_norm": 0.8658471703529358,
+      "learning_rate": 0.003,
+      "loss": 4.3388,
+      "step": 1217
+    },
+    {
+      "epoch": 0.01218,
+      "grad_norm": 1.0888832807540894,
+      "learning_rate": 0.003,
+      "loss": 4.3466,
+      "step": 1218
+    },
+    {
+      "epoch": 0.01219,
+      "grad_norm": 1.1137385368347168,
+      "learning_rate": 0.003,
+      "loss": 4.359,
+      "step": 1219
+    },
+    {
+      "epoch": 0.0122,
+      "grad_norm": 0.8392791152000427,
+      "learning_rate": 0.003,
+      "loss": 4.341,
+      "step": 1220
+    },
+    {
+      "epoch": 0.01221,
+      "grad_norm": 0.842366635799408,
+      "learning_rate": 0.003,
+      "loss": 4.355,
+      "step": 1221
+    },
+    {
+      "epoch": 0.01222,
+      "grad_norm": 0.9087112545967102,
+      "learning_rate": 0.003,
+      "loss": 4.3425,
+      "step": 1222
+    },
+    {
+      "epoch": 0.01223,
+      "grad_norm": 0.8507757186889648,
+      "learning_rate": 0.003,
+      "loss": 4.3381,
+      "step": 1223
+    },
+    {
+      "epoch": 0.01224,
+      "grad_norm": 0.745599091053009,
+      "learning_rate": 0.003,
+      "loss": 4.3236,
+      "step": 1224
+    },
+    {
+      "epoch": 0.01225,
+      "grad_norm": 0.6612358093261719,
+      "learning_rate": 0.003,
+      "loss": 4.3414,
+      "step": 1225
+    },
+    {
+      "epoch": 0.01226,
+      "grad_norm": 0.7157174348831177,
+      "learning_rate": 0.003,
+      "loss": 4.3359,
+      "step": 1226
+    },
+    {
+      "epoch": 0.01227,
+      "grad_norm": 0.7675558924674988,
+      "learning_rate": 0.003,
+      "loss": 4.3648,
+      "step": 1227
+    },
+    {
+      "epoch": 0.01228,
+      "grad_norm": 0.7858232855796814,
+      "learning_rate": 0.003,
+      "loss": 4.3493,
+      "step": 1228
+    },
+    {
+      "epoch": 0.01229,
+      "grad_norm": 0.750853419303894,
+      "learning_rate": 0.003,
+      "loss": 4.3338,
+      "step": 1229
+    },
+    {
+      "epoch": 0.0123,
+      "grad_norm": 0.7338488101959229,
+      "learning_rate": 0.003,
+      "loss": 4.3678,
+      "step": 1230
+    },
+    {
+      "epoch": 0.01231,
+      "grad_norm": 0.7138859033584595,
+      "learning_rate": 0.003,
+      "loss": 4.3349,
+      "step": 1231
+    },
+    {
+      "epoch": 0.01232,
+      "grad_norm": 0.6423472762107849,
+      "learning_rate": 0.003,
+      "loss": 4.3101,
+      "step": 1232
+    },
+    {
+      "epoch": 0.01233,
+      "grad_norm": 0.6176342964172363,
+      "learning_rate": 0.003,
+      "loss": 4.3143,
+      "step": 1233
+    },
+    {
+      "epoch": 0.01234,
+      "grad_norm": 0.5909737348556519,
+      "learning_rate": 0.003,
+      "loss": 4.3319,
+      "step": 1234
+    },
+    {
+      "epoch": 0.01235,
+      "grad_norm": 0.5610330700874329,
+      "learning_rate": 0.003,
+      "loss": 4.3196,
+      "step": 1235
+    },
+    {
+      "epoch": 0.01236,
+      "grad_norm": 0.5141494870185852,
+      "learning_rate": 0.003,
+      "loss": 4.3229,
+      "step": 1236
+    },
+    {
+      "epoch": 0.01237,
+      "grad_norm": 0.5010937452316284,
+      "learning_rate": 0.003,
+      "loss": 4.3305,
+      "step": 1237
+    },
+    {
+      "epoch": 0.01238,
+      "grad_norm": 0.5260263085365295,
+      "learning_rate": 0.003,
+      "loss": 4.3262,
+      "step": 1238
+    },
+    {
+      "epoch": 0.01239,
+      "grad_norm": 0.6261786222457886,
+      "learning_rate": 0.003,
+      "loss": 4.3109,
+      "step": 1239
+    },
+    {
+      "epoch": 0.0124,
+      "grad_norm": 0.6283076405525208,
+      "learning_rate": 0.003,
+      "loss": 4.294,
+      "step": 1240
+    },
+    {
+      "epoch": 0.01241,
+      "grad_norm": 0.520620584487915,
+      "learning_rate": 0.003,
+      "loss": 4.3044,
+      "step": 1241
+    },
+    {
+      "epoch": 0.01242,
+      "grad_norm": 0.45525336265563965,
+      "learning_rate": 0.003,
+      "loss": 4.3077,
+      "step": 1242
+    },
+    {
+      "epoch": 0.01243,
+      "grad_norm": 0.5128642320632935,
+      "learning_rate": 0.003,
+      "loss": 4.2904,
+      "step": 1243
+    },
+    {
+      "epoch": 0.01244,
+      "grad_norm": 0.6018971800804138,
+      "learning_rate": 0.003,
+      "loss": 4.3061,
+      "step": 1244
+    },
+    {
+      "epoch": 0.01245,
+      "grad_norm": 0.8249465227127075,
+      "learning_rate": 0.003,
+      "loss": 4.3067,
+      "step": 1245
+    },
+    {
+      "epoch": 0.01246,
+      "grad_norm": 0.983553946018219,
+      "learning_rate": 0.003,
+      "loss": 4.3098,
+      "step": 1246
+    },
+    {
+      "epoch": 0.01247,
+      "grad_norm": 0.8006371259689331,
+      "learning_rate": 0.003,
+      "loss": 4.3508,
+      "step": 1247
+    },
+    {
+      "epoch": 0.01248,
+      "grad_norm": 0.7486335635185242,
+      "learning_rate": 0.003,
+      "loss": 4.3351,
+      "step": 1248
+    },
+    {
+      "epoch": 0.01249,
+      "grad_norm": 0.731898307800293,
+      "learning_rate": 0.003,
+      "loss": 4.3269,
+      "step": 1249
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.7205458283424377,
+      "learning_rate": 0.003,
+      "loss": 4.3015,
+      "step": 1250
+    },
+    {
+      "epoch": 0.01251,
+      "grad_norm": 0.9905809760093689,
+      "learning_rate": 0.003,
+      "loss": 4.3425,
+      "step": 1251
+    },
+    {
+      "epoch": 0.01252,
+      "grad_norm": 0.7830891013145447,
+      "learning_rate": 0.003,
+      "loss": 4.319,
+      "step": 1252
+    },
+    {
+      "epoch": 0.01253,
+      "grad_norm": 0.6952295303344727,
+      "learning_rate": 0.003,
+      "loss": 4.2965,
+      "step": 1253
+    },
+    {
+      "epoch": 0.01254,
+      "grad_norm": 0.7467588782310486,
+      "learning_rate": 0.003,
+      "loss": 4.3427,
+      "step": 1254
+    },
+    {
+      "epoch": 0.01255,
+      "grad_norm": 0.7031508684158325,
+      "learning_rate": 0.003,
+      "loss": 4.2927,
+      "step": 1255
+    },
+    {
+      "epoch": 0.01256,
+      "grad_norm": 0.6603983640670776,
+      "learning_rate": 0.003,
+      "loss": 4.3194,
+      "step": 1256
+    },
+    {
+      "epoch": 0.01257,
+      "grad_norm": 0.6309519410133362,
+      "learning_rate": 0.003,
+      "loss": 4.3008,
+      "step": 1257
+    },
+    {
+      "epoch": 0.01258,
+      "grad_norm": 0.6506064534187317,
+      "learning_rate": 0.003,
+      "loss": 4.3107,
+      "step": 1258
+    },
+    {
+      "epoch": 0.01259,
+      "grad_norm": 0.5962826013565063,
+      "learning_rate": 0.003,
+      "loss": 4.3105,
+      "step": 1259
+    },
+    {
+      "epoch": 0.0126,
+      "grad_norm": 0.5265083909034729,
+      "learning_rate": 0.003,
+      "loss": 4.3316,
+      "step": 1260
+    },
+    {
+      "epoch": 0.01261,
+      "grad_norm": 0.5279543995857239,
+      "learning_rate": 0.003,
+      "loss": 4.3275,
+      "step": 1261
+    },
+    {
+      "epoch": 0.01262,
+      "grad_norm": 0.5171265006065369,
+      "learning_rate": 0.003,
+      "loss": 4.3175,
+      "step": 1262
+    },
+    {
+      "epoch": 0.01263,
+      "grad_norm": 0.5120646953582764,
+      "learning_rate": 0.003,
+      "loss": 4.3111,
+      "step": 1263
+    },
+    {
+      "epoch": 0.01264,
+      "grad_norm": 0.5407173037528992,
+      "learning_rate": 0.003,
+      "loss": 4.3,
+      "step": 1264
+    },
+    {
+      "epoch": 0.01265,
+      "grad_norm": 0.5656223893165588,
+      "learning_rate": 0.003,
+      "loss": 4.3113,
+      "step": 1265
+    },
+    {
+      "epoch": 0.01266,
+      "grad_norm": 0.4160173237323761,
+      "learning_rate": 0.003,
+      "loss": 4.3033,
+      "step": 1266
+    },
+    {
+      "epoch": 0.01267,
+      "grad_norm": 0.4453507661819458,
+      "learning_rate": 0.003,
+      "loss": 4.3244,
+      "step": 1267
+    },
+    {
+      "epoch": 0.01268,
+      "grad_norm": 0.4457267224788666,
+      "learning_rate": 0.003,
+      "loss": 4.3093,
+      "step": 1268
+    },
+    {
+      "epoch": 0.01269,
+      "grad_norm": 0.38706403970718384,
+      "learning_rate": 0.003,
+      "loss": 4.2734,
+      "step": 1269
+    },
+    {
+      "epoch": 0.0127,
+      "grad_norm": 0.38581886887550354,
+      "learning_rate": 0.003,
+      "loss": 4.3082,
+      "step": 1270
+    },
+    {
+      "epoch": 0.01271,
+      "grad_norm": 0.40513476729393005,
+      "learning_rate": 0.003,
+      "loss": 4.2869,
+      "step": 1271
+    },
+    {
+      "epoch": 0.01272,
+      "grad_norm": 0.522612988948822,
+      "learning_rate": 0.003,
+      "loss": 4.2863,
+      "step": 1272
+    },
+    {
+      "epoch": 0.01273,
+      "grad_norm": 0.8000910878181458,
+      "learning_rate": 0.003,
+      "loss": 4.321,
+      "step": 1273
+    },
+    {
+      "epoch": 0.01274,
+      "grad_norm": 1.0203778743743896,
+      "learning_rate": 0.003,
+      "loss": 4.3505,
+      "step": 1274
+    },
+    {
+      "epoch": 0.01275,
+      "grad_norm": 0.8743080496788025,
+      "learning_rate": 0.003,
+      "loss": 4.3003,
+      "step": 1275
+    },
+    {
+      "epoch": 0.01276,
+      "grad_norm": 0.9019266963005066,
+      "learning_rate": 0.003,
+      "loss": 4.3144,
+      "step": 1276
+    },
+    {
+      "epoch": 0.01277,
+      "grad_norm": 0.822109580039978,
+      "learning_rate": 0.003,
+      "loss": 4.3254,
+      "step": 1277
+    },
+    {
+      "epoch": 0.01278,
+      "grad_norm": 0.6606391668319702,
+      "learning_rate": 0.003,
+      "loss": 4.325,
+      "step": 1278
+    },
+    {
+      "epoch": 0.01279,
+      "grad_norm": 0.7756208777427673,
+      "learning_rate": 0.003,
+      "loss": 4.3104,
+      "step": 1279
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.7292629480361938,
+      "learning_rate": 0.003,
+      "loss": 4.311,
+      "step": 1280
+    },
+    {
+      "epoch": 0.01281,
+      "grad_norm": 0.6916735768318176,
+      "learning_rate": 0.003,
+      "loss": 4.2953,
+      "step": 1281
+    },
+    {
+      "epoch": 0.01282,
+      "grad_norm": 0.6080433130264282,
+      "learning_rate": 0.003,
+      "loss": 4.3294,
+      "step": 1282
+    },
+    {
+      "epoch": 0.01283,
+      "grad_norm": 0.6106524467468262,
+      "learning_rate": 0.003,
+      "loss": 4.301,
+      "step": 1283
+    },
+    {
+      "epoch": 0.01284,
+      "grad_norm": 0.5725691318511963,
+      "learning_rate": 0.003,
+      "loss": 4.2991,
+      "step": 1284
+    },
+    {
+      "epoch": 0.01285,
+      "grad_norm": 0.6401168704032898,
+      "learning_rate": 0.003,
+      "loss": 4.3022,
+      "step": 1285
+    },
+    {
+      "epoch": 0.01286,
+      "grad_norm": 0.620028555393219,
+      "learning_rate": 0.003,
+      "loss": 4.3246,
+      "step": 1286
+    },
+    {
+      "epoch": 0.01287,
+      "grad_norm": 0.662833034992218,
+      "learning_rate": 0.003,
+      "loss": 4.2902,
+      "step": 1287
+    },
+    {
+      "epoch": 0.01288,
+      "grad_norm": 0.6968905925750732,
+      "learning_rate": 0.003,
+      "loss": 4.307,
+      "step": 1288
+    },
+    {
+      "epoch": 0.01289,
+      "grad_norm": 0.5818215608596802,
+      "learning_rate": 0.003,
+      "loss": 4.2943,
+      "step": 1289
+    },
+    {
+      "epoch": 0.0129,
+      "grad_norm": 0.5997996926307678,
+      "learning_rate": 0.003,
+      "loss": 4.3178,
+      "step": 1290
+    },
+    {
+      "epoch": 0.01291,
+      "grad_norm": 0.6353629231452942,
+      "learning_rate": 0.003,
+      "loss": 4.2994,
+      "step": 1291
+    },
+    {
+      "epoch": 0.01292,
+      "grad_norm": 0.7124042510986328,
+      "learning_rate": 0.003,
+      "loss": 4.3335,
+      "step": 1292
+    },
+    {
+      "epoch": 0.01293,
+      "grad_norm": 0.8257373571395874,
+      "learning_rate": 0.003,
+      "loss": 4.3265,
+      "step": 1293
+    },
+    {
+      "epoch": 0.01294,
+      "grad_norm": 0.7939269542694092,
+      "learning_rate": 0.003,
+      "loss": 4.2974,
+      "step": 1294
+    },
+    {
+      "epoch": 0.01295,
+      "grad_norm": 0.703215479850769,
+      "learning_rate": 0.003,
+      "loss": 4.3191,
+      "step": 1295
+    },
+    {
+      "epoch": 0.01296,
+      "grad_norm": 0.851357102394104,
+      "learning_rate": 0.003,
+      "loss": 4.2949,
+      "step": 1296
+    },
+    {
+      "epoch": 0.01297,
+      "grad_norm": 0.8115617036819458,
+      "learning_rate": 0.003,
+      "loss": 4.309,
+      "step": 1297
+    },
+    {
+      "epoch": 0.01298,
+      "grad_norm": 0.8384767174720764,
+      "learning_rate": 0.003,
+      "loss": 4.2888,
+      "step": 1298
+    },
+    {
+      "epoch": 0.01299,
+      "grad_norm": 0.8158665895462036,
+      "learning_rate": 0.003,
+      "loss": 4.305,
+      "step": 1299
+    },
+    {
+      "epoch": 0.013,
+      "grad_norm": 0.7459951639175415,
+      "learning_rate": 0.003,
+      "loss": 4.29,
+      "step": 1300
+    },
+    {
+      "epoch": 0.01301,
+      "grad_norm": 0.7418009042739868,
+      "learning_rate": 0.003,
+      "loss": 4.3054,
+      "step": 1301
+    },
+    {
+      "epoch": 0.01302,
+      "grad_norm": 0.8234617710113525,
+      "learning_rate": 0.003,
+      "loss": 4.358,
+      "step": 1302
+    },
+    {
+      "epoch": 0.01303,
+      "grad_norm": 0.7982010841369629,
+      "learning_rate": 0.003,
+      "loss": 4.3068,
+      "step": 1303
+    },
+    {
+      "epoch": 0.01304,
+      "grad_norm": 0.812175452709198,
+      "learning_rate": 0.003,
+      "loss": 4.3282,
+      "step": 1304
+    },
+    {
+      "epoch": 0.01305,
+      "grad_norm": 0.8789085745811462,
+      "learning_rate": 0.003,
+      "loss": 4.322,
+      "step": 1305
+    },
+    {
+      "epoch": 0.01306,
+      "grad_norm": 0.828743577003479,
+      "learning_rate": 0.003,
+      "loss": 4.3122,
+      "step": 1306
+    },
+    {
+      "epoch": 0.01307,
+      "grad_norm": 0.7142960429191589,
+      "learning_rate": 0.003,
+      "loss": 4.3298,
+      "step": 1307
+    },
+    {
+      "epoch": 0.01308,
+      "grad_norm": 0.7313523888587952,
+      "learning_rate": 0.003,
+      "loss": 4.3053,
+      "step": 1308
+    },
+    {
+      "epoch": 0.01309,
+      "grad_norm": 0.7293263077735901,
+      "learning_rate": 0.003,
+      "loss": 4.3473,
+      "step": 1309
+    },
+    {
+      "epoch": 0.0131,
+      "grad_norm": 0.7924617528915405,
+      "learning_rate": 0.003,
+      "loss": 4.2932,
+      "step": 1310
+    },
+    {
+      "epoch": 0.01311,
+      "grad_norm": 0.8199212551116943,
+      "learning_rate": 0.003,
+      "loss": 4.3417,
+      "step": 1311
+    },
+    {
+      "epoch": 0.01312,
+      "grad_norm": 0.8725687861442566,
+      "learning_rate": 0.003,
+      "loss": 4.3076,
+      "step": 1312
+    },
+    {
+      "epoch": 0.01313,
+      "grad_norm": 0.8202559351921082,
+      "learning_rate": 0.003,
+      "loss": 4.2971,
+      "step": 1313
+    },
+    {
+      "epoch": 0.01314,
+      "grad_norm": 0.7738103866577148,
+      "learning_rate": 0.003,
+      "loss": 4.2944,
+      "step": 1314
+    },
+    {
+      "epoch": 0.01315,
+      "grad_norm": 0.6587685346603394,
+      "learning_rate": 0.003,
+      "loss": 4.3124,
+      "step": 1315
+    },
+    {
+      "epoch": 0.01316,
+      "grad_norm": 0.7233019471168518,
+      "learning_rate": 0.003,
+      "loss": 4.3037,
+      "step": 1316
+    },
+    {
+      "epoch": 0.01317,
+      "grad_norm": 0.6440667510032654,
+      "learning_rate": 0.003,
+      "loss": 4.3256,
+      "step": 1317
+    },
+    {
+      "epoch": 0.01318,
+      "grad_norm": 0.5893415212631226,
+      "learning_rate": 0.003,
+      "loss": 4.301,
+      "step": 1318
+    },
+    {
+      "epoch": 0.01319,
+      "grad_norm": 0.7347000241279602,
+      "learning_rate": 0.003,
+      "loss": 4.3133,
+      "step": 1319
+    },
+    {
+      "epoch": 0.0132,
+      "grad_norm": 0.8620561957359314,
+      "learning_rate": 0.003,
+      "loss": 4.3153,
+      "step": 1320
+    },
+    {
+      "epoch": 0.01321,
+      "grad_norm": 0.9148700833320618,
+      "learning_rate": 0.003,
+      "loss": 4.2941,
+      "step": 1321
+    },
+    {
+      "epoch": 0.01322,
+      "grad_norm": 0.8122279047966003,
+      "learning_rate": 0.003,
+      "loss": 4.3273,
+      "step": 1322
+    },
+    {
+      "epoch": 0.01323,
+      "grad_norm": 0.7629367113113403,
+      "learning_rate": 0.003,
+      "loss": 4.3161,
+      "step": 1323
+    },
+    {
+      "epoch": 0.01324,
+      "grad_norm": 0.6652523875236511,
+      "learning_rate": 0.003,
+      "loss": 4.2981,
+      "step": 1324
+    },
+    {
+      "epoch": 0.01325,
+      "grad_norm": 0.6290608048439026,
+      "learning_rate": 0.003,
+      "loss": 4.3001,
+      "step": 1325
+    },
+    {
+      "epoch": 0.01326,
+      "grad_norm": 0.6334658861160278,
+      "learning_rate": 0.003,
+      "loss": 4.2848,
+      "step": 1326
+    },
+    {
+      "epoch": 0.01327,
+      "grad_norm": 0.5682603716850281,
+      "learning_rate": 0.003,
+      "loss": 4.3018,
+      "step": 1327
+    },
+    {
+      "epoch": 0.01328,
+      "grad_norm": 0.4706239700317383,
+      "learning_rate": 0.003,
+      "loss": 4.2884,
+      "step": 1328
+    },
+    {
+      "epoch": 0.01329,
+      "grad_norm": 0.46228259801864624,
+      "learning_rate": 0.003,
+      "loss": 4.2518,
+      "step": 1329
+    },
+    {
+      "epoch": 0.0133,
+      "grad_norm": 0.4335189163684845,
+      "learning_rate": 0.003,
+      "loss": 4.293,
+      "step": 1330
+    },
+    {
+      "epoch": 0.01331,
+      "grad_norm": 0.383666068315506,
+      "learning_rate": 0.003,
+      "loss": 4.2772,
+      "step": 1331
+    },
+    {
+      "epoch": 0.01332,
+      "grad_norm": 0.39924779534339905,
+      "learning_rate": 0.003,
+      "loss": 4.2955,
+      "step": 1332
+    },
+    {
+      "epoch": 0.01333,
+      "grad_norm": 0.4323978126049042,
+      "learning_rate": 0.003,
+      "loss": 4.2692,
+      "step": 1333
+    },
+    {
+      "epoch": 0.01334,
+      "grad_norm": 0.43190306425094604,
+      "learning_rate": 0.003,
+      "loss": 4.2574,
+      "step": 1334
+    },
+    {
+      "epoch": 0.01335,
+      "grad_norm": 0.49212101101875305,
+      "learning_rate": 0.003,
+      "loss": 4.2803,
+      "step": 1335
+    },
+    {
+      "epoch": 0.01336,
+      "grad_norm": 0.6307611465454102,
+      "learning_rate": 0.003,
+      "loss": 4.2833,
+      "step": 1336
+    },
+    {
+      "epoch": 0.01337,
+      "grad_norm": 0.9523991942405701,
+      "learning_rate": 0.003,
+      "loss": 4.31,
+      "step": 1337
+    },
+    {
+      "epoch": 0.01338,
+      "grad_norm": 1.0491547584533691,
+      "learning_rate": 0.003,
+      "loss": 4.3099,
+      "step": 1338
+    },
+    {
+      "epoch": 0.01339,
+      "grad_norm": 0.7694852948188782,
+      "learning_rate": 0.003,
+      "loss": 4.307,
+      "step": 1339
+    },
+    {
+      "epoch": 0.0134,
+      "grad_norm": 0.7887849807739258,
+      "learning_rate": 0.003,
+      "loss": 4.3001,
+      "step": 1340
+    },
+    {
+      "epoch": 0.01341,
+      "grad_norm": 0.7728468179702759,
+      "learning_rate": 0.003,
+      "loss": 4.3115,
+      "step": 1341
+    },
+    {
+      "epoch": 0.01342,
+      "grad_norm": 0.7008172273635864,
+      "learning_rate": 0.003,
+      "loss": 4.2807,
+      "step": 1342
+    },
+    {
+      "epoch": 0.01343,
+      "grad_norm": 0.7046615481376648,
+      "learning_rate": 0.003,
+      "loss": 4.2868,
+      "step": 1343
+    },
+    {
+      "epoch": 0.01344,
+      "grad_norm": 0.6420544385910034,
+      "learning_rate": 0.003,
+      "loss": 4.2829,
+      "step": 1344
+    },
+    {
+      "epoch": 0.01345,
+      "grad_norm": 0.5914574861526489,
+      "learning_rate": 0.003,
+      "loss": 4.3212,
+      "step": 1345
+    },
+    {
+      "epoch": 0.01346,
+      "grad_norm": 0.5600994229316711,
+      "learning_rate": 0.003,
+      "loss": 4.2911,
+      "step": 1346
+    },
+    {
+      "epoch": 0.01347,
+      "grad_norm": 0.46645763516426086,
+      "learning_rate": 0.003,
+      "loss": 4.2774,
+      "step": 1347
+    },
+    {
+      "epoch": 0.01348,
+      "grad_norm": 0.49815306067466736,
+      "learning_rate": 0.003,
+      "loss": 4.2778,
+      "step": 1348
+    },
+    {
+      "epoch": 0.01349,
+      "grad_norm": 0.525874674320221,
+      "learning_rate": 0.003,
+      "loss": 4.2987,
+      "step": 1349
+    },
+    {
+      "epoch": 0.0135,
+      "grad_norm": 0.487930566072464,
+      "learning_rate": 0.003,
+      "loss": 4.2878,
+      "step": 1350
+    },
+    {
+      "epoch": 0.01351,
+      "grad_norm": 0.5213081240653992,
+      "learning_rate": 0.003,
+      "loss": 4.3174,
+      "step": 1351
+    },
+    {
+      "epoch": 0.01352,
+      "grad_norm": 0.6742727160453796,
+      "learning_rate": 0.003,
+      "loss": 4.2961,
+      "step": 1352
+    },
+    {
+      "epoch": 0.01353,
+      "grad_norm": 0.7189781069755554,
+      "learning_rate": 0.003,
+      "loss": 4.2871,
+      "step": 1353
+    },
+    {
+      "epoch": 0.01354,
+      "grad_norm": 0.59047532081604,
+      "learning_rate": 0.003,
+      "loss": 4.2676,
+      "step": 1354
+    },
+    {
+      "epoch": 0.01355,
+      "grad_norm": 0.5686421394348145,
+      "learning_rate": 0.003,
+      "loss": 4.2819,
+      "step": 1355
+    },
+    {
+      "epoch": 0.01356,
+      "grad_norm": 0.5455272793769836,
+      "learning_rate": 0.003,
+      "loss": 4.2746,
+      "step": 1356
+    },
+    {
+      "epoch": 0.01357,
+      "grad_norm": 0.6083900332450867,
+      "learning_rate": 0.003,
+      "loss": 4.2628,
+      "step": 1357
+    },
+    {
+      "epoch": 0.01358,
+      "grad_norm": 0.5685535073280334,
+      "learning_rate": 0.003,
+      "loss": 4.258,
+      "step": 1358
+    },
+    {
+      "epoch": 0.01359,
+      "grad_norm": 0.5254443883895874,
+      "learning_rate": 0.003,
+      "loss": 4.2971,
+      "step": 1359
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.6204063892364502,
+      "learning_rate": 0.003,
+      "loss": 4.2647,
+      "step": 1360
+    },
+    {
+      "epoch": 0.01361,
+      "grad_norm": 0.7256515026092529,
+      "learning_rate": 0.003,
+      "loss": 4.2884,
+      "step": 1361
+    },
+    {
+      "epoch": 0.01362,
+      "grad_norm": 0.8139166831970215,
+      "learning_rate": 0.003,
+      "loss": 4.3042,
+      "step": 1362
+    },
+    {
+      "epoch": 0.01363,
+      "grad_norm": 0.7577769756317139,
+      "learning_rate": 0.003,
+      "loss": 4.3089,
+      "step": 1363
+    },
+    {
+      "epoch": 0.01364,
+      "grad_norm": 0.6858768463134766,
+      "learning_rate": 0.003,
+      "loss": 4.2972,
+      "step": 1364
+    },
+    {
+      "epoch": 0.01365,
+      "grad_norm": 0.7445408701896667,
+      "learning_rate": 0.003,
+      "loss": 4.315,
+      "step": 1365
+    },
+    {
+      "epoch": 0.01366,
+      "grad_norm": 0.9198596477508545,
+      "learning_rate": 0.003,
+      "loss": 4.2795,
+      "step": 1366
+    },
+    {
+      "epoch": 0.01367,
+      "grad_norm": 0.8477755188941956,
+      "learning_rate": 0.003,
+      "loss": 4.3279,
+      "step": 1367
+    },
+    {
+      "epoch": 0.01368,
+      "grad_norm": 0.8082318305969238,
+      "learning_rate": 0.003,
+      "loss": 4.302,
+      "step": 1368
+    },
+    {
+      "epoch": 0.01369,
+      "grad_norm": 0.8816357851028442,
+      "learning_rate": 0.003,
+      "loss": 4.3,
+      "step": 1369
+    },
+    {
+      "epoch": 0.0137,
+      "grad_norm": 0.8340180516242981,
+      "learning_rate": 0.003,
+      "loss": 4.326,
+      "step": 1370
+    },
+    {
+      "epoch": 0.01371,
+      "grad_norm": 0.7977674007415771,
+      "learning_rate": 0.003,
+      "loss": 4.3179,
+      "step": 1371
+    },
+    {
+      "epoch": 0.01372,
+      "grad_norm": 0.7062758803367615,
+      "learning_rate": 0.003,
+      "loss": 4.2633,
+      "step": 1372
+    },
+    {
+      "epoch": 0.01373,
+      "grad_norm": 0.6788503527641296,
+      "learning_rate": 0.003,
+      "loss": 4.3,
+      "step": 1373
+    },
+    {
+      "epoch": 0.01374,
+      "grad_norm": 0.6201223731040955,
+      "learning_rate": 0.003,
+      "loss": 4.3053,
+      "step": 1374
+    },
+    {
+      "epoch": 0.01375,
+      "grad_norm": 0.6361509561538696,
+      "learning_rate": 0.003,
+      "loss": 4.264,
+      "step": 1375
+    },
+    {
+      "epoch": 0.01376,
+      "grad_norm": 0.5589747428894043,
+      "learning_rate": 0.003,
+      "loss": 4.2871,
+      "step": 1376
+    },
+    {
+      "epoch": 0.01377,
+      "grad_norm": 0.6119049787521362,
+      "learning_rate": 0.003,
+      "loss": 4.2875,
+      "step": 1377
+    },
+    {
+      "epoch": 0.01378,
+      "grad_norm": 0.5476299524307251,
+      "learning_rate": 0.003,
+      "loss": 4.3133,
+      "step": 1378
+    },
+    {
+      "epoch": 0.01379,
+      "grad_norm": 0.5394819974899292,
+      "learning_rate": 0.003,
+      "loss": 4.2841,
+      "step": 1379
+    },
+    {
+      "epoch": 0.0138,
+      "grad_norm": 0.49164652824401855,
+      "learning_rate": 0.003,
+      "loss": 4.2791,
+      "step": 1380
+    },
+    {
+      "epoch": 0.01381,
+      "grad_norm": 0.4505774974822998,
+      "learning_rate": 0.003,
+      "loss": 4.2845,
+      "step": 1381
+    },
+    {
+      "epoch": 0.01382,
+      "grad_norm": 0.4613495469093323,
+      "learning_rate": 0.003,
+      "loss": 4.279,
+      "step": 1382
+    },
+    {
+      "epoch": 0.01383,
+      "grad_norm": 0.6032357215881348,
+      "learning_rate": 0.003,
+      "loss": 4.2785,
+      "step": 1383
+    },
+    {
+      "epoch": 0.01384,
+      "grad_norm": 0.8419365882873535,
+      "learning_rate": 0.003,
+      "loss": 4.3193,
+      "step": 1384
+    },
+    {
+      "epoch": 0.01385,
+      "grad_norm": 1.1751699447631836,
+      "learning_rate": 0.003,
+      "loss": 4.3253,
+      "step": 1385
+    },
+    {
+      "epoch": 0.01386,
+      "grad_norm": 0.6834710240364075,
+      "learning_rate": 0.003,
+      "loss": 4.2903,
+      "step": 1386
+    },
+    {
+      "epoch": 0.01387,
+      "grad_norm": 0.5957779288291931,
+      "learning_rate": 0.003,
+      "loss": 4.2761,
+      "step": 1387
+    },
+    {
+      "epoch": 0.01388,
+      "grad_norm": 0.6788092851638794,
+      "learning_rate": 0.003,
+      "loss": 4.3125,
+      "step": 1388
+    },
+    {
+      "epoch": 0.01389,
+      "grad_norm": 0.6753062605857849,
+      "learning_rate": 0.003,
+      "loss": 4.2893,
+      "step": 1389
+    },
+    {
+      "epoch": 0.0139,
+      "grad_norm": 0.6624464392662048,
+      "learning_rate": 0.003,
+      "loss": 4.2919,
+      "step": 1390
+    },
+    {
+      "epoch": 0.01391,
+      "grad_norm": 0.5935611724853516,
+      "learning_rate": 0.003,
+      "loss": 4.2895,
+      "step": 1391
+    },
+    {
+      "epoch": 0.01392,
+      "grad_norm": 0.5433003902435303,
+      "learning_rate": 0.003,
+      "loss": 4.2973,
+      "step": 1392
+    },
+    {
+      "epoch": 0.01393,
+      "grad_norm": 0.5145422220230103,
+      "learning_rate": 0.003,
+      "loss": 4.2666,
+      "step": 1393
+    },
+    {
+      "epoch": 0.01394,
+      "grad_norm": 0.49970656633377075,
+      "learning_rate": 0.003,
+      "loss": 4.2526,
+      "step": 1394
+    },
+    {
+      "epoch": 0.01395,
+      "grad_norm": 0.5094105005264282,
+      "learning_rate": 0.003,
+      "loss": 4.283,
+      "step": 1395
+    },
+    {
+      "epoch": 0.01396,
+      "grad_norm": 0.5121984481811523,
+      "learning_rate": 0.003,
+      "loss": 4.274,
+      "step": 1396
+    },
+    {
+      "epoch": 0.01397,
+      "grad_norm": 0.5195156931877136,
+      "learning_rate": 0.003,
+      "loss": 4.286,
+      "step": 1397
+    },
+    {
+      "epoch": 0.01398,
+      "grad_norm": 0.47007647156715393,
+      "learning_rate": 0.003,
+      "loss": 4.2651,
+      "step": 1398
+    },
+    {
+      "epoch": 0.01399,
+      "grad_norm": 0.4805718660354614,
+      "learning_rate": 0.003,
+      "loss": 4.287,
+      "step": 1399
+    },
+    {
+      "epoch": 0.014,
+      "grad_norm": 0.6213575005531311,
+      "learning_rate": 0.003,
+      "loss": 4.2485,
+      "step": 1400
+    },
+    {
+      "epoch": 0.01401,
+      "grad_norm": 0.7338332533836365,
+      "learning_rate": 0.003,
+      "loss": 4.2762,
+      "step": 1401
+    },
+    {
+      "epoch": 0.01402,
+      "grad_norm": 0.7789960503578186,
+      "learning_rate": 0.003,
+      "loss": 4.2883,
+      "step": 1402
+    },
+    {
+      "epoch": 0.01403,
+      "grad_norm": 0.7047394514083862,
+      "learning_rate": 0.003,
+      "loss": 4.2622,
+      "step": 1403
+    },
+    {
+      "epoch": 0.01404,
+      "grad_norm": 0.5580564737319946,
+      "learning_rate": 0.003,
+      "loss": 4.279,
+      "step": 1404
+    },
+    {
+      "epoch": 0.01405,
+      "grad_norm": 0.6150562763214111,
+      "learning_rate": 0.003,
+      "loss": 4.2978,
+      "step": 1405
+    },
+    {
+      "epoch": 0.01406,
+      "grad_norm": 0.6412274241447449,
+      "learning_rate": 0.003,
+      "loss": 4.2817,
+      "step": 1406
+    },
+    {
+      "epoch": 0.01407,
+      "grad_norm": 0.6951134204864502,
+      "learning_rate": 0.003,
+      "loss": 4.2975,
+      "step": 1407
+    },
+    {
+      "epoch": 0.01408,
+      "grad_norm": 0.7907094955444336,
+      "learning_rate": 0.003,
+      "loss": 4.2629,
+      "step": 1408
+    },
+    {
+      "epoch": 0.01409,
+      "grad_norm": 0.6941542625427246,
+      "learning_rate": 0.003,
+      "loss": 4.2885,
+      "step": 1409
+    },
+    {
+      "epoch": 0.0141,
+      "grad_norm": 0.6482591032981873,
+      "learning_rate": 0.003,
+      "loss": 4.2693,
+      "step": 1410
+    },
+    {
+      "epoch": 0.01411,
+      "grad_norm": 0.6562476754188538,
+      "learning_rate": 0.003,
+      "loss": 4.2437,
+      "step": 1411
+    },
+    {
+      "epoch": 0.01412,
+      "grad_norm": 0.7608663439750671,
+      "learning_rate": 0.003,
+      "loss": 4.2874,
+      "step": 1412
+    },
+    {
+      "epoch": 0.01413,
+      "grad_norm": 0.763931930065155,
+      "learning_rate": 0.003,
+      "loss": 4.2842,
+      "step": 1413
+    },
+    {
+      "epoch": 0.01414,
+      "grad_norm": 0.7897149324417114,
+      "learning_rate": 0.003,
+      "loss": 4.2662,
+      "step": 1414
+    },
+    {
+      "epoch": 0.01415,
+      "grad_norm": 0.7817095518112183,
+      "learning_rate": 0.003,
+      "loss": 4.2634,
+      "step": 1415
+    },
+    {
+      "epoch": 0.01416,
+      "grad_norm": 0.7488892674446106,
+      "learning_rate": 0.003,
+      "loss": 4.2852,
+      "step": 1416
+    },
+    {
+      "epoch": 0.01417,
+      "grad_norm": 0.8105233907699585,
+      "learning_rate": 0.003,
+      "loss": 4.2802,
+      "step": 1417
+    },
+    {
+      "epoch": 0.01418,
+      "grad_norm": 0.7555150389671326,
+      "learning_rate": 0.003,
+      "loss": 4.2974,
+      "step": 1418
+    },
+    {
+      "epoch": 0.01419,
+      "grad_norm": 0.7873964309692383,
+      "learning_rate": 0.003,
+      "loss": 4.2928,
+      "step": 1419
+    },
+    {
+      "epoch": 0.0142,
+      "grad_norm": 0.942128598690033,
+      "learning_rate": 0.003,
+      "loss": 4.3049,
+      "step": 1420
+    },
+    {
+      "epoch": 0.01421,
+      "grad_norm": 1.1309813261032104,
+      "learning_rate": 0.003,
+      "loss": 4.2967,
+      "step": 1421
+    },
+    {
+      "epoch": 0.01422,
+      "grad_norm": 0.8632763028144836,
+      "learning_rate": 0.003,
+      "loss": 4.2868,
+      "step": 1422
+    },
+    {
+      "epoch": 0.01423,
+      "grad_norm": 0.8987135887145996,
+      "learning_rate": 0.003,
+      "loss": 4.2941,
+      "step": 1423
+    },
+    {
+      "epoch": 0.01424,
+      "grad_norm": 0.7247830629348755,
+      "learning_rate": 0.003,
+      "loss": 4.3046,
+      "step": 1424
+    },
+    {
+      "epoch": 0.01425,
+      "grad_norm": 0.6278427839279175,
+      "learning_rate": 0.003,
+      "loss": 4.3054,
+      "step": 1425
+    },
+    {
+      "epoch": 0.01426,
+      "grad_norm": 0.6600518226623535,
+      "learning_rate": 0.003,
+      "loss": 4.2819,
+      "step": 1426
+    },
+    {
+      "epoch": 0.01427,
+      "grad_norm": 0.744967520236969,
+      "learning_rate": 0.003,
+      "loss": 4.2813,
+      "step": 1427
+    },
+    {
+      "epoch": 0.01428,
+      "grad_norm": 0.6531183123588562,
+      "learning_rate": 0.003,
+      "loss": 4.2709,
+      "step": 1428
+    },
+    {
+      "epoch": 0.01429,
+      "grad_norm": 0.6728975176811218,
+      "learning_rate": 0.003,
+      "loss": 4.2816,
+      "step": 1429
+    },
+    {
+      "epoch": 0.0143,
+      "grad_norm": 0.6716436147689819,
+      "learning_rate": 0.003,
+      "loss": 4.2945,
+      "step": 1430
+    },
+    {
+      "epoch": 0.01431,
+      "grad_norm": 0.674777090549469,
+      "learning_rate": 0.003,
+      "loss": 4.2804,
+      "step": 1431
+    },
+    {
+      "epoch": 0.01432,
+      "grad_norm": 0.6107741594314575,
+      "learning_rate": 0.003,
+      "loss": 4.2694,
+      "step": 1432
+    },
+    {
+      "epoch": 0.01433,
+      "grad_norm": 0.5782887935638428,
+      "learning_rate": 0.003,
+      "loss": 4.2651,
+      "step": 1433
+    },
+    {
+      "epoch": 0.01434,
+      "grad_norm": 0.5728036761283875,
+      "learning_rate": 0.003,
+      "loss": 4.2497,
+      "step": 1434
+    },
+    {
+      "epoch": 0.01435,
+      "grad_norm": 0.5406824350357056,
+      "learning_rate": 0.003,
+      "loss": 4.2562,
+      "step": 1435
+    },
+    {
+      "epoch": 0.01436,
+      "grad_norm": 0.5029864311218262,
+      "learning_rate": 0.003,
+      "loss": 4.2703,
+      "step": 1436
+    },
+    {
+      "epoch": 0.01437,
+      "grad_norm": 0.5657121539115906,
+      "learning_rate": 0.003,
+      "loss": 4.2852,
+      "step": 1437
+    },
+    {
+      "epoch": 0.01438,
+      "grad_norm": 0.5845190286636353,
+      "learning_rate": 0.003,
+      "loss": 4.2793,
+      "step": 1438
+    },
+    {
+      "epoch": 0.01439,
+      "grad_norm": 0.591201901435852,
+      "learning_rate": 0.003,
+      "loss": 4.2448,
+      "step": 1439
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.8200764656066895,
+      "learning_rate": 0.003,
+      "loss": 4.2641,
+      "step": 1440
+    },
+    {
+      "epoch": 0.01441,
+      "grad_norm": 0.888174831867218,
+      "learning_rate": 0.003,
+      "loss": 4.2921,
+      "step": 1441
+    },
+    {
+      "epoch": 0.01442,
+      "grad_norm": 0.79758220911026,
+      "learning_rate": 0.003,
+      "loss": 4.2638,
+      "step": 1442
+    },
+    {
+      "epoch": 0.01443,
+      "grad_norm": 0.745823085308075,
+      "learning_rate": 0.003,
+      "loss": 4.3231,
+      "step": 1443
+    },
+    {
+      "epoch": 0.01444,
+      "grad_norm": 0.8156546354293823,
+      "learning_rate": 0.003,
+      "loss": 4.2923,
+      "step": 1444
+    },
+    {
+      "epoch": 0.01445,
+      "grad_norm": 0.7982717752456665,
+      "learning_rate": 0.003,
+      "loss": 4.2557,
+      "step": 1445
+    },
+    {
+      "epoch": 0.01446,
+      "grad_norm": 0.6928796172142029,
+      "learning_rate": 0.003,
+      "loss": 4.264,
+      "step": 1446
+    },
+    {
+      "epoch": 0.01447,
+      "grad_norm": 0.6646214723587036,
+      "learning_rate": 0.003,
+      "loss": 4.2746,
+      "step": 1447
+    },
+    {
+      "epoch": 0.01448,
+      "grad_norm": 0.6782436966896057,
+      "learning_rate": 0.003,
+      "loss": 4.2822,
+      "step": 1448
+    },
+    {
+      "epoch": 0.01449,
+      "grad_norm": 0.6758451461791992,
+      "learning_rate": 0.003,
+      "loss": 4.2758,
+      "step": 1449
+    },
+    {
+      "epoch": 0.0145,
+      "grad_norm": 0.6399548649787903,
+      "learning_rate": 0.003,
+      "loss": 4.291,
+      "step": 1450
+    },
+    {
+      "epoch": 0.01451,
+      "grad_norm": 0.6663489937782288,
+      "learning_rate": 0.003,
+      "loss": 4.2675,
+      "step": 1451
+    },
+    {
+      "epoch": 0.01452,
+      "grad_norm": 0.676326334476471,
+      "learning_rate": 0.003,
+      "loss": 4.2687,
+      "step": 1452
+    },
+    {
+      "epoch": 0.01453,
+      "grad_norm": 0.5554569959640503,
+      "learning_rate": 0.003,
+      "loss": 4.2765,
+      "step": 1453
+    },
+    {
+      "epoch": 0.01454,
+      "grad_norm": 0.5639092922210693,
+      "learning_rate": 0.003,
+      "loss": 4.2445,
+      "step": 1454
+    },
+    {
+      "epoch": 0.01455,
+      "grad_norm": 0.49674221873283386,
+      "learning_rate": 0.003,
+      "loss": 4.2722,
+      "step": 1455
+    },
+    {
+      "epoch": 0.01456,
+      "grad_norm": 0.5553675293922424,
+      "learning_rate": 0.003,
+      "loss": 4.2521,
+      "step": 1456
+    },
+    {
+      "epoch": 0.01457,
+      "grad_norm": 0.6361852288246155,
+      "learning_rate": 0.003,
+      "loss": 4.2449,
+      "step": 1457
+    },
+    {
+      "epoch": 0.01458,
+      "grad_norm": 0.7916824817657471,
+      "learning_rate": 0.003,
+      "loss": 4.2458,
+      "step": 1458
+    },
+    {
+      "epoch": 0.01459,
+      "grad_norm": 0.8565419316291809,
+      "learning_rate": 0.003,
+      "loss": 4.2834,
+      "step": 1459
+    },
+    {
+      "epoch": 0.0146,
+      "grad_norm": 0.7326977252960205,
+      "learning_rate": 0.003,
+      "loss": 4.28,
+      "step": 1460
+    },
+    {
+      "epoch": 0.01461,
+      "grad_norm": 0.7401906251907349,
+      "learning_rate": 0.003,
+      "loss": 4.2599,
+      "step": 1461
+    },
+    {
+      "epoch": 0.01462,
+      "grad_norm": 0.7613895535469055,
+      "learning_rate": 0.003,
+      "loss": 4.2676,
+      "step": 1462
+    },
+    {
+      "epoch": 0.01463,
+      "grad_norm": 0.642987072467804,
+      "learning_rate": 0.003,
+      "loss": 4.2603,
+      "step": 1463
+    },
+    {
+      "epoch": 0.01464,
+      "grad_norm": 0.705771803855896,
+      "learning_rate": 0.003,
+      "loss": 4.2361,
+      "step": 1464
+    },
+    {
+      "epoch": 0.01465,
+      "grad_norm": 0.6884810328483582,
+      "learning_rate": 0.003,
+      "loss": 4.284,
+      "step": 1465
+    },
+    {
+      "epoch": 0.01466,
+      "grad_norm": 0.5892930626869202,
+      "learning_rate": 0.003,
+      "loss": 4.2489,
+      "step": 1466
+    },
+    {
+      "epoch": 0.01467,
+      "grad_norm": 0.6196565628051758,
+      "learning_rate": 0.003,
+      "loss": 4.2848,
+      "step": 1467
+    },
+    {
+      "epoch": 0.01468,
+      "grad_norm": 0.7562909126281738,
+      "learning_rate": 0.003,
+      "loss": 4.2639,
+      "step": 1468
+    },
+    {
+      "epoch": 0.01469,
+      "grad_norm": 0.7464540004730225,
+      "learning_rate": 0.003,
+      "loss": 4.2693,
+      "step": 1469
+    },
+    {
+      "epoch": 0.0147,
+      "grad_norm": 0.6821353435516357,
+      "learning_rate": 0.003,
+      "loss": 4.2569,
+      "step": 1470
+    },
+    {
+      "epoch": 0.01471,
+      "grad_norm": 0.5921043753623962,
+      "learning_rate": 0.003,
+      "loss": 4.2689,
+      "step": 1471
+    },
+    {
+      "epoch": 0.01472,
+      "grad_norm": 0.531998336315155,
+      "learning_rate": 0.003,
+      "loss": 4.252,
+      "step": 1472
+    },
+    {
+      "epoch": 0.01473,
+      "grad_norm": 0.5447686910629272,
+      "learning_rate": 0.003,
+      "loss": 4.2658,
+      "step": 1473
+    },
+    {
+      "epoch": 0.01474,
+      "grad_norm": 0.6622011661529541,
+      "learning_rate": 0.003,
+      "loss": 4.2515,
+      "step": 1474
+    },
+    {
+      "epoch": 0.01475,
+      "grad_norm": 0.7866929173469543,
+      "learning_rate": 0.003,
+      "loss": 4.2783,
+      "step": 1475
+    },
+    {
+      "epoch": 0.01476,
+      "grad_norm": 0.965038537979126,
+      "learning_rate": 0.003,
+      "loss": 4.2726,
+      "step": 1476
+    },
+    {
+      "epoch": 0.01477,
+      "grad_norm": 0.9310076236724854,
+      "learning_rate": 0.003,
+      "loss": 4.2926,
+      "step": 1477
+    },
+    {
+      "epoch": 0.01478,
+      "grad_norm": 0.7643823027610779,
+      "learning_rate": 0.003,
+      "loss": 4.2629,
+      "step": 1478
+    },
+    {
+      "epoch": 0.01479,
+      "grad_norm": 0.867812991142273,
+      "learning_rate": 0.003,
+      "loss": 4.3023,
+      "step": 1479
+    },
+    {
+      "epoch": 0.0148,
+      "grad_norm": 0.7325505018234253,
+      "learning_rate": 0.003,
+      "loss": 4.2689,
+      "step": 1480
+    },
+    {
+      "epoch": 0.01481,
+      "grad_norm": 0.673205554485321,
+      "learning_rate": 0.003,
+      "loss": 4.2668,
+      "step": 1481
+    },
+    {
+      "epoch": 0.01482,
+      "grad_norm": 0.7312605381011963,
+      "learning_rate": 0.003,
+      "loss": 4.2525,
+      "step": 1482
+    },
+    {
+      "epoch": 0.01483,
+      "grad_norm": 0.7356554269790649,
+      "learning_rate": 0.003,
+      "loss": 4.256,
+      "step": 1483
+    },
+    {
+      "epoch": 0.01484,
+      "grad_norm": 0.6896143555641174,
+      "learning_rate": 0.003,
+      "loss": 4.3028,
+      "step": 1484
+    },
+    {
+      "epoch": 0.01485,
+      "grad_norm": 0.6053803563117981,
+      "learning_rate": 0.003,
+      "loss": 4.2617,
+      "step": 1485
+    },
+    {
+      "epoch": 0.01486,
+      "grad_norm": 0.5422018766403198,
+      "learning_rate": 0.003,
+      "loss": 4.2576,
+      "step": 1486
+    },
+    {
+      "epoch": 0.01487,
+      "grad_norm": 0.584571897983551,
+      "learning_rate": 0.003,
+      "loss": 4.2611,
+      "step": 1487
+    },
+    {
+      "epoch": 0.01488,
+      "grad_norm": 0.58668452501297,
+      "learning_rate": 0.003,
+      "loss": 4.2703,
+      "step": 1488
+    },
+    {
+      "epoch": 0.01489,
+      "grad_norm": 0.5606329441070557,
+      "learning_rate": 0.003,
+      "loss": 4.2644,
+      "step": 1489
+    },
+    {
+      "epoch": 0.0149,
+      "grad_norm": 0.5718882083892822,
+      "learning_rate": 0.003,
+      "loss": 4.2562,
+      "step": 1490
+    },
+    {
+      "epoch": 0.01491,
+      "grad_norm": 0.609704852104187,
+      "learning_rate": 0.003,
+      "loss": 4.2714,
+      "step": 1491
+    },
+    {
+      "epoch": 0.01492,
+      "grad_norm": 0.6093578338623047,
+      "learning_rate": 0.003,
+      "loss": 4.2341,
+      "step": 1492
+    },
+    {
+      "epoch": 0.01493,
+      "grad_norm": 0.6492394208908081,
+      "learning_rate": 0.003,
+      "loss": 4.2659,
+      "step": 1493
+    },
+    {
+      "epoch": 0.01494,
+      "grad_norm": 0.7214344143867493,
+      "learning_rate": 0.003,
+      "loss": 4.2478,
+      "step": 1494
+    },
+    {
+      "epoch": 0.01495,
+      "grad_norm": 0.6858140826225281,
+      "learning_rate": 0.003,
+      "loss": 4.2587,
+      "step": 1495
+    },
+    {
+      "epoch": 0.01496,
+      "grad_norm": 0.6935936808586121,
+      "learning_rate": 0.003,
+      "loss": 4.2367,
+      "step": 1496
+    },
+    {
+      "epoch": 0.01497,
+      "grad_norm": 0.6599563360214233,
+      "learning_rate": 0.003,
+      "loss": 4.2117,
+      "step": 1497
+    },
+    {
+      "epoch": 0.01498,
+      "grad_norm": 0.6300835609436035,
+      "learning_rate": 0.003,
+      "loss": 4.2634,
+      "step": 1498
+    },
+    {
+      "epoch": 0.01499,
+      "grad_norm": 0.721996009349823,
+      "learning_rate": 0.003,
+      "loss": 4.2933,
+      "step": 1499
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.7603088021278381,
+      "learning_rate": 0.003,
+      "loss": 4.2446,
+      "step": 1500
+    },
+    {
+      "epoch": 0.01501,
+      "grad_norm": 0.7853468060493469,
+      "learning_rate": 0.003,
+      "loss": 4.2524,
+      "step": 1501
+    },
+    {
+      "epoch": 0.01502,
+      "grad_norm": 0.6559277772903442,
+      "learning_rate": 0.003,
+      "loss": 4.2862,
+      "step": 1502
+    },
+    {
+      "epoch": 0.01503,
+      "grad_norm": 0.6914763450622559,
+      "learning_rate": 0.003,
+      "loss": 4.2669,
+      "step": 1503
+    },
+    {
+      "epoch": 0.01504,
+      "grad_norm": 0.8051386475563049,
+      "learning_rate": 0.003,
+      "loss": 4.2746,
+      "step": 1504
+    },
+    {
+      "epoch": 0.01505,
+      "grad_norm": 0.7562403082847595,
+      "learning_rate": 0.003,
+      "loss": 4.2564,
+      "step": 1505
+    },
+    {
+      "epoch": 0.01506,
+      "grad_norm": 0.6991645693778992,
+      "learning_rate": 0.003,
+      "loss": 4.2715,
+      "step": 1506
+    },
+    {
+      "epoch": 0.01507,
+      "grad_norm": 0.6562188267707825,
+      "learning_rate": 0.003,
+      "loss": 4.2831,
+      "step": 1507
+    },
+    {
+      "epoch": 0.01508,
+      "grad_norm": 0.6328547596931458,
+      "learning_rate": 0.003,
+      "loss": 4.2611,
+      "step": 1508
+    },
+    {
+      "epoch": 0.01509,
+      "grad_norm": 0.6792221665382385,
+      "learning_rate": 0.003,
+      "loss": 4.2777,
+      "step": 1509
+    },
+    {
+      "epoch": 0.0151,
+      "grad_norm": 0.7432823777198792,
+      "learning_rate": 0.003,
+      "loss": 4.2532,
+      "step": 1510
+    },
+    {
+      "epoch": 0.01511,
+      "grad_norm": 0.8595705628395081,
+      "learning_rate": 0.003,
+      "loss": 4.2519,
+      "step": 1511
+    },
+    {
+      "epoch": 0.01512,
+      "grad_norm": 0.9392295479774475,
+      "learning_rate": 0.003,
+      "loss": 4.2664,
+      "step": 1512
+    },
+    {
+      "epoch": 0.01513,
+      "grad_norm": 0.706947386264801,
+      "learning_rate": 0.003,
+      "loss": 4.2625,
+      "step": 1513
+    },
+    {
+      "epoch": 0.01514,
+      "grad_norm": 0.6648169159889221,
+      "learning_rate": 0.003,
+      "loss": 4.2697,
+      "step": 1514
+    },
+    {
+      "epoch": 0.01515,
+      "grad_norm": 0.7984362244606018,
+      "learning_rate": 0.003,
+      "loss": 4.2642,
+      "step": 1515
+    },
+    {
+      "epoch": 0.01516,
+      "grad_norm": 0.8379718065261841,
+      "learning_rate": 0.003,
+      "loss": 4.2755,
+      "step": 1516
+    },
+    {
+      "epoch": 0.01517,
+      "grad_norm": 0.7840726375579834,
+      "learning_rate": 0.003,
+      "loss": 4.2252,
+      "step": 1517
+    },
+    {
+      "epoch": 0.01518,
+      "grad_norm": 0.7350525856018066,
+      "learning_rate": 0.003,
+      "loss": 4.2609,
+      "step": 1518
+    },
+    {
+      "epoch": 0.01519,
+      "grad_norm": 0.637050449848175,
+      "learning_rate": 0.003,
+      "loss": 4.2905,
+      "step": 1519
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.6365798711776733,
+      "learning_rate": 0.003,
+      "loss": 4.2441,
+      "step": 1520
+    },
+    {
+      "epoch": 0.01521,
+      "grad_norm": 0.5890719890594482,
+      "learning_rate": 0.003,
+      "loss": 4.2552,
+      "step": 1521
+    },
+    {
+      "epoch": 0.01522,
+      "grad_norm": 0.5176184773445129,
+      "learning_rate": 0.003,
+      "loss": 4.2486,
+      "step": 1522
+    },
+    {
+      "epoch": 0.01523,
+      "grad_norm": 0.5967531800270081,
+      "learning_rate": 0.003,
+      "loss": 4.2464,
+      "step": 1523
+    },
+    {
+      "epoch": 0.01524,
+      "grad_norm": 0.57130366563797,
+      "learning_rate": 0.003,
+      "loss": 4.2759,
+      "step": 1524
+    },
+    {
+      "epoch": 0.01525,
+      "grad_norm": 0.68913733959198,
+      "learning_rate": 0.003,
+      "loss": 4.2789,
+      "step": 1525
+    },
+    {
+      "epoch": 0.01526,
+      "grad_norm": 0.6771203279495239,
+      "learning_rate": 0.003,
+      "loss": 4.2725,
+      "step": 1526
+    },
+    {
+      "epoch": 0.01527,
+      "grad_norm": 0.7048394083976746,
+      "learning_rate": 0.003,
+      "loss": 4.2371,
+      "step": 1527
+    },
+    {
+      "epoch": 0.01528,
+      "grad_norm": 0.7917721271514893,
+      "learning_rate": 0.003,
+      "loss": 4.2607,
+      "step": 1528
+    },
+    {
+      "epoch": 0.01529,
+      "grad_norm": 0.7461357116699219,
+      "learning_rate": 0.003,
+      "loss": 4.2575,
+      "step": 1529
+    },
+    {
+      "epoch": 0.0153,
+      "grad_norm": 0.6422540545463562,
+      "learning_rate": 0.003,
+      "loss": 4.2576,
+      "step": 1530
+    },
+    {
+      "epoch": 0.01531,
+      "grad_norm": 0.6448599696159363,
+      "learning_rate": 0.003,
+      "loss": 4.2541,
+      "step": 1531
+    },
+    {
+      "epoch": 0.01532,
+      "grad_norm": 0.556128203868866,
+      "learning_rate": 0.003,
+      "loss": 4.2286,
+      "step": 1532
+    },
+    {
+      "epoch": 0.01533,
+      "grad_norm": 0.511016845703125,
+      "learning_rate": 0.003,
+      "loss": 4.2532,
+      "step": 1533
+    },
+    {
+      "epoch": 0.01534,
+      "grad_norm": 0.4958668351173401,
+      "learning_rate": 0.003,
+      "loss": 4.2504,
+      "step": 1534
+    },
+    {
+      "epoch": 0.01535,
+      "grad_norm": 0.472829669713974,
+      "learning_rate": 0.003,
+      "loss": 4.2611,
+      "step": 1535
+    },
+    {
+      "epoch": 0.01536,
+      "grad_norm": 0.4443574547767639,
+      "learning_rate": 0.003,
+      "loss": 4.2397,
+      "step": 1536
+    },
+    {
+      "epoch": 0.01537,
+      "grad_norm": 0.5031164884567261,
+      "learning_rate": 0.003,
+      "loss": 4.2348,
+      "step": 1537
+    },
+    {
+      "epoch": 0.01538,
+      "grad_norm": 0.6832772493362427,
+      "learning_rate": 0.003,
+      "loss": 4.2195,
+      "step": 1538
+    },
+    {
+      "epoch": 0.01539,
+      "grad_norm": 0.875408947467804,
+      "learning_rate": 0.003,
+      "loss": 4.2638,
+      "step": 1539
+    },
+    {
+      "epoch": 0.0154,
+      "grad_norm": 0.9963869452476501,
+      "learning_rate": 0.003,
+      "loss": 4.2726,
+      "step": 1540
+    },
+    {
+      "epoch": 0.01541,
+      "grad_norm": 0.8254619836807251,
+      "learning_rate": 0.003,
+      "loss": 4.249,
+      "step": 1541
+    },
+    {
+      "epoch": 0.01542,
+      "grad_norm": 0.8195037245750427,
+      "learning_rate": 0.003,
+      "loss": 4.2801,
+      "step": 1542
+    },
+    {
+      "epoch": 0.01543,
+      "grad_norm": 0.9007482528686523,
+      "learning_rate": 0.003,
+      "loss": 4.268,
+      "step": 1543
+    },
+    {
+      "epoch": 0.01544,
+      "grad_norm": 0.8777545690536499,
+      "learning_rate": 0.003,
+      "loss": 4.2622,
+      "step": 1544
+    },
+    {
+      "epoch": 0.01545,
+      "grad_norm": 0.7702150344848633,
+      "learning_rate": 0.003,
+      "loss": 4.3234,
+      "step": 1545
+    },
+    {
+      "epoch": 0.01546,
+      "grad_norm": 0.903275191783905,
+      "learning_rate": 0.003,
+      "loss": 4.2965,
+      "step": 1546
+    },
+    {
+      "epoch": 0.01547,
+      "grad_norm": 1.0503301620483398,
+      "learning_rate": 0.003,
+      "loss": 4.307,
+      "step": 1547
+    },
+    {
+      "epoch": 0.01548,
+      "grad_norm": 0.868294894695282,
+      "learning_rate": 0.003,
+      "loss": 4.2647,
+      "step": 1548
+    },
+    {
+      "epoch": 0.01549,
+      "grad_norm": 0.8206138610839844,
+      "learning_rate": 0.003,
+      "loss": 4.2515,
+      "step": 1549
+    },
+    {
+      "epoch": 0.0155,
+      "grad_norm": 0.9134683012962341,
+      "learning_rate": 0.003,
+      "loss": 4.2824,
+      "step": 1550
+    },
+    {
+      "epoch": 0.01551,
+      "grad_norm": 0.9439716935157776,
+      "learning_rate": 0.003,
+      "loss": 4.2675,
+      "step": 1551
+    },
+    {
+      "epoch": 0.01552,
+      "grad_norm": 0.7804630994796753,
+      "learning_rate": 0.003,
+      "loss": 4.28,
+      "step": 1552
+    },
+    {
+      "epoch": 0.01553,
+      "grad_norm": 0.6432939171791077,
+      "learning_rate": 0.003,
+      "loss": 4.2993,
+      "step": 1553
+    },
+    {
+      "epoch": 0.01554,
+      "grad_norm": 0.6834948062896729,
+      "learning_rate": 0.003,
+      "loss": 4.2822,
+      "step": 1554
+    },
+    {
+      "epoch": 0.01555,
+      "grad_norm": 0.7107034921646118,
+      "learning_rate": 0.003,
+      "loss": 4.2553,
+      "step": 1555
+    },
+    {
+      "epoch": 0.01556,
+      "grad_norm": 0.6976543664932251,
+      "learning_rate": 0.003,
+      "loss": 4.2901,
+      "step": 1556
+    },
+    {
+      "epoch": 0.01557,
+      "grad_norm": 0.6529300808906555,
+      "learning_rate": 0.003,
+      "loss": 4.2623,
+      "step": 1557
+    },
+    {
+      "epoch": 0.01558,
+      "grad_norm": 0.6456514000892639,
+      "learning_rate": 0.003,
+      "loss": 4.2503,
+      "step": 1558
+    },
+    {
+      "epoch": 0.01559,
+      "grad_norm": 0.5664896368980408,
+      "learning_rate": 0.003,
+      "loss": 4.2838,
+      "step": 1559
+    },
+    {
+      "epoch": 0.0156,
+      "grad_norm": 0.5338277816772461,
+      "learning_rate": 0.003,
+      "loss": 4.2719,
+      "step": 1560
+    },
+    {
+      "epoch": 0.01561,
+      "grad_norm": 0.45519402623176575,
+      "learning_rate": 0.003,
+      "loss": 4.2494,
+      "step": 1561
+    },
+    {
+      "epoch": 0.01562,
+      "grad_norm": 0.4456147849559784,
+      "learning_rate": 0.003,
+      "loss": 4.2411,
+      "step": 1562
+    },
+    {
+      "epoch": 0.01563,
+      "grad_norm": 0.3998737335205078,
+      "learning_rate": 0.003,
+      "loss": 4.2455,
+      "step": 1563
+    },
+    {
+      "epoch": 0.01564,
+      "grad_norm": 0.4089747965335846,
+      "learning_rate": 0.003,
+      "loss": 4.2731,
+      "step": 1564
+    },
+    {
+      "epoch": 0.01565,
+      "grad_norm": 0.41704586148262024,
+      "learning_rate": 0.003,
+      "loss": 4.2464,
+      "step": 1565
+    },
+    {
+      "epoch": 0.01566,
+      "grad_norm": 0.4549218416213989,
+      "learning_rate": 0.003,
+      "loss": 4.265,
+      "step": 1566
+    },
+    {
+      "epoch": 0.01567,
+      "grad_norm": 0.45544859766960144,
+      "learning_rate": 0.003,
+      "loss": 4.2568,
+      "step": 1567
+    },
+    {
+      "epoch": 0.01568,
+      "grad_norm": 0.496439129114151,
+      "learning_rate": 0.003,
+      "loss": 4.2394,
+      "step": 1568
+    },
+    {
+      "epoch": 0.01569,
+      "grad_norm": 0.5869415998458862,
+      "learning_rate": 0.003,
+      "loss": 4.2472,
+      "step": 1569
+    },
+    {
+      "epoch": 0.0157,
+      "grad_norm": 0.7246452569961548,
+      "learning_rate": 0.003,
+      "loss": 4.2683,
+      "step": 1570
+    },
+    {
+      "epoch": 0.01571,
+      "grad_norm": 0.807225227355957,
+      "learning_rate": 0.003,
+      "loss": 4.268,
+      "step": 1571
+    },
+    {
+      "epoch": 0.01572,
+      "grad_norm": 0.7089520692825317,
+      "learning_rate": 0.003,
+      "loss": 4.2658,
+      "step": 1572
+    },
+    {
+      "epoch": 0.01573,
+      "grad_norm": 0.7833078503608704,
+      "learning_rate": 0.003,
+      "loss": 4.2544,
+      "step": 1573
+    },
+    {
+      "epoch": 0.01574,
+      "grad_norm": 0.8009607791900635,
+      "learning_rate": 0.003,
+      "loss": 4.2723,
+      "step": 1574
+    },
+    {
+      "epoch": 0.01575,
+      "grad_norm": 0.7152323126792908,
+      "learning_rate": 0.003,
+      "loss": 4.2303,
+      "step": 1575
+    },
+    {
+      "epoch": 0.01576,
+      "grad_norm": 0.8425666093826294,
+      "learning_rate": 0.003,
+      "loss": 4.258,
+      "step": 1576
+    },
+    {
+      "epoch": 0.01577,
+      "grad_norm": 0.9029209613800049,
+      "learning_rate": 0.003,
+      "loss": 4.2227,
+      "step": 1577
+    },
+    {
+      "epoch": 0.01578,
+      "grad_norm": 0.8221943378448486,
+      "learning_rate": 0.003,
+      "loss": 4.2532,
+      "step": 1578
+    },
+    {
+      "epoch": 0.01579,
+      "grad_norm": 0.6562032699584961,
+      "learning_rate": 0.003,
+      "loss": 4.2422,
+      "step": 1579
+    },
+    {
+      "epoch": 0.0158,
+      "grad_norm": 0.6161283254623413,
+      "learning_rate": 0.003,
+      "loss": 4.2534,
+      "step": 1580
+    },
+    {
+      "epoch": 0.01581,
+      "grad_norm": 0.5998492240905762,
+      "learning_rate": 0.003,
+      "loss": 4.2595,
+      "step": 1581
+    },
+    {
+      "epoch": 0.01582,
+      "grad_norm": 0.6605640053749084,
+      "learning_rate": 0.003,
+      "loss": 4.2418,
+      "step": 1582
+    },
+    {
+      "epoch": 0.01583,
+      "grad_norm": 0.6276175379753113,
+      "learning_rate": 0.003,
+      "loss": 4.2293,
+      "step": 1583
+    },
+    {
+      "epoch": 0.01584,
+      "grad_norm": 0.5631775856018066,
+      "learning_rate": 0.003,
+      "loss": 4.2575,
+      "step": 1584
+    },
+    {
+      "epoch": 0.01585,
+      "grad_norm": 0.5975695252418518,
+      "learning_rate": 0.003,
+      "loss": 4.2521,
+      "step": 1585
+    },
+    {
+      "epoch": 0.01586,
+      "grad_norm": 0.7194235324859619,
+      "learning_rate": 0.003,
+      "loss": 4.229,
+      "step": 1586
+    },
+    {
+      "epoch": 0.01587,
+      "grad_norm": 0.7284054160118103,
+      "learning_rate": 0.003,
+      "loss": 4.246,
+      "step": 1587
+    },
+    {
+      "epoch": 0.01588,
+      "grad_norm": 0.6996057629585266,
+      "learning_rate": 0.003,
+      "loss": 4.2728,
+      "step": 1588
+    },
+    {
+      "epoch": 0.01589,
+      "grad_norm": 0.5657826662063599,
+      "learning_rate": 0.003,
+      "loss": 4.2616,
+      "step": 1589
+    },
+    {
+      "epoch": 0.0159,
+      "grad_norm": 0.6453202366828918,
+      "learning_rate": 0.003,
+      "loss": 4.2584,
+      "step": 1590
+    },
+    {
+      "epoch": 0.01591,
+      "grad_norm": 0.5882779955863953,
+      "learning_rate": 0.003,
+      "loss": 4.2627,
+      "step": 1591
+    },
+    {
+      "epoch": 0.01592,
+      "grad_norm": 0.5240422487258911,
+      "learning_rate": 0.003,
+      "loss": 4.2485,
+      "step": 1592
+    },
+    {
+      "epoch": 0.01593,
+      "grad_norm": 0.5395859479904175,
+      "learning_rate": 0.003,
+      "loss": 4.2507,
+      "step": 1593
+    },
+    {
+      "epoch": 0.01594,
+      "grad_norm": 0.5181586742401123,
+      "learning_rate": 0.003,
+      "loss": 4.2388,
+      "step": 1594
+    },
+    {
+      "epoch": 0.01595,
+      "grad_norm": 0.5144811272621155,
+      "learning_rate": 0.003,
+      "loss": 4.2661,
+      "step": 1595
+    },
+    {
+      "epoch": 0.01596,
+      "grad_norm": 0.5639116764068604,
+      "learning_rate": 0.003,
+      "loss": 4.2583,
+      "step": 1596
+    },
+    {
+      "epoch": 0.01597,
+      "grad_norm": 0.616133987903595,
+      "learning_rate": 0.003,
+      "loss": 4.2207,
+      "step": 1597
+    },
+    {
+      "epoch": 0.01598,
+      "grad_norm": 0.7612146735191345,
+      "learning_rate": 0.003,
+      "loss": 4.2233,
+      "step": 1598
+    },
+    {
+      "epoch": 0.01599,
+      "grad_norm": 0.8201577663421631,
+      "learning_rate": 0.003,
+      "loss": 4.2494,
+      "step": 1599
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.7031198143959045,
+      "learning_rate": 0.003,
+      "loss": 4.2122,
+      "step": 1600
+    },
+    {
+      "epoch": 0.01601,
+      "grad_norm": 0.7635224461555481,
+      "learning_rate": 0.003,
+      "loss": 4.2634,
+      "step": 1601
+    },
+    {
+      "epoch": 0.01602,
+      "grad_norm": 0.8013474941253662,
+      "learning_rate": 0.003,
+      "loss": 4.2394,
+      "step": 1602
+    },
+    {
+      "epoch": 0.01603,
+      "grad_norm": 0.7904717326164246,
+      "learning_rate": 0.003,
+      "loss": 4.2591,
+      "step": 1603
+    },
+    {
+      "epoch": 0.01604,
+      "grad_norm": 0.6602920293807983,
+      "learning_rate": 0.003,
+      "loss": 4.2474,
+      "step": 1604
+    },
+    {
+      "epoch": 0.01605,
+      "grad_norm": 0.7017102241516113,
+      "learning_rate": 0.003,
+      "loss": 4.2323,
+      "step": 1605
+    },
+    {
+      "epoch": 0.01606,
+      "grad_norm": 0.9051344394683838,
+      "learning_rate": 0.003,
+      "loss": 4.2411,
+      "step": 1606
+    },
+    {
+      "epoch": 0.01607,
+      "grad_norm": 0.8984688520431519,
+      "learning_rate": 0.003,
+      "loss": 4.2807,
+      "step": 1607
+    },
+    {
+      "epoch": 0.01608,
+      "grad_norm": 0.7925190329551697,
+      "learning_rate": 0.003,
+      "loss": 4.2482,
+      "step": 1608
+    },
+    {
+      "epoch": 0.01609,
+      "grad_norm": 0.8031415939331055,
+      "learning_rate": 0.003,
+      "loss": 4.2542,
+      "step": 1609
+    },
+    {
+      "epoch": 0.0161,
+      "grad_norm": 0.8899487257003784,
+      "learning_rate": 0.003,
+      "loss": 4.2282,
+      "step": 1610
+    },
+    {
+      "epoch": 0.01611,
+      "grad_norm": 0.7361058592796326,
+      "learning_rate": 0.003,
+      "loss": 4.2408,
+      "step": 1611
+    },
+    {
+      "epoch": 0.01612,
+      "grad_norm": 0.6385438442230225,
+      "learning_rate": 0.003,
+      "loss": 4.2281,
+      "step": 1612
+    },
+    {
+      "epoch": 0.01613,
+      "grad_norm": 0.5635811686515808,
+      "learning_rate": 0.003,
+      "loss": 4.2693,
+      "step": 1613
+    },
+    {
+      "epoch": 0.01614,
+      "grad_norm": 0.49793457984924316,
+      "learning_rate": 0.003,
+      "loss": 4.2898,
+      "step": 1614
+    },
+    {
+      "epoch": 0.01615,
+      "grad_norm": 0.5850211381912231,
+      "learning_rate": 0.003,
+      "loss": 4.2593,
+      "step": 1615
+    },
+    {
+      "epoch": 0.01616,
+      "grad_norm": 0.557797908782959,
+      "learning_rate": 0.003,
+      "loss": 4.2378,
+      "step": 1616
+    },
+    {
+      "epoch": 0.01617,
+      "grad_norm": 0.5186588764190674,
+      "learning_rate": 0.003,
+      "loss": 4.226,
+      "step": 1617
+    },
+    {
+      "epoch": 0.01618,
+      "grad_norm": 0.5103740692138672,
+      "learning_rate": 0.003,
+      "loss": 4.2319,
+      "step": 1618
+    },
+    {
+      "epoch": 0.01619,
+      "grad_norm": 0.4262300729751587,
+      "learning_rate": 0.003,
+      "loss": 4.2167,
+      "step": 1619
+    },
+    {
+      "epoch": 0.0162,
+      "grad_norm": 0.43414008617401123,
+      "learning_rate": 0.003,
+      "loss": 4.2243,
+      "step": 1620
+    },
+    {
+      "epoch": 0.01621,
+      "grad_norm": 0.47554972767829895,
+      "learning_rate": 0.003,
+      "loss": 4.2198,
+      "step": 1621
+    },
+    {
+      "epoch": 0.01622,
+      "grad_norm": 0.49989646673202515,
+      "learning_rate": 0.003,
+      "loss": 4.2531,
+      "step": 1622
+    },
+    {
+      "epoch": 0.01623,
+      "grad_norm": 0.514892578125,
+      "learning_rate": 0.003,
+      "loss": 4.2306,
+      "step": 1623
+    },
+    {
+      "epoch": 0.01624,
+      "grad_norm": 0.606673538684845,
+      "learning_rate": 0.003,
+      "loss": 4.2463,
+      "step": 1624
+    },
+    {
+      "epoch": 0.01625,
+      "grad_norm": 0.7891315817832947,
+      "learning_rate": 0.003,
+      "loss": 4.2309,
+      "step": 1625
+    },
+    {
+      "epoch": 0.01626,
+      "grad_norm": 0.9178404808044434,
+      "learning_rate": 0.003,
+      "loss": 4.2593,
+      "step": 1626
+    },
+    {
+      "epoch": 0.01627,
+      "grad_norm": 1.0693062543869019,
+      "learning_rate": 0.003,
+      "loss": 4.2387,
+      "step": 1627
+    },
+    {
+      "epoch": 0.01628,
+      "grad_norm": 0.7644784450531006,
+      "learning_rate": 0.003,
+      "loss": 4.2438,
+      "step": 1628
+    },
+    {
+      "epoch": 0.01629,
+      "grad_norm": 0.7061607241630554,
+      "learning_rate": 0.003,
+      "loss": 4.2389,
+      "step": 1629
+    },
+    {
+      "epoch": 0.0163,
+      "grad_norm": 0.8324339985847473,
+      "learning_rate": 0.003,
+      "loss": 4.2364,
+      "step": 1630
+    },
+    {
+      "epoch": 0.01631,
+      "grad_norm": 0.7869639992713928,
+      "learning_rate": 0.003,
+      "loss": 4.2544,
+      "step": 1631
+    },
+    {
+      "epoch": 0.01632,
+      "grad_norm": 0.7570949792861938,
+      "learning_rate": 0.003,
+      "loss": 4.264,
+      "step": 1632
+    },
+    {
+      "epoch": 0.01633,
+      "grad_norm": 0.8038742542266846,
+      "learning_rate": 0.003,
+      "loss": 4.2404,
+      "step": 1633
+    },
+    {
+      "epoch": 0.01634,
+      "grad_norm": 0.7716854810714722,
+      "learning_rate": 0.003,
+      "loss": 4.2566,
+      "step": 1634
+    },
+    {
+      "epoch": 0.01635,
+      "grad_norm": 0.6350013613700867,
+      "learning_rate": 0.003,
+      "loss": 4.2343,
+      "step": 1635
+    },
+    {
+      "epoch": 0.01636,
+      "grad_norm": 0.5491396188735962,
+      "learning_rate": 0.003,
+      "loss": 4.247,
+      "step": 1636
+    },
+    {
+      "epoch": 0.01637,
+      "grad_norm": 0.5923768281936646,
+      "learning_rate": 0.003,
+      "loss": 4.2317,
+      "step": 1637
+    },
+    {
+      "epoch": 0.01638,
+      "grad_norm": 0.5720195174217224,
+      "learning_rate": 0.003,
+      "loss": 4.2361,
+      "step": 1638
+    },
+    {
+      "epoch": 0.01639,
+      "grad_norm": 0.5404289364814758,
+      "learning_rate": 0.003,
+      "loss": 4.2596,
+      "step": 1639
+    },
+    {
+      "epoch": 0.0164,
+      "grad_norm": 0.5601854920387268,
+      "learning_rate": 0.003,
+      "loss": 4.2389,
+      "step": 1640
+    },
+    {
+      "epoch": 0.01641,
+      "grad_norm": 0.5230163335800171,
+      "learning_rate": 0.003,
+      "loss": 4.212,
+      "step": 1641
+    },
+    {
+      "epoch": 0.01642,
+      "grad_norm": 0.5209521651268005,
+      "learning_rate": 0.003,
+      "loss": 4.2547,
+      "step": 1642
+    },
+    {
+      "epoch": 0.01643,
+      "grad_norm": 0.6072290539741516,
+      "learning_rate": 0.003,
+      "loss": 4.2421,
+      "step": 1643
+    },
+    {
+      "epoch": 0.01644,
+      "grad_norm": 0.6505742073059082,
+      "learning_rate": 0.003,
+      "loss": 4.2612,
+      "step": 1644
+    },
+    {
+      "epoch": 0.01645,
+      "grad_norm": 0.6426966786384583,
+      "learning_rate": 0.003,
+      "loss": 4.2068,
+      "step": 1645
+    },
+    {
+      "epoch": 0.01646,
+      "grad_norm": 0.6992736458778381,
+      "learning_rate": 0.003,
+      "loss": 4.2429,
+      "step": 1646
+    },
+    {
+      "epoch": 0.01647,
+      "grad_norm": 0.6382941603660583,
+      "learning_rate": 0.003,
+      "loss": 4.2378,
+      "step": 1647
+    },
+    {
+      "epoch": 0.01648,
+      "grad_norm": 0.6025996804237366,
+      "learning_rate": 0.003,
+      "loss": 4.2157,
+      "step": 1648
+    },
+    {
+      "epoch": 0.01649,
+      "grad_norm": 0.5261057615280151,
+      "learning_rate": 0.003,
+      "loss": 4.2469,
+      "step": 1649
+    },
+    {
+      "epoch": 0.0165,
+      "grad_norm": 0.6138892769813538,
+      "learning_rate": 0.003,
+      "loss": 4.2268,
+      "step": 1650
+    },
+    {
+      "epoch": 0.01651,
+      "grad_norm": 0.5947723984718323,
+      "learning_rate": 0.003,
+      "loss": 4.2358,
+      "step": 1651
+    },
+    {
+      "epoch": 0.01652,
+      "grad_norm": 0.6178387403488159,
+      "learning_rate": 0.003,
+      "loss": 4.1929,
+      "step": 1652
+    },
+    {
+      "epoch": 0.01653,
+      "grad_norm": 0.6459247469902039,
+      "learning_rate": 0.003,
+      "loss": 4.2379,
+      "step": 1653
+    },
+    {
+      "epoch": 0.01654,
+      "grad_norm": 0.6552532911300659,
+      "learning_rate": 0.003,
+      "loss": 4.2241,
+      "step": 1654
+    },
+    {
+      "epoch": 0.01655,
+      "grad_norm": 0.780878484249115,
+      "learning_rate": 0.003,
+      "loss": 4.2406,
+      "step": 1655
+    },
+    {
+      "epoch": 0.01656,
+      "grad_norm": 0.9652771949768066,
+      "learning_rate": 0.003,
+      "loss": 4.2288,
+      "step": 1656
+    },
+    {
+      "epoch": 0.01657,
+      "grad_norm": 1.1051970720291138,
+      "learning_rate": 0.003,
+      "loss": 4.2586,
+      "step": 1657
+    },
+    {
+      "epoch": 0.01658,
+      "grad_norm": 0.7399364709854126,
+      "learning_rate": 0.003,
+      "loss": 4.2318,
+      "step": 1658
+    },
+    {
+      "epoch": 0.01659,
+      "grad_norm": 0.6876490116119385,
+      "learning_rate": 0.003,
+      "loss": 4.2377,
+      "step": 1659
+    },
+    {
+      "epoch": 0.0166,
+      "grad_norm": 0.8514830470085144,
+      "learning_rate": 0.003,
+      "loss": 4.2247,
+      "step": 1660
+    },
+    {
+      "epoch": 0.01661,
+      "grad_norm": 0.7782738208770752,
+      "learning_rate": 0.003,
+      "loss": 4.2268,
+      "step": 1661
+    },
+    {
+      "epoch": 0.01662,
+      "grad_norm": 0.6898196339607239,
+      "learning_rate": 0.003,
+      "loss": 4.2305,
+      "step": 1662
+    },
+    {
+      "epoch": 0.01663,
+      "grad_norm": 0.6155083179473877,
+      "learning_rate": 0.003,
+      "loss": 4.2454,
+      "step": 1663
+    },
+    {
+      "epoch": 0.01664,
+      "grad_norm": 0.5973721742630005,
+      "learning_rate": 0.003,
+      "loss": 4.2626,
+      "step": 1664
+    },
+    {
+      "epoch": 0.01665,
+      "grad_norm": 0.6198952794075012,
+      "learning_rate": 0.003,
+      "loss": 4.2273,
+      "step": 1665
+    },
+    {
+      "epoch": 0.01666,
+      "grad_norm": 0.5744661092758179,
+      "learning_rate": 0.003,
+      "loss": 4.2297,
+      "step": 1666
+    },
+    {
+      "epoch": 0.01667,
+      "grad_norm": 0.5517554879188538,
+      "learning_rate": 0.003,
+      "loss": 4.241,
+      "step": 1667
+    },
+    {
+      "epoch": 0.01668,
+      "grad_norm": 0.6101126670837402,
+      "learning_rate": 0.003,
+      "loss": 4.2451,
+      "step": 1668
+    },
+    {
+      "epoch": 0.01669,
+      "grad_norm": 0.6228256821632385,
+      "learning_rate": 0.003,
+      "loss": 4.2315,
+      "step": 1669
+    },
+    {
+      "epoch": 0.0167,
+      "grad_norm": 0.5298703908920288,
+      "learning_rate": 0.003,
+      "loss": 4.2179,
+      "step": 1670
+    },
+    {
+      "epoch": 0.01671,
+      "grad_norm": 0.5599346160888672,
+      "learning_rate": 0.003,
+      "loss": 4.2066,
+      "step": 1671
+    },
+    {
+      "epoch": 0.01672,
+      "grad_norm": 0.6326868534088135,
+      "learning_rate": 0.003,
+      "loss": 4.2204,
+      "step": 1672
+    },
+    {
+      "epoch": 0.01673,
+      "grad_norm": 0.566566526889801,
+      "learning_rate": 0.003,
+      "loss": 4.2135,
+      "step": 1673
+    },
+    {
+      "epoch": 0.01674,
+      "grad_norm": 0.5534862875938416,
+      "learning_rate": 0.003,
+      "loss": 4.2339,
+      "step": 1674
+    },
+    {
+      "epoch": 0.01675,
+      "grad_norm": 0.6360026597976685,
+      "learning_rate": 0.003,
+      "loss": 4.2196,
+      "step": 1675
+    },
+    {
+      "epoch": 0.01676,
+      "grad_norm": 0.754280686378479,
+      "learning_rate": 0.003,
+      "loss": 4.2342,
+      "step": 1676
+    },
+    {
+      "epoch": 0.01677,
+      "grad_norm": 0.9305175542831421,
+      "learning_rate": 0.003,
+      "loss": 4.2304,
+      "step": 1677
+    },
+    {
+      "epoch": 0.01678,
+      "grad_norm": 0.9641466736793518,
+      "learning_rate": 0.003,
+      "loss": 4.2519,
+      "step": 1678
+    },
+    {
+      "epoch": 0.01679,
+      "grad_norm": 0.9234570264816284,
+      "learning_rate": 0.003,
+      "loss": 4.2348,
+      "step": 1679
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.8640843629837036,
+      "learning_rate": 0.003,
+      "loss": 4.2518,
+      "step": 1680
+    },
+    {
+      "epoch": 0.01681,
+      "grad_norm": 0.802136242389679,
+      "learning_rate": 0.003,
+      "loss": 4.2519,
+      "step": 1681
+    },
+    {
+      "epoch": 0.01682,
+      "grad_norm": 0.8513818383216858,
+      "learning_rate": 0.003,
+      "loss": 4.2466,
+      "step": 1682
+    },
+    {
+      "epoch": 0.01683,
+      "grad_norm": 0.8904326558113098,
+      "learning_rate": 0.003,
+      "loss": 4.265,
+      "step": 1683
+    },
+    {
+      "epoch": 0.01684,
+      "grad_norm": 0.8687060475349426,
+      "learning_rate": 0.003,
+      "loss": 4.2429,
+      "step": 1684
+    },
+    {
+      "epoch": 0.01685,
+      "grad_norm": 0.8002411127090454,
+      "learning_rate": 0.003,
+      "loss": 4.2528,
+      "step": 1685
+    },
+    {
+      "epoch": 0.01686,
+      "grad_norm": 0.8148955702781677,
+      "learning_rate": 0.003,
+      "loss": 4.2527,
+      "step": 1686
+    },
+    {
+      "epoch": 0.01687,
+      "grad_norm": 0.8790806531906128,
+      "learning_rate": 0.003,
+      "loss": 4.251,
+      "step": 1687
+    },
+    {
+      "epoch": 0.01688,
+      "grad_norm": 0.9963980317115784,
+      "learning_rate": 0.003,
+      "loss": 4.2442,
+      "step": 1688
+    },
+    {
+      "epoch": 0.01689,
+      "grad_norm": 0.9348352551460266,
+      "learning_rate": 0.003,
+      "loss": 4.2605,
+      "step": 1689
+    },
+    {
+      "epoch": 0.0169,
+      "grad_norm": 0.9107605814933777,
+      "learning_rate": 0.003,
+      "loss": 4.2483,
+      "step": 1690
+    },
+    {
+      "epoch": 0.01691,
+      "grad_norm": 0.8572477698326111,
+      "learning_rate": 0.003,
+      "loss": 4.2603,
+      "step": 1691
+    },
+    {
+      "epoch": 0.01692,
+      "grad_norm": 0.8438946008682251,
+      "learning_rate": 0.003,
+      "loss": 4.2653,
+      "step": 1692
+    },
+    {
+      "epoch": 0.01693,
+      "grad_norm": 0.880574107170105,
+      "learning_rate": 0.003,
+      "loss": 4.2867,
+      "step": 1693
+    },
+    {
+      "epoch": 0.01694,
+      "grad_norm": 0.8747463822364807,
+      "learning_rate": 0.003,
+      "loss": 4.2402,
+      "step": 1694
+    },
+    {
+      "epoch": 0.01695,
+      "grad_norm": 0.9774526953697205,
+      "learning_rate": 0.003,
+      "loss": 4.2739,
+      "step": 1695
+    },
+    {
+      "epoch": 0.01696,
+      "grad_norm": 1.0337826013565063,
+      "learning_rate": 0.003,
+      "loss": 4.2798,
+      "step": 1696
+    },
+    {
+      "epoch": 0.01697,
+      "grad_norm": 0.852648138999939,
+      "learning_rate": 0.003,
+      "loss": 4.2511,
+      "step": 1697
+    },
+    {
+      "epoch": 0.01698,
+      "grad_norm": 0.7840266227722168,
+      "learning_rate": 0.003,
+      "loss": 4.2388,
+      "step": 1698
+    },
+    {
+      "epoch": 0.01699,
+      "grad_norm": 0.7520343065261841,
+      "learning_rate": 0.003,
+      "loss": 4.2503,
+      "step": 1699
+    },
+    {
+      "epoch": 0.017,
+      "grad_norm": 0.686873197555542,
+      "learning_rate": 0.003,
+      "loss": 4.2629,
+      "step": 1700
+    },
+    {
+      "epoch": 0.01701,
+      "grad_norm": 0.6256718039512634,
+      "learning_rate": 0.003,
+      "loss": 4.2424,
+      "step": 1701
+    },
+    {
+      "epoch": 0.01702,
+      "grad_norm": 0.5732917785644531,
+      "learning_rate": 0.003,
+      "loss": 4.2618,
+      "step": 1702
+    },
+    {
+      "epoch": 0.01703,
+      "grad_norm": 0.5769745707511902,
+      "learning_rate": 0.003,
+      "loss": 4.2478,
+      "step": 1703
+    },
+    {
+      "epoch": 0.01704,
+      "grad_norm": 0.5204162001609802,
+      "learning_rate": 0.003,
+      "loss": 4.23,
+      "step": 1704
+    },
+    {
+      "epoch": 0.01705,
+      "grad_norm": 0.4713651239871979,
+      "learning_rate": 0.003,
+      "loss": 4.2307,
+      "step": 1705
+    },
+    {
+      "epoch": 0.01706,
+      "grad_norm": 0.3806135058403015,
+      "learning_rate": 0.003,
+      "loss": 4.2216,
+      "step": 1706
+    },
+    {
+      "epoch": 0.01707,
+      "grad_norm": 0.4176587164402008,
+      "learning_rate": 0.003,
+      "loss": 4.2444,
+      "step": 1707
+    },
+    {
+      "epoch": 0.01708,
+      "grad_norm": 0.36861827969551086,
+      "learning_rate": 0.003,
+      "loss": 4.2182,
+      "step": 1708
+    },
+    {
+      "epoch": 0.01709,
+      "grad_norm": 0.35236138105392456,
+      "learning_rate": 0.003,
+      "loss": 4.239,
+      "step": 1709
+    },
+    {
+      "epoch": 0.0171,
+      "grad_norm": 0.311535120010376,
+      "learning_rate": 0.003,
+      "loss": 4.1837,
+      "step": 1710
+    },
+    {
+      "epoch": 0.01711,
+      "grad_norm": 0.3165230453014374,
+      "learning_rate": 0.003,
+      "loss": 4.2135,
+      "step": 1711
+    },
+    {
+      "epoch": 0.01712,
+      "grad_norm": 0.3468313217163086,
+      "learning_rate": 0.003,
+      "loss": 4.2236,
+      "step": 1712
+    },
+    {
+      "epoch": 0.01713,
+      "grad_norm": 0.36829516291618347,
+      "learning_rate": 0.003,
+      "loss": 4.1963,
+      "step": 1713
+    },
+    {
+      "epoch": 0.01714,
+      "grad_norm": 0.37751680612564087,
+      "learning_rate": 0.003,
+      "loss": 4.2329,
+      "step": 1714
+    },
+    {
+      "epoch": 0.01715,
+      "grad_norm": 0.40578410029411316,
+      "learning_rate": 0.003,
+      "loss": 4.2399,
+      "step": 1715
+    },
+    {
+      "epoch": 0.01716,
+      "grad_norm": 0.46360400319099426,
+      "learning_rate": 0.003,
+      "loss": 4.218,
+      "step": 1716
+    },
+    {
+      "epoch": 0.01717,
+      "grad_norm": 0.628455638885498,
+      "learning_rate": 0.003,
+      "loss": 4.2545,
+      "step": 1717
+    },
+    {
+      "epoch": 0.01718,
+      "grad_norm": 0.8051602244377136,
+      "learning_rate": 0.003,
+      "loss": 4.2358,
+      "step": 1718
+    },
+    {
+      "epoch": 0.01719,
+      "grad_norm": 0.8738026022911072,
+      "learning_rate": 0.003,
+      "loss": 4.2571,
+      "step": 1719
+    },
+    {
+      "epoch": 0.0172,
+      "grad_norm": 0.7727829217910767,
+      "learning_rate": 0.003,
+      "loss": 4.2155,
+      "step": 1720
+    },
+    {
+      "epoch": 0.01721,
+      "grad_norm": 0.7172210812568665,
+      "learning_rate": 0.003,
+      "loss": 4.2297,
+      "step": 1721
+    },
+    {
+      "epoch": 0.01722,
+      "grad_norm": 0.7522033452987671,
+      "learning_rate": 0.003,
+      "loss": 4.2262,
+      "step": 1722
+    },
+    {
+      "epoch": 0.01723,
+      "grad_norm": 0.6683304905891418,
+      "learning_rate": 0.003,
+      "loss": 4.2285,
+      "step": 1723
+    },
+    {
+      "epoch": 0.01724,
+      "grad_norm": 0.5735771656036377,
+      "learning_rate": 0.003,
+      "loss": 4.2286,
+      "step": 1724
+    },
+    {
+      "epoch": 0.01725,
+      "grad_norm": 0.6008380651473999,
+      "learning_rate": 0.003,
+      "loss": 4.2128,
+      "step": 1725
+    },
+    {
+      "epoch": 0.01726,
+      "grad_norm": 0.57844078540802,
+      "learning_rate": 0.003,
+      "loss": 4.2055,
+      "step": 1726
+    },
+    {
+      "epoch": 0.01727,
+      "grad_norm": 0.5139830112457275,
+      "learning_rate": 0.003,
+      "loss": 4.2082,
+      "step": 1727
+    },
+    {
+      "epoch": 0.01728,
+      "grad_norm": 0.4524785876274109,
+      "learning_rate": 0.003,
+      "loss": 4.2098,
+      "step": 1728
+    },
+    {
+      "epoch": 0.01729,
+      "grad_norm": 0.40762364864349365,
+      "learning_rate": 0.003,
+      "loss": 4.2149,
+      "step": 1729
+    },
+    {
+      "epoch": 0.0173,
+      "grad_norm": 0.42627114057540894,
+      "learning_rate": 0.003,
+      "loss": 4.1889,
+      "step": 1730
+    },
+    {
+      "epoch": 0.01731,
+      "grad_norm": 0.41734832525253296,
+      "learning_rate": 0.003,
+      "loss": 4.208,
+      "step": 1731
+    },
+    {
+      "epoch": 0.01732,
+      "grad_norm": 0.46209970116615295,
+      "learning_rate": 0.003,
+      "loss": 4.2163,
+      "step": 1732
+    },
+    {
+      "epoch": 0.01733,
+      "grad_norm": 0.5026668906211853,
+      "learning_rate": 0.003,
+      "loss": 4.1988,
+      "step": 1733
+    },
+    {
+      "epoch": 0.01734,
+      "grad_norm": 0.4639485478401184,
+      "learning_rate": 0.003,
+      "loss": 4.2076,
+      "step": 1734
+    },
+    {
+      "epoch": 0.01735,
+      "grad_norm": 0.5392582416534424,
+      "learning_rate": 0.003,
+      "loss": 4.1815,
+      "step": 1735
+    },
+    {
+      "epoch": 0.01736,
+      "grad_norm": 0.6344774961471558,
+      "learning_rate": 0.003,
+      "loss": 4.186,
+      "step": 1736
+    },
+    {
+      "epoch": 0.01737,
+      "grad_norm": 0.6952452063560486,
+      "learning_rate": 0.003,
+      "loss": 4.2203,
+      "step": 1737
+    },
+    {
+      "epoch": 0.01738,
+      "grad_norm": 0.7636409997940063,
+      "learning_rate": 0.003,
+      "loss": 4.2103,
+      "step": 1738
+    },
+    {
+      "epoch": 0.01739,
+      "grad_norm": 0.8100059628486633,
+      "learning_rate": 0.003,
+      "loss": 4.2313,
+      "step": 1739
+    },
+    {
+      "epoch": 0.0174,
+      "grad_norm": 0.7574781775474548,
+      "learning_rate": 0.003,
+      "loss": 4.2287,
+      "step": 1740
+    },
+    {
+      "epoch": 0.01741,
+      "grad_norm": 0.8603562116622925,
+      "learning_rate": 0.003,
+      "loss": 4.2408,
+      "step": 1741
+    },
+    {
+      "epoch": 0.01742,
+      "grad_norm": 1.0610615015029907,
+      "learning_rate": 0.003,
+      "loss": 4.2403,
+      "step": 1742
+    },
+    {
+      "epoch": 0.01743,
+      "grad_norm": 0.9020041823387146,
+      "learning_rate": 0.003,
+      "loss": 4.2526,
+      "step": 1743
+    },
+    {
+      "epoch": 0.01744,
+      "grad_norm": 0.7726603746414185,
+      "learning_rate": 0.003,
+      "loss": 4.2302,
+      "step": 1744
+    },
+    {
+      "epoch": 0.01745,
+      "grad_norm": 0.842292070388794,
+      "learning_rate": 0.003,
+      "loss": 4.1998,
+      "step": 1745
+    },
+    {
+      "epoch": 0.01746,
+      "grad_norm": 0.8620842099189758,
+      "learning_rate": 0.003,
+      "loss": 4.2285,
+      "step": 1746
+    },
+    {
+      "epoch": 0.01747,
+      "grad_norm": 0.8984124660491943,
+      "learning_rate": 0.003,
+      "loss": 4.2309,
+      "step": 1747
+    },
+    {
+      "epoch": 0.01748,
+      "grad_norm": 0.7890809178352356,
+      "learning_rate": 0.003,
+      "loss": 4.2184,
+      "step": 1748
+    },
+    {
+      "epoch": 0.01749,
+      "grad_norm": 0.9051846861839294,
+      "learning_rate": 0.003,
+      "loss": 4.2199,
+      "step": 1749
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.8282995820045471,
+      "learning_rate": 0.003,
+      "loss": 4.2356,
+      "step": 1750
+    },
+    {
+      "epoch": 0.01751,
+      "grad_norm": 0.7147263884544373,
+      "learning_rate": 0.003,
+      "loss": 4.2187,
+      "step": 1751
+    },
+    {
+      "epoch": 0.01752,
+      "grad_norm": 0.8123989701271057,
+      "learning_rate": 0.003,
+      "loss": 4.269,
+      "step": 1752
+    },
+    {
+      "epoch": 0.01753,
+      "grad_norm": 0.8011901378631592,
+      "learning_rate": 0.003,
+      "loss": 4.2178,
+      "step": 1753
+    },
+    {
+      "epoch": 0.01754,
+      "grad_norm": 0.6684347987174988,
+      "learning_rate": 0.003,
+      "loss": 4.2455,
+      "step": 1754
+    },
+    {
+      "epoch": 0.01755,
+      "grad_norm": 0.5351077318191528,
+      "learning_rate": 0.003,
+      "loss": 4.1816,
+      "step": 1755
+    },
+    {
+      "epoch": 0.01756,
+      "grad_norm": 0.5229023098945618,
+      "learning_rate": 0.003,
+      "loss": 4.2561,
+      "step": 1756
+    },
+    {
+      "epoch": 0.01757,
+      "grad_norm": 0.6237030029296875,
+      "learning_rate": 0.003,
+      "loss": 4.2403,
+      "step": 1757
+    },
+    {
+      "epoch": 0.01758,
+      "grad_norm": 0.5928184986114502,
+      "learning_rate": 0.003,
+      "loss": 4.2396,
+      "step": 1758
+    },
+    {
+      "epoch": 0.01759,
+      "grad_norm": 0.5645509362220764,
+      "learning_rate": 0.003,
+      "loss": 4.2107,
+      "step": 1759
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.6238541603088379,
+      "learning_rate": 0.003,
+      "loss": 4.2329,
+      "step": 1760
+    },
+    {
+      "epoch": 0.01761,
+      "grad_norm": 0.8237802982330322,
+      "learning_rate": 0.003,
+      "loss": 4.2266,
+      "step": 1761
+    },
+    {
+      "epoch": 0.01762,
+      "grad_norm": 1.0283610820770264,
+      "learning_rate": 0.003,
+      "loss": 4.2546,
+      "step": 1762
+    },
+    {
+      "epoch": 0.01763,
+      "grad_norm": 0.9172603487968445,
+      "learning_rate": 0.003,
+      "loss": 4.2466,
+      "step": 1763
+    },
+    {
+      "epoch": 0.01764,
+      "grad_norm": 0.7957273721694946,
+      "learning_rate": 0.003,
+      "loss": 4.2112,
+      "step": 1764
+    },
+    {
+      "epoch": 0.01765,
+      "grad_norm": 0.7934739589691162,
+      "learning_rate": 0.003,
+      "loss": 4.2362,
+      "step": 1765
+    },
+    {
+      "epoch": 0.01766,
+      "grad_norm": 0.8055094480514526,
+      "learning_rate": 0.003,
+      "loss": 4.2499,
+      "step": 1766
+    },
+    {
+      "epoch": 0.01767,
+      "grad_norm": 0.7152431011199951,
+      "learning_rate": 0.003,
+      "loss": 4.21,
+      "step": 1767
+    },
+    {
+      "epoch": 0.01768,
+      "grad_norm": 0.6750852465629578,
+      "learning_rate": 0.003,
+      "loss": 4.2147,
+      "step": 1768
+    },
+    {
+      "epoch": 0.01769,
+      "grad_norm": 0.62043297290802,
+      "learning_rate": 0.003,
+      "loss": 4.2126,
+      "step": 1769
+    },
+    {
+      "epoch": 0.0177,
+      "grad_norm": 0.7558618783950806,
+      "learning_rate": 0.003,
+      "loss": 4.2018,
+      "step": 1770
+    },
+    {
+      "epoch": 0.01771,
+      "grad_norm": 0.7306205034255981,
+      "learning_rate": 0.003,
+      "loss": 4.235,
+      "step": 1771
+    },
+    {
+      "epoch": 0.01772,
+      "grad_norm": 0.5451275706291199,
+      "learning_rate": 0.003,
+      "loss": 4.1989,
+      "step": 1772
+    },
+    {
+      "epoch": 0.01773,
+      "grad_norm": 0.5318989157676697,
+      "learning_rate": 0.003,
+      "loss": 4.1925,
+      "step": 1773
+    },
+    {
+      "epoch": 0.01774,
+      "grad_norm": 0.5252489447593689,
+      "learning_rate": 0.003,
+      "loss": 4.2149,
+      "step": 1774
+    },
+    {
+      "epoch": 0.01775,
+      "grad_norm": 0.4652414321899414,
+      "learning_rate": 0.003,
+      "loss": 4.1972,
+      "step": 1775
+    },
+    {
+      "epoch": 0.01776,
+      "grad_norm": 0.4579496681690216,
+      "learning_rate": 0.003,
+      "loss": 4.1905,
+      "step": 1776
+    },
+    {
+      "epoch": 0.01777,
+      "grad_norm": 0.46492844820022583,
+      "learning_rate": 0.003,
+      "loss": 4.2099,
+      "step": 1777
+    },
+    {
+      "epoch": 0.01778,
+      "grad_norm": 0.46276724338531494,
+      "learning_rate": 0.003,
+      "loss": 4.2021,
+      "step": 1778
+    },
+    {
+      "epoch": 0.01779,
+      "grad_norm": 0.6184820532798767,
+      "learning_rate": 0.003,
+      "loss": 4.2227,
+      "step": 1779
+    },
+    {
+      "epoch": 0.0178,
+      "grad_norm": 0.7079681158065796,
+      "learning_rate": 0.003,
+      "loss": 4.1963,
+      "step": 1780
+    },
+    {
+      "epoch": 0.01781,
+      "grad_norm": 0.6819452047348022,
+      "learning_rate": 0.003,
+      "loss": 4.2105,
+      "step": 1781
+    },
+    {
+      "epoch": 0.01782,
+      "grad_norm": 0.6064656376838684,
+      "learning_rate": 0.003,
+      "loss": 4.2325,
+      "step": 1782
+    },
+    {
+      "epoch": 0.01783,
+      "grad_norm": 0.5331481695175171,
+      "learning_rate": 0.003,
+      "loss": 4.2083,
+      "step": 1783
+    },
+    {
+      "epoch": 0.01784,
+      "grad_norm": 0.5265206098556519,
+      "learning_rate": 0.003,
+      "loss": 4.2093,
+      "step": 1784
+    },
+    {
+      "epoch": 0.01785,
+      "grad_norm": 0.5389939546585083,
+      "learning_rate": 0.003,
+      "loss": 4.1711,
+      "step": 1785
+    },
+    {
+      "epoch": 0.01786,
+      "grad_norm": 0.7006595134735107,
+      "learning_rate": 0.003,
+      "loss": 4.2034,
+      "step": 1786
+    },
+    {
+      "epoch": 0.01787,
+      "grad_norm": 0.7858084440231323,
+      "learning_rate": 0.003,
+      "loss": 4.2233,
+      "step": 1787
+    },
+    {
+      "epoch": 0.01788,
+      "grad_norm": 0.7923018336296082,
+      "learning_rate": 0.003,
+      "loss": 4.228,
+      "step": 1788
+    },
+    {
+      "epoch": 0.01789,
+      "grad_norm": 0.593018651008606,
+      "learning_rate": 0.003,
+      "loss": 4.1934,
+      "step": 1789
+    },
+    {
+      "epoch": 0.0179,
+      "grad_norm": 0.6494895815849304,
+      "learning_rate": 0.003,
+      "loss": 4.2138,
+      "step": 1790
+    },
+    {
+      "epoch": 0.01791,
+      "grad_norm": 0.7846472859382629,
+      "learning_rate": 0.003,
+      "loss": 4.2301,
+      "step": 1791
+    },
+    {
+      "epoch": 0.01792,
+      "grad_norm": 0.8576614260673523,
+      "learning_rate": 0.003,
+      "loss": 4.239,
+      "step": 1792
+    },
+    {
+      "epoch": 0.01793,
+      "grad_norm": 0.8833402991294861,
+      "learning_rate": 0.003,
+      "loss": 4.2068,
+      "step": 1793
+    },
+    {
+      "epoch": 0.01794,
+      "grad_norm": 1.0114383697509766,
+      "learning_rate": 0.003,
+      "loss": 4.2273,
+      "step": 1794
+    },
+    {
+      "epoch": 0.01795,
+      "grad_norm": 0.879726231098175,
+      "learning_rate": 0.003,
+      "loss": 4.2331,
+      "step": 1795
+    },
+    {
+      "epoch": 0.01796,
+      "grad_norm": 0.6577330827713013,
+      "learning_rate": 0.003,
+      "loss": 4.2394,
+      "step": 1796
+    },
+    {
+      "epoch": 0.01797,
+      "grad_norm": 0.7508055567741394,
+      "learning_rate": 0.003,
+      "loss": 4.2031,
+      "step": 1797
+    },
+    {
+      "epoch": 0.01798,
+      "grad_norm": 0.7287778854370117,
+      "learning_rate": 0.003,
+      "loss": 4.2178,
+      "step": 1798
+    },
+    {
+      "epoch": 0.01799,
+      "grad_norm": 0.7758158445358276,
+      "learning_rate": 0.003,
+      "loss": 4.2381,
+      "step": 1799
+    },
+    {
+      "epoch": 0.018,
+      "grad_norm": 0.652176558971405,
+      "learning_rate": 0.003,
+      "loss": 4.2204,
+      "step": 1800
+    },
+    {
+      "epoch": 0.01801,
+      "grad_norm": 0.6343629360198975,
+      "learning_rate": 0.003,
+      "loss": 4.2139,
+      "step": 1801
+    },
+    {
+      "epoch": 0.01802,
+      "grad_norm": 0.6776105165481567,
+      "learning_rate": 0.003,
+      "loss": 4.2317,
+      "step": 1802
+    },
+    {
+      "epoch": 0.01803,
+      "grad_norm": 0.7177024483680725,
+      "learning_rate": 0.003,
+      "loss": 4.2423,
+      "step": 1803
+    },
+    {
+      "epoch": 0.01804,
+      "grad_norm": 0.8185440301895142,
+      "learning_rate": 0.003,
+      "loss": 4.2159,
+      "step": 1804
+    },
+    {
+      "epoch": 0.01805,
+      "grad_norm": 0.8301059603691101,
+      "learning_rate": 0.003,
+      "loss": 4.2394,
+      "step": 1805
+    },
+    {
+      "epoch": 0.01806,
+      "grad_norm": 0.7407498359680176,
+      "learning_rate": 0.003,
+      "loss": 4.191,
+      "step": 1806
+    },
+    {
+      "epoch": 0.01807,
+      "grad_norm": 0.7113837003707886,
+      "learning_rate": 0.003,
+      "loss": 4.2277,
+      "step": 1807
+    },
+    {
+      "epoch": 0.01808,
+      "grad_norm": 0.5918108224868774,
+      "learning_rate": 0.003,
+      "loss": 4.2149,
+      "step": 1808
+    },
+    {
+      "epoch": 0.01809,
+      "grad_norm": 0.5566163659095764,
+      "learning_rate": 0.003,
+      "loss": 4.2318,
+      "step": 1809
+    },
+    {
+      "epoch": 0.0181,
+      "grad_norm": 0.4694720506668091,
+      "learning_rate": 0.003,
+      "loss": 4.224,
+      "step": 1810
+    },
+    {
+      "epoch": 0.01811,
+      "grad_norm": 0.45558327436447144,
+      "learning_rate": 0.003,
+      "loss": 4.2076,
+      "step": 1811
+    },
+    {
+      "epoch": 0.01812,
+      "grad_norm": 0.4792705476284027,
+      "learning_rate": 0.003,
+      "loss": 4.1941,
+      "step": 1812
+    },
+    {
+      "epoch": 0.01813,
+      "grad_norm": 0.43152645230293274,
+      "learning_rate": 0.003,
+      "loss": 4.2329,
+      "step": 1813
+    },
+    {
+      "epoch": 0.01814,
+      "grad_norm": 0.4722409248352051,
+      "learning_rate": 0.003,
+      "loss": 4.2001,
+      "step": 1814
+    },
+    {
+      "epoch": 0.01815,
+      "grad_norm": 0.516764760017395,
+      "learning_rate": 0.003,
+      "loss": 4.1882,
+      "step": 1815
+    },
+    {
+      "epoch": 0.01816,
+      "grad_norm": 0.6370428800582886,
+      "learning_rate": 0.003,
+      "loss": 4.2183,
+      "step": 1816
+    },
+    {
+      "epoch": 0.01817,
+      "grad_norm": 0.7442272305488586,
+      "learning_rate": 0.003,
+      "loss": 4.2141,
+      "step": 1817
+    },
+    {
+      "epoch": 0.01818,
+      "grad_norm": 0.7647615671157837,
+      "learning_rate": 0.003,
+      "loss": 4.2145,
+      "step": 1818
+    },
+    {
+      "epoch": 0.01819,
+      "grad_norm": 0.683918833732605,
+      "learning_rate": 0.003,
+      "loss": 4.1944,
+      "step": 1819
+    },
+    {
+      "epoch": 0.0182,
+      "grad_norm": 0.6988905072212219,
+      "learning_rate": 0.003,
+      "loss": 4.2155,
+      "step": 1820
+    },
+    {
+      "epoch": 0.01821,
+      "grad_norm": 0.632123589515686,
+      "learning_rate": 0.003,
+      "loss": 4.1997,
+      "step": 1821
+    },
+    {
+      "epoch": 0.01822,
+      "grad_norm": 0.5962942838668823,
+      "learning_rate": 0.003,
+      "loss": 4.2102,
+      "step": 1822
+    },
+    {
+      "epoch": 0.01823,
+      "grad_norm": 0.5034074783325195,
+      "learning_rate": 0.003,
+      "loss": 4.2224,
+      "step": 1823
+    },
+    {
+      "epoch": 0.01824,
+      "grad_norm": 0.4946107268333435,
+      "learning_rate": 0.003,
+      "loss": 4.2025,
+      "step": 1824
+    },
+    {
+      "epoch": 0.01825,
+      "grad_norm": 0.5331458449363708,
+      "learning_rate": 0.003,
+      "loss": 4.2091,
+      "step": 1825
+    },
+    {
+      "epoch": 0.01826,
+      "grad_norm": 0.4815158247947693,
+      "learning_rate": 0.003,
+      "loss": 4.2164,
+      "step": 1826
+    },
+    {
+      "epoch": 0.01827,
+      "grad_norm": 0.5388825535774231,
+      "learning_rate": 0.003,
+      "loss": 4.1869,
+      "step": 1827
+    },
+    {
+      "epoch": 0.01828,
+      "grad_norm": 0.6578684449195862,
+      "learning_rate": 0.003,
+      "loss": 4.2037,
+      "step": 1828
+    },
+    {
+      "epoch": 0.01829,
+      "grad_norm": 0.7422965168952942,
+      "learning_rate": 0.003,
+      "loss": 4.2232,
+      "step": 1829
+    },
+    {
+      "epoch": 0.0183,
+      "grad_norm": 0.7160323262214661,
+      "learning_rate": 0.003,
+      "loss": 4.2196,
+      "step": 1830
+    },
+    {
+      "epoch": 0.01831,
+      "grad_norm": 0.7727608680725098,
+      "learning_rate": 0.003,
+      "loss": 4.2097,
+      "step": 1831
+    },
+    {
+      "epoch": 0.01832,
+      "grad_norm": 0.9163352847099304,
+      "learning_rate": 0.003,
+      "loss": 4.2196,
+      "step": 1832
+    },
+    {
+      "epoch": 0.01833,
+      "grad_norm": 0.7811264395713806,
+      "learning_rate": 0.003,
+      "loss": 4.2469,
+      "step": 1833
+    },
+    {
+      "epoch": 0.01834,
+      "grad_norm": 0.7502668499946594,
+      "learning_rate": 0.003,
+      "loss": 4.214,
+      "step": 1834
+    },
+    {
+      "epoch": 0.01835,
+      "grad_norm": 0.7814200520515442,
+      "learning_rate": 0.003,
+      "loss": 4.2163,
+      "step": 1835
+    },
+    {
+      "epoch": 0.01836,
+      "grad_norm": 0.7723317742347717,
+      "learning_rate": 0.003,
+      "loss": 4.2127,
+      "step": 1836
+    },
+    {
+      "epoch": 0.01837,
+      "grad_norm": 0.7659316062927246,
+      "learning_rate": 0.003,
+      "loss": 4.2181,
+      "step": 1837
+    },
+    {
+      "epoch": 0.01838,
+      "grad_norm": 0.789987325668335,
+      "learning_rate": 0.003,
+      "loss": 4.2248,
+      "step": 1838
+    },
+    {
+      "epoch": 0.01839,
+      "grad_norm": 0.8428659439086914,
+      "learning_rate": 0.003,
+      "loss": 4.2435,
+      "step": 1839
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.9034631252288818,
+      "learning_rate": 0.003,
+      "loss": 4.2468,
+      "step": 1840
+    },
+    {
+      "epoch": 0.01841,
+      "grad_norm": 0.8657104969024658,
+      "learning_rate": 0.003,
+      "loss": 4.1845,
+      "step": 1841
+    },
+    {
+      "epoch": 0.01842,
+      "grad_norm": 0.8507790565490723,
+      "learning_rate": 0.003,
+      "loss": 4.2207,
+      "step": 1842
+    },
+    {
+      "epoch": 0.01843,
+      "grad_norm": 0.942220151424408,
+      "learning_rate": 0.003,
+      "loss": 4.2062,
+      "step": 1843
+    },
+    {
+      "epoch": 0.01844,
+      "grad_norm": 1.0908007621765137,
+      "learning_rate": 0.003,
+      "loss": 4.2172,
+      "step": 1844
+    },
+    {
+      "epoch": 0.01845,
+      "grad_norm": 1.0858932733535767,
+      "learning_rate": 0.003,
+      "loss": 4.2239,
+      "step": 1845
+    },
+    {
+      "epoch": 0.01846,
+      "grad_norm": 0.8852041959762573,
+      "learning_rate": 0.003,
+      "loss": 4.2518,
+      "step": 1846
+    },
+    {
+      "epoch": 0.01847,
+      "grad_norm": 0.818362295627594,
+      "learning_rate": 0.003,
+      "loss": 4.243,
+      "step": 1847
+    },
+    {
+      "epoch": 0.01848,
+      "grad_norm": 0.8533189296722412,
+      "learning_rate": 0.003,
+      "loss": 4.2588,
+      "step": 1848
+    },
+    {
+      "epoch": 0.01849,
+      "grad_norm": 0.9039400219917297,
+      "learning_rate": 0.003,
+      "loss": 4.2557,
+      "step": 1849
+    },
+    {
+      "epoch": 0.0185,
+      "grad_norm": 0.769622802734375,
+      "learning_rate": 0.003,
+      "loss": 4.2424,
+      "step": 1850
+    },
+    {
+      "epoch": 0.01851,
+      "grad_norm": 0.6524956226348877,
+      "learning_rate": 0.003,
+      "loss": 4.2169,
+      "step": 1851
+    },
+    {
+      "epoch": 0.01852,
+      "grad_norm": 0.6401670575141907,
+      "learning_rate": 0.003,
+      "loss": 4.2199,
+      "step": 1852
+    },
+    {
+      "epoch": 0.01853,
+      "grad_norm": 0.593191921710968,
+      "learning_rate": 0.003,
+      "loss": 4.2181,
+      "step": 1853
+    },
+    {
+      "epoch": 0.01854,
+      "grad_norm": 0.47404181957244873,
+      "learning_rate": 0.003,
+      "loss": 4.2422,
+      "step": 1854
+    },
+    {
+      "epoch": 0.01855,
+      "grad_norm": 0.3873991072177887,
+      "learning_rate": 0.003,
+      "loss": 4.2245,
+      "step": 1855
+    },
+    {
+      "epoch": 0.01856,
+      "grad_norm": 0.3870847821235657,
+      "learning_rate": 0.003,
+      "loss": 4.2074,
+      "step": 1856
+    },
+    {
+      "epoch": 0.01857,
+      "grad_norm": 0.41644003987312317,
+      "learning_rate": 0.003,
+      "loss": 4.19,
+      "step": 1857
+    },
+    {
+      "epoch": 0.01858,
+      "grad_norm": 0.380154550075531,
+      "learning_rate": 0.003,
+      "loss": 4.196,
+      "step": 1858
+    },
+    {
+      "epoch": 0.01859,
+      "grad_norm": 0.39841675758361816,
+      "learning_rate": 0.003,
+      "loss": 4.1921,
+      "step": 1859
+    },
+    {
+      "epoch": 0.0186,
+      "grad_norm": 0.3521161377429962,
+      "learning_rate": 0.003,
+      "loss": 4.1722,
+      "step": 1860
+    },
+    {
+      "epoch": 0.01861,
+      "grad_norm": 0.41872426867485046,
+      "learning_rate": 0.003,
+      "loss": 4.2174,
+      "step": 1861
+    },
+    {
+      "epoch": 0.01862,
+      "grad_norm": 0.47171056270599365,
+      "learning_rate": 0.003,
+      "loss": 4.1831,
+      "step": 1862
+    },
+    {
+      "epoch": 0.01863,
+      "grad_norm": 0.5940234661102295,
+      "learning_rate": 0.003,
+      "loss": 4.1933,
+      "step": 1863
+    },
+    {
+      "epoch": 0.01864,
+      "grad_norm": 0.659092366695404,
+      "learning_rate": 0.003,
+      "loss": 4.2305,
+      "step": 1864
+    },
+    {
+      "epoch": 0.01865,
+      "grad_norm": 0.6307891607284546,
+      "learning_rate": 0.003,
+      "loss": 4.2044,
+      "step": 1865
+    },
+    {
+      "epoch": 0.01866,
+      "grad_norm": 0.46219319105148315,
+      "learning_rate": 0.003,
+      "loss": 4.1993,
+      "step": 1866
+    },
+    {
+      "epoch": 0.01867,
+      "grad_norm": 0.5417385697364807,
+      "learning_rate": 0.003,
+      "loss": 4.1771,
+      "step": 1867
+    },
+    {
+      "epoch": 0.01868,
+      "grad_norm": 0.6495358943939209,
+      "learning_rate": 0.003,
+      "loss": 4.197,
+      "step": 1868
+    },
+    {
+      "epoch": 0.01869,
+      "grad_norm": 0.6564030647277832,
+      "learning_rate": 0.003,
+      "loss": 4.2127,
+      "step": 1869
+    },
+    {
+      "epoch": 0.0187,
+      "grad_norm": 0.6381605267524719,
+      "learning_rate": 0.003,
+      "loss": 4.2029,
+      "step": 1870
+    },
+    {
+      "epoch": 0.01871,
+      "grad_norm": 0.6345733404159546,
+      "learning_rate": 0.003,
+      "loss": 4.1787,
+      "step": 1871
+    },
+    {
+      "epoch": 0.01872,
+      "grad_norm": 0.6442919969558716,
+      "learning_rate": 0.003,
+      "loss": 4.2256,
+      "step": 1872
+    },
+    {
+      "epoch": 0.01873,
+      "grad_norm": 0.6332488656044006,
+      "learning_rate": 0.003,
+      "loss": 4.1849,
+      "step": 1873
+    },
+    {
+      "epoch": 0.01874,
+      "grad_norm": 0.6798062324523926,
+      "learning_rate": 0.003,
+      "loss": 4.191,
+      "step": 1874
+    },
+    {
+      "epoch": 0.01875,
+      "grad_norm": 0.6715080738067627,
+      "learning_rate": 0.003,
+      "loss": 4.2097,
+      "step": 1875
+    },
+    {
+      "epoch": 0.01876,
+      "grad_norm": 0.7180529832839966,
+      "learning_rate": 0.003,
+      "loss": 4.2178,
+      "step": 1876
+    },
+    {
+      "epoch": 0.01877,
+      "grad_norm": 0.663026750087738,
+      "learning_rate": 0.003,
+      "loss": 4.2178,
+      "step": 1877
+    },
+    {
+      "epoch": 0.01878,
+      "grad_norm": 0.7030567526817322,
+      "learning_rate": 0.003,
+      "loss": 4.2041,
+      "step": 1878
+    },
+    {
+      "epoch": 0.01879,
+      "grad_norm": 0.5851782560348511,
+      "learning_rate": 0.003,
+      "loss": 4.1815,
+      "step": 1879
+    },
+    {
+      "epoch": 0.0188,
+      "grad_norm": 0.6897128820419312,
+      "learning_rate": 0.003,
+      "loss": 4.1941,
+      "step": 1880
+    },
+    {
+      "epoch": 0.01881,
+      "grad_norm": 0.7664402723312378,
+      "learning_rate": 0.003,
+      "loss": 4.1954,
+      "step": 1881
+    },
+    {
+      "epoch": 0.01882,
+      "grad_norm": 0.9874089360237122,
+      "learning_rate": 0.003,
+      "loss": 4.2309,
+      "step": 1882
+    },
+    {
+      "epoch": 0.01883,
+      "grad_norm": 1.0450137853622437,
+      "learning_rate": 0.003,
+      "loss": 4.2229,
+      "step": 1883
+    },
+    {
+      "epoch": 0.01884,
+      "grad_norm": 0.8142443895339966,
+      "learning_rate": 0.003,
+      "loss": 4.2104,
+      "step": 1884
+    },
+    {
+      "epoch": 0.01885,
+      "grad_norm": 0.6901335120201111,
+      "learning_rate": 0.003,
+      "loss": 4.1818,
+      "step": 1885
+    },
+    {
+      "epoch": 0.01886,
+      "grad_norm": 0.7235998511314392,
+      "learning_rate": 0.003,
+      "loss": 4.224,
+      "step": 1886
+    },
+    {
+      "epoch": 0.01887,
+      "grad_norm": 0.7312121987342834,
+      "learning_rate": 0.003,
+      "loss": 4.2148,
+      "step": 1887
+    },
+    {
+      "epoch": 0.01888,
+      "grad_norm": 0.772678017616272,
+      "learning_rate": 0.003,
+      "loss": 4.2048,
+      "step": 1888
+    },
+    {
+      "epoch": 0.01889,
+      "grad_norm": 0.7756654024124146,
+      "learning_rate": 0.003,
+      "loss": 4.2176,
+      "step": 1889
+    },
+    {
+      "epoch": 0.0189,
+      "grad_norm": 0.7287212610244751,
+      "learning_rate": 0.003,
+      "loss": 4.1971,
+      "step": 1890
+    },
+    {
+      "epoch": 0.01891,
+      "grad_norm": 0.7076724767684937,
+      "learning_rate": 0.003,
+      "loss": 4.2462,
+      "step": 1891
+    },
+    {
+      "epoch": 0.01892,
+      "grad_norm": 0.6671635508537292,
+      "learning_rate": 0.003,
+      "loss": 4.215,
+      "step": 1892
+    },
+    {
+      "epoch": 0.01893,
+      "grad_norm": 0.6953762769699097,
+      "learning_rate": 0.003,
+      "loss": 4.201,
+      "step": 1893
+    },
+    {
+      "epoch": 0.01894,
+      "grad_norm": 0.5940777063369751,
+      "learning_rate": 0.003,
+      "loss": 4.2018,
+      "step": 1894
+    },
+    {
+      "epoch": 0.01895,
+      "grad_norm": 0.5938246250152588,
+      "learning_rate": 0.003,
+      "loss": 4.1879,
+      "step": 1895
+    },
+    {
+      "epoch": 0.01896,
+      "grad_norm": 0.6224337816238403,
+      "learning_rate": 0.003,
+      "loss": 4.2109,
+      "step": 1896
+    },
+    {
+      "epoch": 0.01897,
+      "grad_norm": 0.5562435388565063,
+      "learning_rate": 0.003,
+      "loss": 4.2071,
+      "step": 1897
+    },
+    {
+      "epoch": 0.01898,
+      "grad_norm": 0.5165086984634399,
+      "learning_rate": 0.003,
+      "loss": 4.1748,
+      "step": 1898
+    },
+    {
+      "epoch": 0.01899,
+      "grad_norm": 0.46164852380752563,
+      "learning_rate": 0.003,
+      "loss": 4.1881,
+      "step": 1899
+    },
+    {
+      "epoch": 0.019,
+      "grad_norm": 0.42392870783805847,
+      "learning_rate": 0.003,
+      "loss": 4.179,
+      "step": 1900
+    },
+    {
+      "epoch": 0.01901,
+      "grad_norm": 0.41818806529045105,
+      "learning_rate": 0.003,
+      "loss": 4.2264,
+      "step": 1901
+    },
+    {
+      "epoch": 0.01902,
+      "grad_norm": 0.5019712448120117,
+      "learning_rate": 0.003,
+      "loss": 4.2029,
+      "step": 1902
+    },
+    {
+      "epoch": 0.01903,
+      "grad_norm": 0.6581493020057678,
+      "learning_rate": 0.003,
+      "loss": 4.1638,
+      "step": 1903
+    },
+    {
+      "epoch": 0.01904,
+      "grad_norm": 0.9066566228866577,
+      "learning_rate": 0.003,
+      "loss": 4.2291,
+      "step": 1904
+    },
+    {
+      "epoch": 0.01905,
+      "grad_norm": 0.9273806214332581,
+      "learning_rate": 0.003,
+      "loss": 4.2144,
+      "step": 1905
+    },
+    {
+      "epoch": 0.01906,
+      "grad_norm": 0.6842753291130066,
+      "learning_rate": 0.003,
+      "loss": 4.2198,
+      "step": 1906
+    },
+    {
+      "epoch": 0.01907,
+      "grad_norm": 0.7864841818809509,
+      "learning_rate": 0.003,
+      "loss": 4.217,
+      "step": 1907
+    },
+    {
+      "epoch": 0.01908,
+      "grad_norm": 0.9169145822525024,
+      "learning_rate": 0.003,
+      "loss": 4.2438,
+      "step": 1908
+    },
+    {
+      "epoch": 0.01909,
+      "grad_norm": 0.7539937496185303,
+      "learning_rate": 0.003,
+      "loss": 4.1979,
+      "step": 1909
+    },
+    {
+      "epoch": 0.0191,
+      "grad_norm": 1.0344467163085938,
+      "learning_rate": 0.003,
+      "loss": 4.2215,
+      "step": 1910
+    },
+    {
+      "epoch": 0.01911,
+      "grad_norm": 0.8631539940834045,
+      "learning_rate": 0.003,
+      "loss": 4.1974,
+      "step": 1911
+    },
+    {
+      "epoch": 0.01912,
+      "grad_norm": 0.691514790058136,
+      "learning_rate": 0.003,
+      "loss": 4.1992,
+      "step": 1912
+    },
+    {
+      "epoch": 0.01913,
+      "grad_norm": 0.6569962501525879,
+      "learning_rate": 0.003,
+      "loss": 4.2167,
+      "step": 1913
+    },
+    {
+      "epoch": 0.01914,
+      "grad_norm": 0.7453441619873047,
+      "learning_rate": 0.003,
+      "loss": 4.2353,
+      "step": 1914
+    },
+    {
+      "epoch": 0.01915,
+      "grad_norm": 0.6612529754638672,
+      "learning_rate": 0.003,
+      "loss": 4.2154,
+      "step": 1915
+    },
+    {
+      "epoch": 0.01916,
+      "grad_norm": 0.5441082119941711,
+      "learning_rate": 0.003,
+      "loss": 4.1923,
+      "step": 1916
+    },
+    {
+      "epoch": 0.01917,
+      "grad_norm": 0.5467451214790344,
+      "learning_rate": 0.003,
+      "loss": 4.2117,
+      "step": 1917
+    },
+    {
+      "epoch": 0.01918,
+      "grad_norm": 0.6295916438102722,
+      "learning_rate": 0.003,
+      "loss": 4.1987,
+      "step": 1918
+    },
+    {
+      "epoch": 0.01919,
+      "grad_norm": 0.6276860237121582,
+      "learning_rate": 0.003,
+      "loss": 4.2016,
+      "step": 1919
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.6265702247619629,
+      "learning_rate": 0.003,
+      "loss": 4.2023,
+      "step": 1920
+    },
+    {
+      "epoch": 0.01921,
+      "grad_norm": 0.6072244644165039,
+      "learning_rate": 0.003,
+      "loss": 4.1924,
+      "step": 1921
+    },
+    {
+      "epoch": 0.01922,
+      "grad_norm": 0.577560305595398,
+      "learning_rate": 0.003,
+      "loss": 4.2062,
+      "step": 1922
+    },
+    {
+      "epoch": 0.01923,
+      "grad_norm": 0.528586208820343,
+      "learning_rate": 0.003,
+      "loss": 4.2098,
+      "step": 1923
+    },
+    {
+      "epoch": 0.01924,
+      "grad_norm": 0.437764436006546,
+      "learning_rate": 0.003,
+      "loss": 4.2065,
+      "step": 1924
+    },
+    {
+      "epoch": 0.01925,
+      "grad_norm": 0.3857106566429138,
+      "learning_rate": 0.003,
+      "loss": 4.1935,
+      "step": 1925
+    },
+    {
+      "epoch": 0.01926,
+      "grad_norm": 0.4072262942790985,
+      "learning_rate": 0.003,
+      "loss": 4.2001,
+      "step": 1926
+    },
+    {
+      "epoch": 0.01927,
+      "grad_norm": 0.5231996774673462,
+      "learning_rate": 0.003,
+      "loss": 4.1922,
+      "step": 1927
+    },
+    {
+      "epoch": 0.01928,
+      "grad_norm": 0.6032196283340454,
+      "learning_rate": 0.003,
+      "loss": 4.1716,
+      "step": 1928
+    },
+    {
+      "epoch": 0.01929,
+      "grad_norm": 0.7323647737503052,
+      "learning_rate": 0.003,
+      "loss": 4.1937,
+      "step": 1929
+    },
+    {
+      "epoch": 0.0193,
+      "grad_norm": 0.7953292727470398,
+      "learning_rate": 0.003,
+      "loss": 4.1919,
+      "step": 1930
+    },
+    {
+      "epoch": 0.01931,
+      "grad_norm": 0.6991308927536011,
+      "learning_rate": 0.003,
+      "loss": 4.1873,
+      "step": 1931
+    },
+    {
+      "epoch": 0.01932,
+      "grad_norm": 0.6141433715820312,
+      "learning_rate": 0.003,
+      "loss": 4.1945,
+      "step": 1932
+    },
+    {
+      "epoch": 0.01933,
+      "grad_norm": 0.6607575416564941,
+      "learning_rate": 0.003,
+      "loss": 4.1875,
+      "step": 1933
+    },
+    {
+      "epoch": 0.01934,
+      "grad_norm": 0.665708065032959,
+      "learning_rate": 0.003,
+      "loss": 4.1913,
+      "step": 1934
+    },
+    {
+      "epoch": 0.01935,
+      "grad_norm": 0.6778123378753662,
+      "learning_rate": 0.003,
+      "loss": 4.2028,
+      "step": 1935
+    },
+    {
+      "epoch": 0.01936,
+      "grad_norm": 0.660306453704834,
+      "learning_rate": 0.003,
+      "loss": 4.188,
+      "step": 1936
+    },
+    {
+      "epoch": 0.01937,
+      "grad_norm": 0.6556859612464905,
+      "learning_rate": 0.003,
+      "loss": 4.201,
+      "step": 1937
+    },
+    {
+      "epoch": 0.01938,
+      "grad_norm": 0.6403252482414246,
+      "learning_rate": 0.003,
+      "loss": 4.1998,
+      "step": 1938
+    },
+    {
+      "epoch": 0.01939,
+      "grad_norm": 0.6520726084709167,
+      "learning_rate": 0.003,
+      "loss": 4.2214,
+      "step": 1939
+    },
+    {
+      "epoch": 0.0194,
+      "grad_norm": 0.6492891907691956,
+      "learning_rate": 0.003,
+      "loss": 4.1979,
+      "step": 1940
+    },
+    {
+      "epoch": 0.01941,
+      "grad_norm": 0.6015598773956299,
+      "learning_rate": 0.003,
+      "loss": 4.18,
+      "step": 1941
+    },
+    {
+      "epoch": 0.01942,
+      "grad_norm": 0.5949015021324158,
+      "learning_rate": 0.003,
+      "loss": 4.2027,
+      "step": 1942
+    },
+    {
+      "epoch": 0.01943,
+      "grad_norm": 0.5752372145652771,
+      "learning_rate": 0.003,
+      "loss": 4.1757,
+      "step": 1943
+    },
+    {
+      "epoch": 0.01944,
+      "grad_norm": 0.5773752331733704,
+      "learning_rate": 0.003,
+      "loss": 4.2158,
+      "step": 1944
+    },
+    {
+      "epoch": 0.01945,
+      "grad_norm": 0.6772210597991943,
+      "learning_rate": 0.003,
+      "loss": 4.1897,
+      "step": 1945
+    },
+    {
+      "epoch": 0.01946,
+      "grad_norm": 0.817459225654602,
+      "learning_rate": 0.003,
+      "loss": 4.1898,
+      "step": 1946
+    },
+    {
+      "epoch": 0.01947,
+      "grad_norm": 0.9603843092918396,
+      "learning_rate": 0.003,
+      "loss": 4.2053,
+      "step": 1947
+    },
+    {
+      "epoch": 0.01948,
+      "grad_norm": 0.9629886746406555,
+      "learning_rate": 0.003,
+      "loss": 4.2304,
+      "step": 1948
+    },
+    {
+      "epoch": 0.01949,
+      "grad_norm": 1.0253958702087402,
+      "learning_rate": 0.003,
+      "loss": 4.2301,
+      "step": 1949
+    },
+    {
+      "epoch": 0.0195,
+      "grad_norm": 1.0213903188705444,
+      "learning_rate": 0.003,
+      "loss": 4.2249,
+      "step": 1950
+    },
+    {
+      "epoch": 0.01951,
+      "grad_norm": 0.7116432189941406,
+      "learning_rate": 0.003,
+      "loss": 4.2054,
+      "step": 1951
+    },
+    {
+      "epoch": 0.01952,
+      "grad_norm": 0.6784964203834534,
+      "learning_rate": 0.003,
+      "loss": 4.2146,
+      "step": 1952
+    },
+    {
+      "epoch": 0.01953,
+      "grad_norm": 0.755315899848938,
+      "learning_rate": 0.003,
+      "loss": 4.2206,
+      "step": 1953
+    },
+    {
+      "epoch": 0.01954,
+      "grad_norm": 0.8843002319335938,
+      "learning_rate": 0.003,
+      "loss": 4.2306,
+      "step": 1954
+    },
+    {
+      "epoch": 0.01955,
+      "grad_norm": 0.9285549521446228,
+      "learning_rate": 0.003,
+      "loss": 4.2258,
+      "step": 1955
+    },
+    {
+      "epoch": 0.01956,
+      "grad_norm": 0.7131898999214172,
+      "learning_rate": 0.003,
+      "loss": 4.2135,
+      "step": 1956
+    },
+    {
+      "epoch": 0.01957,
+      "grad_norm": 0.7296762466430664,
+      "learning_rate": 0.003,
+      "loss": 4.1819,
+      "step": 1957
+    },
+    {
+      "epoch": 0.01958,
+      "grad_norm": 0.76239013671875,
+      "learning_rate": 0.003,
+      "loss": 4.1787,
+      "step": 1958
+    },
+    {
+      "epoch": 0.01959,
+      "grad_norm": 0.7252810001373291,
+      "learning_rate": 0.003,
+      "loss": 4.2032,
+      "step": 1959
+    },
+    {
+      "epoch": 0.0196,
+      "grad_norm": 0.7949764132499695,
+      "learning_rate": 0.003,
+      "loss": 4.1992,
+      "step": 1960
+    },
+    {
+      "epoch": 0.01961,
+      "grad_norm": 0.7223983407020569,
+      "learning_rate": 0.003,
+      "loss": 4.197,
+      "step": 1961
+    },
+    {
+      "epoch": 0.01962,
+      "grad_norm": 0.683202862739563,
+      "learning_rate": 0.003,
+      "loss": 4.1965,
+      "step": 1962
+    },
+    {
+      "epoch": 0.01963,
+      "grad_norm": 0.5978757739067078,
+      "learning_rate": 0.003,
+      "loss": 4.1801,
+      "step": 1963
+    },
+    {
+      "epoch": 0.01964,
+      "grad_norm": 0.6187034249305725,
+      "learning_rate": 0.003,
+      "loss": 4.2107,
+      "step": 1964
+    },
+    {
+      "epoch": 0.01965,
+      "grad_norm": 0.5564098358154297,
+      "learning_rate": 0.003,
+      "loss": 4.1988,
+      "step": 1965
+    },
+    {
+      "epoch": 0.01966,
+      "grad_norm": 0.6056374311447144,
+      "learning_rate": 0.003,
+      "loss": 4.2015,
+      "step": 1966
+    },
+    {
+      "epoch": 0.01967,
+      "grad_norm": 0.5678548812866211,
+      "learning_rate": 0.003,
+      "loss": 4.174,
+      "step": 1967
+    },
+    {
+      "epoch": 0.01968,
+      "grad_norm": 0.5833330750465393,
+      "learning_rate": 0.003,
+      "loss": 4.1938,
+      "step": 1968
+    },
+    {
+      "epoch": 0.01969,
+      "grad_norm": 0.681247889995575,
+      "learning_rate": 0.003,
+      "loss": 4.1923,
+      "step": 1969
+    },
+    {
+      "epoch": 0.0197,
+      "grad_norm": 0.6020233035087585,
+      "learning_rate": 0.003,
+      "loss": 4.1923,
+      "step": 1970
+    },
+    {
+      "epoch": 0.01971,
+      "grad_norm": 0.586289644241333,
+      "learning_rate": 0.003,
+      "loss": 4.1784,
+      "step": 1971
+    },
+    {
+      "epoch": 0.01972,
+      "grad_norm": 0.6138716340065002,
+      "learning_rate": 0.003,
+      "loss": 4.1822,
+      "step": 1972
+    },
+    {
+      "epoch": 0.01973,
+      "grad_norm": 0.6565843820571899,
+      "learning_rate": 0.003,
+      "loss": 4.1649,
+      "step": 1973
+    },
+    {
+      "epoch": 0.01974,
+      "grad_norm": 0.7141625881195068,
+      "learning_rate": 0.003,
+      "loss": 4.2062,
+      "step": 1974
+    },
+    {
+      "epoch": 0.01975,
+      "grad_norm": 0.9425281882286072,
+      "learning_rate": 0.003,
+      "loss": 4.2065,
+      "step": 1975
+    },
+    {
+      "epoch": 0.01976,
+      "grad_norm": 1.02041757106781,
+      "learning_rate": 0.003,
+      "loss": 4.2127,
+      "step": 1976
+    },
+    {
+      "epoch": 0.01977,
+      "grad_norm": 0.7708873748779297,
+      "learning_rate": 0.003,
+      "loss": 4.23,
+      "step": 1977
+    },
+    {
+      "epoch": 0.01978,
+      "grad_norm": 0.6835176348686218,
+      "learning_rate": 0.003,
+      "loss": 4.2285,
+      "step": 1978
+    },
+    {
+      "epoch": 0.01979,
+      "grad_norm": 0.6171081066131592,
+      "learning_rate": 0.003,
+      "loss": 4.2144,
+      "step": 1979
+    },
+    {
+      "epoch": 0.0198,
+      "grad_norm": 0.5269233584403992,
+      "learning_rate": 0.003,
+      "loss": 4.1914,
+      "step": 1980
+    },
+    {
+      "epoch": 0.01981,
+      "grad_norm": 0.5263850092887878,
+      "learning_rate": 0.003,
+      "loss": 4.175,
+      "step": 1981
+    },
+    {
+      "epoch": 0.01982,
+      "grad_norm": 0.50020831823349,
+      "learning_rate": 0.003,
+      "loss": 4.1986,
+      "step": 1982
+    },
+    {
+      "epoch": 0.01983,
+      "grad_norm": 0.5846083760261536,
+      "learning_rate": 0.003,
+      "loss": 4.194,
+      "step": 1983
+    },
+    {
+      "epoch": 0.01984,
+      "grad_norm": 0.6941750645637512,
+      "learning_rate": 0.003,
+      "loss": 4.2073,
+      "step": 1984
+    },
+    {
+      "epoch": 0.01985,
+      "grad_norm": 0.6278181672096252,
+      "learning_rate": 0.003,
+      "loss": 4.1955,
+      "step": 1985
+    },
+    {
+      "epoch": 0.01986,
+      "grad_norm": 0.5164998769760132,
+      "learning_rate": 0.003,
+      "loss": 4.2189,
+      "step": 1986
+    },
+    {
+      "epoch": 0.01987,
+      "grad_norm": 0.6220480799674988,
+      "learning_rate": 0.003,
+      "loss": 4.1777,
+      "step": 1987
+    },
+    {
+      "epoch": 0.01988,
+      "grad_norm": 0.668171226978302,
+      "learning_rate": 0.003,
+      "loss": 4.1924,
+      "step": 1988
+    },
+    {
+      "epoch": 0.01989,
+      "grad_norm": 0.7721487879753113,
+      "learning_rate": 0.003,
+      "loss": 4.1845,
+      "step": 1989
+    },
+    {
+      "epoch": 0.0199,
+      "grad_norm": 0.9032546281814575,
+      "learning_rate": 0.003,
+      "loss": 4.2066,
+      "step": 1990
+    },
+    {
+      "epoch": 0.01991,
+      "grad_norm": 0.9958043098449707,
+      "learning_rate": 0.003,
+      "loss": 4.1939,
+      "step": 1991
+    },
+    {
+      "epoch": 0.01992,
+      "grad_norm": 0.9264521598815918,
+      "learning_rate": 0.003,
+      "loss": 4.2269,
+      "step": 1992
+    },
+    {
+      "epoch": 0.01993,
+      "grad_norm": 0.8177193999290466,
+      "learning_rate": 0.003,
+      "loss": 4.2101,
+      "step": 1993
+    },
+    {
+      "epoch": 0.01994,
+      "grad_norm": 0.6321484446525574,
+      "learning_rate": 0.003,
+      "loss": 4.2233,
+      "step": 1994
+    },
+    {
+      "epoch": 0.01995,
+      "grad_norm": 0.6378874182701111,
+      "learning_rate": 0.003,
+      "loss": 4.216,
+      "step": 1995
+    },
+    {
+      "epoch": 0.01996,
+      "grad_norm": 0.5592439770698547,
+      "learning_rate": 0.003,
+      "loss": 4.1789,
+      "step": 1996
+    },
+    {
+      "epoch": 0.01997,
+      "grad_norm": 0.5063359141349792,
+      "learning_rate": 0.003,
+      "loss": 4.2089,
+      "step": 1997
+    },
+    {
+      "epoch": 0.01998,
+      "grad_norm": 0.498551607131958,
+      "learning_rate": 0.003,
+      "loss": 4.1917,
+      "step": 1998
+    },
+    {
+      "epoch": 0.01999,
+      "grad_norm": 0.5395573377609253,
+      "learning_rate": 0.003,
+      "loss": 4.1836,
+      "step": 1999
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5799160003662109,
+      "learning_rate": 0.003,
+      "loss": 4.1718,
+      "step": 2000
+    },
+    {
+      "epoch": 0.02001,
+      "grad_norm": 0.6742599606513977,
+      "learning_rate": 0.003,
+      "loss": 4.2094,
+      "step": 2001
+    },
+    {
+      "epoch": 0.02002,
+      "grad_norm": 0.616278350353241,
+      "learning_rate": 0.003,
+      "loss": 4.2036,
+      "step": 2002
+    },
+    {
+      "epoch": 0.02003,
+      "grad_norm": 0.3921935260295868,
+      "learning_rate": 0.003,
+      "loss": 4.1952,
+      "step": 2003
+    },
+    {
+      "epoch": 0.02004,
+      "grad_norm": 0.3965532183647156,
+      "learning_rate": 0.003,
+      "loss": 4.1801,
+      "step": 2004
+    },
+    {
+      "epoch": 0.02005,
+      "grad_norm": 0.4591590166091919,
+      "learning_rate": 0.003,
+      "loss": 4.1874,
+      "step": 2005
+    },
+    {
+      "epoch": 0.02006,
+      "grad_norm": 0.43843862414360046,
+      "learning_rate": 0.003,
+      "loss": 4.1833,
+      "step": 2006
+    },
+    {
+      "epoch": 0.02007,
+      "grad_norm": 0.5198654532432556,
+      "learning_rate": 0.003,
+      "loss": 4.2,
+      "step": 2007
+    },
+    {
+      "epoch": 0.02008,
+      "grad_norm": 0.5515919327735901,
+      "learning_rate": 0.003,
+      "loss": 4.1918,
+      "step": 2008
+    },
+    {
+      "epoch": 0.02009,
+      "grad_norm": 0.5351531505584717,
+      "learning_rate": 0.003,
+      "loss": 4.1889,
+      "step": 2009
+    },
+    {
+      "epoch": 0.0201,
+      "grad_norm": 0.5307760834693909,
+      "learning_rate": 0.003,
+      "loss": 4.1996,
+      "step": 2010
+    },
+    {
+      "epoch": 0.02011,
+      "grad_norm": 0.46869173645973206,
+      "learning_rate": 0.003,
+      "loss": 4.183,
+      "step": 2011
+    },
+    {
+      "epoch": 0.02012,
+      "grad_norm": 0.45753300189971924,
+      "learning_rate": 0.003,
+      "loss": 4.1822,
+      "step": 2012
+    },
+    {
+      "epoch": 0.02013,
+      "grad_norm": 0.5114355087280273,
+      "learning_rate": 0.003,
+      "loss": 4.1902,
+      "step": 2013
+    },
+    {
+      "epoch": 0.02014,
+      "grad_norm": 0.62235426902771,
+      "learning_rate": 0.003,
+      "loss": 4.1784,
+      "step": 2014
+    },
+    {
+      "epoch": 0.02015,
+      "grad_norm": 0.8762567043304443,
+      "learning_rate": 0.003,
+      "loss": 4.1818,
+      "step": 2015
+    },
+    {
+      "epoch": 0.02016,
+      "grad_norm": 1.1758772134780884,
+      "learning_rate": 0.003,
+      "loss": 4.2184,
+      "step": 2016
+    },
+    {
+      "epoch": 0.02017,
+      "grad_norm": 0.8140804171562195,
+      "learning_rate": 0.003,
+      "loss": 4.196,
+      "step": 2017
+    },
+    {
+      "epoch": 0.02018,
+      "grad_norm": 0.8363609910011292,
+      "learning_rate": 0.003,
+      "loss": 4.201,
+      "step": 2018
+    },
+    {
+      "epoch": 0.02019,
+      "grad_norm": 0.8395445346832275,
+      "learning_rate": 0.003,
+      "loss": 4.2077,
+      "step": 2019
+    },
+    {
+      "epoch": 0.0202,
+      "grad_norm": 0.7397228479385376,
+      "learning_rate": 0.003,
+      "loss": 4.2085,
+      "step": 2020
+    },
+    {
+      "epoch": 0.02021,
+      "grad_norm": 0.8446124792098999,
+      "learning_rate": 0.003,
+      "loss": 4.2309,
+      "step": 2021
+    },
+    {
+      "epoch": 0.02022,
+      "grad_norm": 0.790037989616394,
+      "learning_rate": 0.003,
+      "loss": 4.1867,
+      "step": 2022
+    },
+    {
+      "epoch": 0.02023,
+      "grad_norm": 0.8900567889213562,
+      "learning_rate": 0.003,
+      "loss": 4.1644,
+      "step": 2023
+    },
+    {
+      "epoch": 0.02024,
+      "grad_norm": 0.8839752078056335,
+      "learning_rate": 0.003,
+      "loss": 4.1963,
+      "step": 2024
+    },
+    {
+      "epoch": 0.02025,
+      "grad_norm": 0.9088333249092102,
+      "learning_rate": 0.003,
+      "loss": 4.2446,
+      "step": 2025
+    },
+    {
+      "epoch": 0.02026,
+      "grad_norm": 0.8618097901344299,
+      "learning_rate": 0.003,
+      "loss": 4.226,
+      "step": 2026
+    },
+    {
+      "epoch": 0.02027,
+      "grad_norm": 0.6961638927459717,
+      "learning_rate": 0.003,
+      "loss": 4.2132,
+      "step": 2027
+    },
+    {
+      "epoch": 0.02028,
+      "grad_norm": 0.7510376572608948,
+      "learning_rate": 0.003,
+      "loss": 4.2002,
+      "step": 2028
+    },
+    {
+      "epoch": 0.02029,
+      "grad_norm": 0.6982995271682739,
+      "learning_rate": 0.003,
+      "loss": 4.2032,
+      "step": 2029
+    },
+    {
+      "epoch": 0.0203,
+      "grad_norm": 0.8095003366470337,
+      "learning_rate": 0.003,
+      "loss": 4.1988,
+      "step": 2030
+    },
+    {
+      "epoch": 0.02031,
+      "grad_norm": 0.8760609030723572,
+      "learning_rate": 0.003,
+      "loss": 4.2121,
+      "step": 2031
+    },
+    {
+      "epoch": 0.02032,
+      "grad_norm": 0.8105490207672119,
+      "learning_rate": 0.003,
+      "loss": 4.1833,
+      "step": 2032
+    },
+    {
+      "epoch": 0.02033,
+      "grad_norm": 0.7400681376457214,
+      "learning_rate": 0.003,
+      "loss": 4.2221,
+      "step": 2033
+    },
+    {
+      "epoch": 0.02034,
+      "grad_norm": 0.8544787168502808,
+      "learning_rate": 0.003,
+      "loss": 4.2195,
+      "step": 2034
+    },
+    {
+      "epoch": 0.02035,
+      "grad_norm": 0.8666634559631348,
+      "learning_rate": 0.003,
+      "loss": 4.2081,
+      "step": 2035
+    },
+    {
+      "epoch": 0.02036,
+      "grad_norm": 0.7516500949859619,
+      "learning_rate": 0.003,
+      "loss": 4.2028,
+      "step": 2036
+    },
+    {
+      "epoch": 0.02037,
+      "grad_norm": 0.6464591026306152,
+      "learning_rate": 0.003,
+      "loss": 4.2133,
+      "step": 2037
+    },
+    {
+      "epoch": 0.02038,
+      "grad_norm": 0.5785326957702637,
+      "learning_rate": 0.003,
+      "loss": 4.1922,
+      "step": 2038
+    },
+    {
+      "epoch": 0.02039,
+      "grad_norm": 0.5690309405326843,
+      "learning_rate": 0.003,
+      "loss": 4.2006,
+      "step": 2039
+    },
+    {
+      "epoch": 0.0204,
+      "grad_norm": 0.47884753346443176,
+      "learning_rate": 0.003,
+      "loss": 4.2009,
+      "step": 2040
+    },
+    {
+      "epoch": 0.02041,
+      "grad_norm": 0.43236783146858215,
+      "learning_rate": 0.003,
+      "loss": 4.1612,
+      "step": 2041
+    },
+    {
+      "epoch": 0.02042,
+      "grad_norm": 0.3718264102935791,
+      "learning_rate": 0.003,
+      "loss": 4.16,
+      "step": 2042
+    },
+    {
+      "epoch": 0.02043,
+      "grad_norm": 0.38901960849761963,
+      "learning_rate": 0.003,
+      "loss": 4.2028,
+      "step": 2043
+    },
+    {
+      "epoch": 0.02044,
+      "grad_norm": 0.391589492559433,
+      "learning_rate": 0.003,
+      "loss": 4.1754,
+      "step": 2044
+    },
+    {
+      "epoch": 0.02045,
+      "grad_norm": 0.35344189405441284,
+      "learning_rate": 0.003,
+      "loss": 4.1881,
+      "step": 2045
+    },
+    {
+      "epoch": 0.02046,
+      "grad_norm": 0.36751314997673035,
+      "learning_rate": 0.003,
+      "loss": 4.1643,
+      "step": 2046
+    },
+    {
+      "epoch": 0.02047,
+      "grad_norm": 0.3489588499069214,
+      "learning_rate": 0.003,
+      "loss": 4.1999,
+      "step": 2047
+    },
+    {
+      "epoch": 0.02048,
+      "grad_norm": 0.46102413535118103,
+      "learning_rate": 0.003,
+      "loss": 4.1826,
+      "step": 2048
+    },
+    {
+      "epoch": 0.02049,
+      "grad_norm": 0.5971166491508484,
+      "learning_rate": 0.003,
+      "loss": 4.2081,
+      "step": 2049
+    },
+    {
+      "epoch": 0.0205,
+      "grad_norm": 0.8196846842765808,
+      "learning_rate": 0.003,
+      "loss": 4.1627,
+      "step": 2050
+    },
+    {
+      "epoch": 0.02051,
+      "grad_norm": 1.046412467956543,
+      "learning_rate": 0.003,
+      "loss": 4.179,
+      "step": 2051
+    },
+    {
+      "epoch": 0.02052,
+      "grad_norm": 0.8385469317436218,
+      "learning_rate": 0.003,
+      "loss": 4.1822,
+      "step": 2052
+    },
+    {
+      "epoch": 0.02053,
+      "grad_norm": 0.7001754641532898,
+      "learning_rate": 0.003,
+      "loss": 4.1758,
+      "step": 2053
+    },
+    {
+      "epoch": 0.02054,
+      "grad_norm": 0.7512269020080566,
+      "learning_rate": 0.003,
+      "loss": 4.2184,
+      "step": 2054
+    },
+    {
+      "epoch": 0.02055,
+      "grad_norm": 0.7145476937294006,
+      "learning_rate": 0.003,
+      "loss": 4.1785,
+      "step": 2055
+    },
+    {
+      "epoch": 0.02056,
+      "grad_norm": 0.5185825824737549,
+      "learning_rate": 0.003,
+      "loss": 4.1667,
+      "step": 2056
+    },
+    {
+      "epoch": 0.02057,
+      "grad_norm": 0.7188176512718201,
+      "learning_rate": 0.003,
+      "loss": 4.2106,
+      "step": 2057
+    },
+    {
+      "epoch": 0.02058,
+      "grad_norm": 0.6570911407470703,
+      "learning_rate": 0.003,
+      "loss": 4.1847,
+      "step": 2058
+    },
+    {
+      "epoch": 0.02059,
+      "grad_norm": 0.6945111155509949,
+      "learning_rate": 0.003,
+      "loss": 4.1692,
+      "step": 2059
+    },
+    {
+      "epoch": 0.0206,
+      "grad_norm": 0.6718025207519531,
+      "learning_rate": 0.003,
+      "loss": 4.2144,
+      "step": 2060
+    },
+    {
+      "epoch": 0.02061,
+      "grad_norm": 0.7464396953582764,
+      "learning_rate": 0.003,
+      "loss": 4.1868,
+      "step": 2061
+    },
+    {
+      "epoch": 0.02062,
+      "grad_norm": 0.8595008850097656,
+      "learning_rate": 0.003,
+      "loss": 4.1982,
+      "step": 2062
+    },
+    {
+      "epoch": 0.02063,
+      "grad_norm": 0.7188435196876526,
+      "learning_rate": 0.003,
+      "loss": 4.186,
+      "step": 2063
+    },
+    {
+      "epoch": 0.02064,
+      "grad_norm": 0.7377521991729736,
+      "learning_rate": 0.003,
+      "loss": 4.1964,
+      "step": 2064
+    },
+    {
+      "epoch": 0.02065,
+      "grad_norm": 0.8290489315986633,
+      "learning_rate": 0.003,
+      "loss": 4.1817,
+      "step": 2065
+    },
+    {
+      "epoch": 0.02066,
+      "grad_norm": 0.8223605155944824,
+      "learning_rate": 0.003,
+      "loss": 4.1986,
+      "step": 2066
+    },
+    {
+      "epoch": 0.02067,
+      "grad_norm": 0.6963474154472351,
+      "learning_rate": 0.003,
+      "loss": 4.1927,
+      "step": 2067
+    },
+    {
+      "epoch": 0.02068,
+      "grad_norm": 0.6809067130088806,
+      "learning_rate": 0.003,
+      "loss": 4.1722,
+      "step": 2068
+    },
+    {
+      "epoch": 0.02069,
+      "grad_norm": 0.6437063813209534,
+      "learning_rate": 0.003,
+      "loss": 4.1953,
+      "step": 2069
+    },
+    {
+      "epoch": 0.0207,
+      "grad_norm": 0.6656236052513123,
+      "learning_rate": 0.003,
+      "loss": 4.1692,
+      "step": 2070
+    },
+    {
+      "epoch": 0.02071,
+      "grad_norm": 0.5361942052841187,
+      "learning_rate": 0.003,
+      "loss": 4.1939,
+      "step": 2071
+    },
+    {
+      "epoch": 0.02072,
+      "grad_norm": 0.5347724556922913,
+      "learning_rate": 0.003,
+      "loss": 4.199,
+      "step": 2072
+    },
+    {
+      "epoch": 0.02073,
+      "grad_norm": 0.5662330985069275,
+      "learning_rate": 0.003,
+      "loss": 4.1792,
+      "step": 2073
+    },
+    {
+      "epoch": 0.02074,
+      "grad_norm": 0.7545875310897827,
+      "learning_rate": 0.003,
+      "loss": 4.1916,
+      "step": 2074
+    },
+    {
+      "epoch": 0.02075,
+      "grad_norm": 0.7776836156845093,
+      "learning_rate": 0.003,
+      "loss": 4.2211,
+      "step": 2075
+    },
+    {
+      "epoch": 0.02076,
+      "grad_norm": 0.665153443813324,
+      "learning_rate": 0.003,
+      "loss": 4.2109,
+      "step": 2076
+    },
+    {
+      "epoch": 0.02077,
+      "grad_norm": 0.6429176926612854,
+      "learning_rate": 0.003,
+      "loss": 4.1758,
+      "step": 2077
+    },
+    {
+      "epoch": 0.02078,
+      "grad_norm": 0.6930047273635864,
+      "learning_rate": 0.003,
+      "loss": 4.1756,
+      "step": 2078
+    },
+    {
+      "epoch": 0.02079,
+      "grad_norm": 0.885644793510437,
+      "learning_rate": 0.003,
+      "loss": 4.1694,
+      "step": 2079
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.8631641864776611,
+      "learning_rate": 0.003,
+      "loss": 4.1873,
+      "step": 2080
+    },
+    {
+      "epoch": 0.02081,
+      "grad_norm": 0.7464285492897034,
+      "learning_rate": 0.003,
+      "loss": 4.2136,
+      "step": 2081
+    },
+    {
+      "epoch": 0.02082,
+      "grad_norm": 0.6033406257629395,
+      "learning_rate": 0.003,
+      "loss": 4.2032,
+      "step": 2082
+    },
+    {
+      "epoch": 0.02083,
+      "grad_norm": 0.5763504505157471,
+      "learning_rate": 0.003,
+      "loss": 4.1974,
+      "step": 2083
+    },
+    {
+      "epoch": 0.02084,
+      "grad_norm": 0.5885117650032043,
+      "learning_rate": 0.003,
+      "loss": 4.1894,
+      "step": 2084
+    },
+    {
+      "epoch": 0.02085,
+      "grad_norm": 0.6575955152511597,
+      "learning_rate": 0.003,
+      "loss": 4.1685,
+      "step": 2085
+    },
+    {
+      "epoch": 0.02086,
+      "grad_norm": 0.7422707080841064,
+      "learning_rate": 0.003,
+      "loss": 4.1768,
+      "step": 2086
+    },
+    {
+      "epoch": 0.02087,
+      "grad_norm": 0.8073996901512146,
+      "learning_rate": 0.003,
+      "loss": 4.183,
+      "step": 2087
+    },
+    {
+      "epoch": 0.02088,
+      "grad_norm": 0.7600259184837341,
+      "learning_rate": 0.003,
+      "loss": 4.2126,
+      "step": 2088
+    },
+    {
+      "epoch": 0.02089,
+      "grad_norm": 0.7548388242721558,
+      "learning_rate": 0.003,
+      "loss": 4.1805,
+      "step": 2089
+    },
+    {
+      "epoch": 0.0209,
+      "grad_norm": 0.6929354667663574,
+      "learning_rate": 0.003,
+      "loss": 4.1946,
+      "step": 2090
+    },
+    {
+      "epoch": 0.02091,
+      "grad_norm": 0.6573588252067566,
+      "learning_rate": 0.003,
+      "loss": 4.1839,
+      "step": 2091
+    },
+    {
+      "epoch": 0.02092,
+      "grad_norm": 0.5934063792228699,
+      "learning_rate": 0.003,
+      "loss": 4.1787,
+      "step": 2092
+    },
+    {
+      "epoch": 0.02093,
+      "grad_norm": 0.5749050974845886,
+      "learning_rate": 0.003,
+      "loss": 4.1452,
+      "step": 2093
+    },
+    {
+      "epoch": 0.02094,
+      "grad_norm": 0.6522166728973389,
+      "learning_rate": 0.003,
+      "loss": 4.1551,
+      "step": 2094
+    },
+    {
+      "epoch": 0.02095,
+      "grad_norm": 0.6819972991943359,
+      "learning_rate": 0.003,
+      "loss": 4.1987,
+      "step": 2095
+    },
+    {
+      "epoch": 0.02096,
+      "grad_norm": 0.8537513613700867,
+      "learning_rate": 0.003,
+      "loss": 4.1954,
+      "step": 2096
+    },
+    {
+      "epoch": 0.02097,
+      "grad_norm": 0.8873555660247803,
+      "learning_rate": 0.003,
+      "loss": 4.1812,
+      "step": 2097
+    },
+    {
+      "epoch": 0.02098,
+      "grad_norm": 0.8346237540245056,
+      "learning_rate": 0.003,
+      "loss": 4.1983,
+      "step": 2098
+    },
+    {
+      "epoch": 0.02099,
+      "grad_norm": 0.7050203084945679,
+      "learning_rate": 0.003,
+      "loss": 4.2048,
+      "step": 2099
+    },
+    {
+      "epoch": 0.021,
+      "grad_norm": 0.7459020614624023,
+      "learning_rate": 0.003,
+      "loss": 4.219,
+      "step": 2100
+    },
+    {
+      "epoch": 0.02101,
+      "grad_norm": 0.6101992726325989,
+      "learning_rate": 0.003,
+      "loss": 4.205,
+      "step": 2101
+    },
+    {
+      "epoch": 0.02102,
+      "grad_norm": 0.5766392350196838,
+      "learning_rate": 0.003,
+      "loss": 4.208,
+      "step": 2102
+    },
+    {
+      "epoch": 0.02103,
+      "grad_norm": 0.4940197765827179,
+      "learning_rate": 0.003,
+      "loss": 4.1645,
+      "step": 2103
+    },
+    {
+      "epoch": 0.02104,
+      "grad_norm": 0.5101218819618225,
+      "learning_rate": 0.003,
+      "loss": 4.1715,
+      "step": 2104
+    },
+    {
+      "epoch": 0.02105,
+      "grad_norm": 0.4358375072479248,
+      "learning_rate": 0.003,
+      "loss": 4.1703,
+      "step": 2105
+    },
+    {
+      "epoch": 0.02106,
+      "grad_norm": 0.4337053596973419,
+      "learning_rate": 0.003,
+      "loss": 4.1631,
+      "step": 2106
+    },
+    {
+      "epoch": 0.02107,
+      "grad_norm": 0.42299988865852356,
+      "learning_rate": 0.003,
+      "loss": 4.1743,
+      "step": 2107
+    },
+    {
+      "epoch": 0.02108,
+      "grad_norm": 0.408107191324234,
+      "learning_rate": 0.003,
+      "loss": 4.1546,
+      "step": 2108
+    },
+    {
+      "epoch": 0.02109,
+      "grad_norm": 0.36309558153152466,
+      "learning_rate": 0.003,
+      "loss": 4.1448,
+      "step": 2109
+    },
+    {
+      "epoch": 0.0211,
+      "grad_norm": 0.44200608134269714,
+      "learning_rate": 0.003,
+      "loss": 4.1607,
+      "step": 2110
+    },
+    {
+      "epoch": 0.02111,
+      "grad_norm": 0.5945785641670227,
+      "learning_rate": 0.003,
+      "loss": 4.1726,
+      "step": 2111
+    },
+    {
+      "epoch": 0.02112,
+      "grad_norm": 0.8337014317512512,
+      "learning_rate": 0.003,
+      "loss": 4.1799,
+      "step": 2112
+    },
+    {
+      "epoch": 0.02113,
+      "grad_norm": 0.9096565246582031,
+      "learning_rate": 0.003,
+      "loss": 4.1882,
+      "step": 2113
+    },
+    {
+      "epoch": 0.02114,
+      "grad_norm": 0.8218700289726257,
+      "learning_rate": 0.003,
+      "loss": 4.1728,
+      "step": 2114
+    },
+    {
+      "epoch": 0.02115,
+      "grad_norm": 0.7097974419593811,
+      "learning_rate": 0.003,
+      "loss": 4.1612,
+      "step": 2115
+    },
+    {
+      "epoch": 0.02116,
+      "grad_norm": 0.7614986300468445,
+      "learning_rate": 0.003,
+      "loss": 4.1951,
+      "step": 2116
+    },
+    {
+      "epoch": 0.02117,
+      "grad_norm": 0.7237206697463989,
+      "learning_rate": 0.003,
+      "loss": 4.1853,
+      "step": 2117
+    },
+    {
+      "epoch": 0.02118,
+      "grad_norm": 0.6130908727645874,
+      "learning_rate": 0.003,
+      "loss": 4.1776,
+      "step": 2118
+    },
+    {
+      "epoch": 0.02119,
+      "grad_norm": 0.5855314135551453,
+      "learning_rate": 0.003,
+      "loss": 4.1889,
+      "step": 2119
+    },
+    {
+      "epoch": 0.0212,
+      "grad_norm": 0.6109283566474915,
+      "learning_rate": 0.003,
+      "loss": 4.1619,
+      "step": 2120
+    },
+    {
+      "epoch": 0.02121,
+      "grad_norm": 0.610881507396698,
+      "learning_rate": 0.003,
+      "loss": 4.158,
+      "step": 2121
+    },
+    {
+      "epoch": 0.02122,
+      "grad_norm": 0.5126116275787354,
+      "learning_rate": 0.003,
+      "loss": 4.1642,
+      "step": 2122
+    },
+    {
+      "epoch": 0.02123,
+      "grad_norm": 0.50606369972229,
+      "learning_rate": 0.003,
+      "loss": 4.1448,
+      "step": 2123
+    },
+    {
+      "epoch": 0.02124,
+      "grad_norm": 0.5823047161102295,
+      "learning_rate": 0.003,
+      "loss": 4.1677,
+      "step": 2124
+    },
+    {
+      "epoch": 0.02125,
+      "grad_norm": 0.6061305403709412,
+      "learning_rate": 0.003,
+      "loss": 4.1848,
+      "step": 2125
+    },
+    {
+      "epoch": 0.02126,
+      "grad_norm": 0.6525632739067078,
+      "learning_rate": 0.003,
+      "loss": 4.1807,
+      "step": 2126
+    },
+    {
+      "epoch": 0.02127,
+      "grad_norm": 0.6677929759025574,
+      "learning_rate": 0.003,
+      "loss": 4.1921,
+      "step": 2127
+    },
+    {
+      "epoch": 0.02128,
+      "grad_norm": 0.6721459031105042,
+      "learning_rate": 0.003,
+      "loss": 4.1793,
+      "step": 2128
+    },
+    {
+      "epoch": 0.02129,
+      "grad_norm": 0.5789133310317993,
+      "learning_rate": 0.003,
+      "loss": 4.1784,
+      "step": 2129
+    },
+    {
+      "epoch": 0.0213,
+      "grad_norm": 0.48718854784965515,
+      "learning_rate": 0.003,
+      "loss": 4.1491,
+      "step": 2130
+    },
+    {
+      "epoch": 0.02131,
+      "grad_norm": 0.5584967136383057,
+      "learning_rate": 0.003,
+      "loss": 4.1797,
+      "step": 2131
+    },
+    {
+      "epoch": 0.02132,
+      "grad_norm": 0.6503726840019226,
+      "learning_rate": 0.003,
+      "loss": 4.1942,
+      "step": 2132
+    },
+    {
+      "epoch": 0.02133,
+      "grad_norm": 0.8396037817001343,
+      "learning_rate": 0.003,
+      "loss": 4.1927,
+      "step": 2133
+    },
+    {
+      "epoch": 0.02134,
+      "grad_norm": 0.9249106645584106,
+      "learning_rate": 0.003,
+      "loss": 4.2046,
+      "step": 2134
+    },
+    {
+      "epoch": 0.02135,
+      "grad_norm": 0.9640594124794006,
+      "learning_rate": 0.003,
+      "loss": 4.1994,
+      "step": 2135
+    },
+    {
+      "epoch": 0.02136,
+      "grad_norm": 0.8782548904418945,
+      "learning_rate": 0.003,
+      "loss": 4.177,
+      "step": 2136
+    },
+    {
+      "epoch": 0.02137,
+      "grad_norm": 0.8113679885864258,
+      "learning_rate": 0.003,
+      "loss": 4.2115,
+      "step": 2137
+    },
+    {
+      "epoch": 0.02138,
+      "grad_norm": 0.8208100199699402,
+      "learning_rate": 0.003,
+      "loss": 4.2211,
+      "step": 2138
+    },
+    {
+      "epoch": 0.02139,
+      "grad_norm": 0.8544081449508667,
+      "learning_rate": 0.003,
+      "loss": 4.2178,
+      "step": 2139
+    },
+    {
+      "epoch": 0.0214,
+      "grad_norm": 0.9318257570266724,
+      "learning_rate": 0.003,
+      "loss": 4.2125,
+      "step": 2140
+    },
+    {
+      "epoch": 0.02141,
+      "grad_norm": 0.9511024951934814,
+      "learning_rate": 0.003,
+      "loss": 4.1925,
+      "step": 2141
+    },
+    {
+      "epoch": 0.02142,
+      "grad_norm": 0.8391996026039124,
+      "learning_rate": 0.003,
+      "loss": 4.1851,
+      "step": 2142
+    },
+    {
+      "epoch": 0.02143,
+      "grad_norm": 0.8300689458847046,
+      "learning_rate": 0.003,
+      "loss": 4.2047,
+      "step": 2143
+    },
+    {
+      "epoch": 0.02144,
+      "grad_norm": 0.7621930837631226,
+      "learning_rate": 0.003,
+      "loss": 4.2055,
+      "step": 2144
+    },
+    {
+      "epoch": 0.02145,
+      "grad_norm": 0.7281203269958496,
+      "learning_rate": 0.003,
+      "loss": 4.1893,
+      "step": 2145
+    },
+    {
+      "epoch": 0.02146,
+      "grad_norm": 0.6785754561424255,
+      "learning_rate": 0.003,
+      "loss": 4.1995,
+      "step": 2146
+    },
+    {
+      "epoch": 0.02147,
+      "grad_norm": 0.6476329565048218,
+      "learning_rate": 0.003,
+      "loss": 4.2144,
+      "step": 2147
+    },
+    {
+      "epoch": 0.02148,
+      "grad_norm": 0.5711135268211365,
+      "learning_rate": 0.003,
+      "loss": 4.2235,
+      "step": 2148
+    },
+    {
+      "epoch": 0.02149,
+      "grad_norm": 0.5608777403831482,
+      "learning_rate": 0.003,
+      "loss": 4.2064,
+      "step": 2149
+    },
+    {
+      "epoch": 0.0215,
+      "grad_norm": 0.5449571013450623,
+      "learning_rate": 0.003,
+      "loss": 4.1918,
+      "step": 2150
+    },
+    {
+      "epoch": 0.02151,
+      "grad_norm": 0.5975837707519531,
+      "learning_rate": 0.003,
+      "loss": 4.1981,
+      "step": 2151
+    },
+    {
+      "epoch": 0.02152,
+      "grad_norm": 0.6778717041015625,
+      "learning_rate": 0.003,
+      "loss": 4.1751,
+      "step": 2152
+    },
+    {
+      "epoch": 0.02153,
+      "grad_norm": 0.6393144726753235,
+      "learning_rate": 0.003,
+      "loss": 4.1858,
+      "step": 2153
+    },
+    {
+      "epoch": 0.02154,
+      "grad_norm": 0.6100119352340698,
+      "learning_rate": 0.003,
+      "loss": 4.1743,
+      "step": 2154
+    },
+    {
+      "epoch": 0.02155,
+      "grad_norm": 0.5750409364700317,
+      "learning_rate": 0.003,
+      "loss": 4.2008,
+      "step": 2155
+    },
+    {
+      "epoch": 0.02156,
+      "grad_norm": 0.6067488789558411,
+      "learning_rate": 0.003,
+      "loss": 4.1875,
+      "step": 2156
+    },
+    {
+      "epoch": 0.02157,
+      "grad_norm": 0.6373497247695923,
+      "learning_rate": 0.003,
+      "loss": 4.1553,
+      "step": 2157
+    },
+    {
+      "epoch": 0.02158,
+      "grad_norm": 0.6033819317817688,
+      "learning_rate": 0.003,
+      "loss": 4.1755,
+      "step": 2158
+    },
+    {
+      "epoch": 0.02159,
+      "grad_norm": 0.5368500351905823,
+      "learning_rate": 0.003,
+      "loss": 4.1852,
+      "step": 2159
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.5477511882781982,
+      "learning_rate": 0.003,
+      "loss": 4.1876,
+      "step": 2160
+    },
+    {
+      "epoch": 0.02161,
+      "grad_norm": 0.6277697086334229,
+      "learning_rate": 0.003,
+      "loss": 4.1773,
+      "step": 2161
+    },
+    {
+      "epoch": 0.02162,
+      "grad_norm": 0.6784037351608276,
+      "learning_rate": 0.003,
+      "loss": 4.1846,
+      "step": 2162
+    },
+    {
+      "epoch": 0.02163,
+      "grad_norm": 0.6868694424629211,
+      "learning_rate": 0.003,
+      "loss": 4.1997,
+      "step": 2163
+    },
+    {
+      "epoch": 0.02164,
+      "grad_norm": 0.7093749642372131,
+      "learning_rate": 0.003,
+      "loss": 4.1944,
+      "step": 2164
+    },
+    {
+      "epoch": 0.02165,
+      "grad_norm": 0.7334204912185669,
+      "learning_rate": 0.003,
+      "loss": 4.1627,
+      "step": 2165
+    },
+    {
+      "epoch": 0.02166,
+      "grad_norm": 0.8240237236022949,
+      "learning_rate": 0.003,
+      "loss": 4.2062,
+      "step": 2166
+    },
+    {
+      "epoch": 0.02167,
+      "grad_norm": 0.8271484375,
+      "learning_rate": 0.003,
+      "loss": 4.1857,
+      "step": 2167
+    },
+    {
+      "epoch": 0.02168,
+      "grad_norm": 0.9684041142463684,
+      "learning_rate": 0.003,
+      "loss": 4.1878,
+      "step": 2168
+    },
+    {
+      "epoch": 0.02169,
+      "grad_norm": 0.9603534936904907,
+      "learning_rate": 0.003,
+      "loss": 4.1956,
+      "step": 2169
+    },
+    {
+      "epoch": 0.0217,
+      "grad_norm": 0.9623264074325562,
+      "learning_rate": 0.003,
+      "loss": 4.2029,
+      "step": 2170
+    },
+    {
+      "epoch": 0.02171,
+      "grad_norm": 0.8587528467178345,
+      "learning_rate": 0.003,
+      "loss": 4.2163,
+      "step": 2171
+    },
+    {
+      "epoch": 0.02172,
+      "grad_norm": 0.7502791285514832,
+      "learning_rate": 0.003,
+      "loss": 4.1819,
+      "step": 2172
+    },
+    {
+      "epoch": 0.02173,
+      "grad_norm": 0.6895846724510193,
+      "learning_rate": 0.003,
+      "loss": 4.1826,
+      "step": 2173
+    },
+    {
+      "epoch": 0.02174,
+      "grad_norm": 0.6692303419113159,
+      "learning_rate": 0.003,
+      "loss": 4.1793,
+      "step": 2174
+    },
+    {
+      "epoch": 0.02175,
+      "grad_norm": 0.6576927900314331,
+      "learning_rate": 0.003,
+      "loss": 4.2023,
+      "step": 2175
+    },
+    {
+      "epoch": 0.02176,
+      "grad_norm": 0.6110131740570068,
+      "learning_rate": 0.003,
+      "loss": 4.1815,
+      "step": 2176
+    },
+    {
+      "epoch": 0.02177,
+      "grad_norm": 0.5583458542823792,
+      "learning_rate": 0.003,
+      "loss": 4.1855,
+      "step": 2177
+    },
+    {
+      "epoch": 0.02178,
+      "grad_norm": 0.5579090118408203,
+      "learning_rate": 0.003,
+      "loss": 4.1655,
+      "step": 2178
+    },
+    {
+      "epoch": 0.02179,
+      "grad_norm": 0.54999840259552,
+      "learning_rate": 0.003,
+      "loss": 4.1765,
+      "step": 2179
+    },
+    {
+      "epoch": 0.0218,
+      "grad_norm": 0.5893214344978333,
+      "learning_rate": 0.003,
+      "loss": 4.193,
+      "step": 2180
+    },
+    {
+      "epoch": 0.02181,
+      "grad_norm": 0.5285120606422424,
+      "learning_rate": 0.003,
+      "loss": 4.19,
+      "step": 2181
+    },
+    {
+      "epoch": 0.02182,
+      "grad_norm": 0.537371814250946,
+      "learning_rate": 0.003,
+      "loss": 4.1755,
+      "step": 2182
+    },
+    {
+      "epoch": 0.02183,
+      "grad_norm": 0.7419714331626892,
+      "learning_rate": 0.003,
+      "loss": 4.1927,
+      "step": 2183
+    },
+    {
+      "epoch": 0.02184,
+      "grad_norm": 0.8745968341827393,
+      "learning_rate": 0.003,
+      "loss": 4.1753,
+      "step": 2184
+    },
+    {
+      "epoch": 0.02185,
+      "grad_norm": 0.8310950398445129,
+      "learning_rate": 0.003,
+      "loss": 4.1708,
+      "step": 2185
+    },
+    {
+      "epoch": 0.02186,
+      "grad_norm": 0.6509782075881958,
+      "learning_rate": 0.003,
+      "loss": 4.1821,
+      "step": 2186
+    },
+    {
+      "epoch": 0.02187,
+      "grad_norm": 0.6584124565124512,
+      "learning_rate": 0.003,
+      "loss": 4.1801,
+      "step": 2187
+    },
+    {
+      "epoch": 0.02188,
+      "grad_norm": 0.6546313762664795,
+      "learning_rate": 0.003,
+      "loss": 4.1586,
+      "step": 2188
+    },
+    {
+      "epoch": 0.02189,
+      "grad_norm": 0.6025269031524658,
+      "learning_rate": 0.003,
+      "loss": 4.1873,
+      "step": 2189
+    },
+    {
+      "epoch": 0.0219,
+      "grad_norm": 0.5339919924736023,
+      "learning_rate": 0.003,
+      "loss": 4.1761,
+      "step": 2190
+    },
+    {
+      "epoch": 0.02191,
+      "grad_norm": 0.5982394814491272,
+      "learning_rate": 0.003,
+      "loss": 4.1693,
+      "step": 2191
+    },
+    {
+      "epoch": 0.02192,
+      "grad_norm": 0.6466907262802124,
+      "learning_rate": 0.003,
+      "loss": 4.1981,
+      "step": 2192
+    },
+    {
+      "epoch": 0.02193,
+      "grad_norm": 0.6886336207389832,
+      "learning_rate": 0.003,
+      "loss": 4.1664,
+      "step": 2193
+    },
+    {
+      "epoch": 0.02194,
+      "grad_norm": 0.6955612897872925,
+      "learning_rate": 0.003,
+      "loss": 4.2136,
+      "step": 2194
+    },
+    {
+      "epoch": 0.02195,
+      "grad_norm": 0.634350061416626,
+      "learning_rate": 0.003,
+      "loss": 4.1789,
+      "step": 2195
+    },
+    {
+      "epoch": 0.02196,
+      "grad_norm": 0.6464403867721558,
+      "learning_rate": 0.003,
+      "loss": 4.1877,
+      "step": 2196
+    },
+    {
+      "epoch": 0.02197,
+      "grad_norm": 0.7355444431304932,
+      "learning_rate": 0.003,
+      "loss": 4.1843,
+      "step": 2197
+    },
+    {
+      "epoch": 0.02198,
+      "grad_norm": 0.6861805319786072,
+      "learning_rate": 0.003,
+      "loss": 4.1732,
+      "step": 2198
+    },
+    {
+      "epoch": 0.02199,
+      "grad_norm": 0.5390181541442871,
+      "learning_rate": 0.003,
+      "loss": 4.1372,
+      "step": 2199
+    },
+    {
+      "epoch": 0.022,
+      "grad_norm": 0.5449665188789368,
+      "learning_rate": 0.003,
+      "loss": 4.1658,
+      "step": 2200
+    },
+    {
+      "epoch": 0.02201,
+      "grad_norm": 0.5550421476364136,
+      "learning_rate": 0.003,
+      "loss": 4.1819,
+      "step": 2201
+    },
+    {
+      "epoch": 0.02202,
+      "grad_norm": 0.7112802863121033,
+      "learning_rate": 0.003,
+      "loss": 4.1711,
+      "step": 2202
+    },
+    {
+      "epoch": 0.02203,
+      "grad_norm": 0.806663990020752,
+      "learning_rate": 0.003,
+      "loss": 4.2082,
+      "step": 2203
+    },
+    {
+      "epoch": 0.02204,
+      "grad_norm": 0.7858209013938904,
+      "learning_rate": 0.003,
+      "loss": 4.1998,
+      "step": 2204
+    },
+    {
+      "epoch": 0.02205,
+      "grad_norm": 0.782494306564331,
+      "learning_rate": 0.003,
+      "loss": 4.2034,
+      "step": 2205
+    },
+    {
+      "epoch": 0.02206,
+      "grad_norm": 0.6905097365379333,
+      "learning_rate": 0.003,
+      "loss": 4.1663,
+      "step": 2206
+    },
+    {
+      "epoch": 0.02207,
+      "grad_norm": 0.7511074542999268,
+      "learning_rate": 0.003,
+      "loss": 4.1796,
+      "step": 2207
+    },
+    {
+      "epoch": 0.02208,
+      "grad_norm": 0.801884651184082,
+      "learning_rate": 0.003,
+      "loss": 4.1654,
+      "step": 2208
+    },
+    {
+      "epoch": 0.02209,
+      "grad_norm": 0.7684618234634399,
+      "learning_rate": 0.003,
+      "loss": 4.1791,
+      "step": 2209
+    },
+    {
+      "epoch": 0.0221,
+      "grad_norm": 0.7751032114028931,
+      "learning_rate": 0.003,
+      "loss": 4.1653,
+      "step": 2210
+    },
+    {
+      "epoch": 0.02211,
+      "grad_norm": 0.7961976528167725,
+      "learning_rate": 0.003,
+      "loss": 4.1777,
+      "step": 2211
+    },
+    {
+      "epoch": 0.02212,
+      "grad_norm": 0.694131076335907,
+      "learning_rate": 0.003,
+      "loss": 4.174,
+      "step": 2212
+    },
+    {
+      "epoch": 0.02213,
+      "grad_norm": 0.731791079044342,
+      "learning_rate": 0.003,
+      "loss": 4.1834,
+      "step": 2213
+    },
+    {
+      "epoch": 0.02214,
+      "grad_norm": 0.5831606388092041,
+      "learning_rate": 0.003,
+      "loss": 4.171,
+      "step": 2214
+    },
+    {
+      "epoch": 0.02215,
+      "grad_norm": 0.6400359869003296,
+      "learning_rate": 0.003,
+      "loss": 4.1778,
+      "step": 2215
+    },
+    {
+      "epoch": 0.02216,
+      "grad_norm": 0.6402711868286133,
+      "learning_rate": 0.003,
+      "loss": 4.156,
+      "step": 2216
+    },
+    {
+      "epoch": 0.02217,
+      "grad_norm": 0.5657291412353516,
+      "learning_rate": 0.003,
+      "loss": 4.153,
+      "step": 2217
+    },
+    {
+      "epoch": 0.02218,
+      "grad_norm": 0.5510069727897644,
+      "learning_rate": 0.003,
+      "loss": 4.1828,
+      "step": 2218
+    },
+    {
+      "epoch": 0.02219,
+      "grad_norm": 0.567196786403656,
+      "learning_rate": 0.003,
+      "loss": 4.173,
+      "step": 2219
+    },
+    {
+      "epoch": 0.0222,
+      "grad_norm": 0.5382395386695862,
+      "learning_rate": 0.003,
+      "loss": 4.1691,
+      "step": 2220
+    },
+    {
+      "epoch": 0.02221,
+      "grad_norm": 0.5838103890419006,
+      "learning_rate": 0.003,
+      "loss": 4.1595,
+      "step": 2221
+    },
+    {
+      "epoch": 0.02222,
+      "grad_norm": 0.6669290661811829,
+      "learning_rate": 0.003,
+      "loss": 4.1608,
+      "step": 2222
+    },
+    {
+      "epoch": 0.02223,
+      "grad_norm": 0.8421687483787537,
+      "learning_rate": 0.003,
+      "loss": 4.1833,
+      "step": 2223
+    },
+    {
+      "epoch": 0.02224,
+      "grad_norm": 1.0844712257385254,
+      "learning_rate": 0.003,
+      "loss": 4.1789,
+      "step": 2224
+    },
+    {
+      "epoch": 0.02225,
+      "grad_norm": 0.7447740435600281,
+      "learning_rate": 0.003,
+      "loss": 4.1706,
+      "step": 2225
+    },
+    {
+      "epoch": 0.02226,
+      "grad_norm": 0.6103411912918091,
+      "learning_rate": 0.003,
+      "loss": 4.184,
+      "step": 2226
+    },
+    {
+      "epoch": 0.02227,
+      "grad_norm": 0.6984069347381592,
+      "learning_rate": 0.003,
+      "loss": 4.1806,
+      "step": 2227
+    },
+    {
+      "epoch": 0.02228,
+      "grad_norm": 0.6870824694633484,
+      "learning_rate": 0.003,
+      "loss": 4.1674,
+      "step": 2228
+    },
+    {
+      "epoch": 0.02229,
+      "grad_norm": 0.6159563064575195,
+      "learning_rate": 0.003,
+      "loss": 4.1446,
+      "step": 2229
+    },
+    {
+      "epoch": 0.0223,
+      "grad_norm": 0.6422266364097595,
+      "learning_rate": 0.003,
+      "loss": 4.1847,
+      "step": 2230
+    },
+    {
+      "epoch": 0.02231,
+      "grad_norm": 0.6304430961608887,
+      "learning_rate": 0.003,
+      "loss": 4.1891,
+      "step": 2231
+    },
+    {
+      "epoch": 0.02232,
+      "grad_norm": 0.6334310173988342,
+      "learning_rate": 0.003,
+      "loss": 4.1933,
+      "step": 2232
+    },
+    {
+      "epoch": 0.02233,
+      "grad_norm": 0.729522705078125,
+      "learning_rate": 0.003,
+      "loss": 4.1662,
+      "step": 2233
+    },
+    {
+      "epoch": 0.02234,
+      "grad_norm": 0.839550256729126,
+      "learning_rate": 0.003,
+      "loss": 4.1911,
+      "step": 2234
+    },
+    {
+      "epoch": 0.02235,
+      "grad_norm": 0.8429378271102905,
+      "learning_rate": 0.003,
+      "loss": 4.2154,
+      "step": 2235
+    },
+    {
+      "epoch": 0.02236,
+      "grad_norm": 0.7225827574729919,
+      "learning_rate": 0.003,
+      "loss": 4.1659,
+      "step": 2236
+    },
+    {
+      "epoch": 0.02237,
+      "grad_norm": 0.6989867091178894,
+      "learning_rate": 0.003,
+      "loss": 4.1975,
+      "step": 2237
+    },
+    {
+      "epoch": 0.02238,
+      "grad_norm": 0.772613525390625,
+      "learning_rate": 0.003,
+      "loss": 4.1837,
+      "step": 2238
+    },
+    {
+      "epoch": 0.02239,
+      "grad_norm": 0.7962609529495239,
+      "learning_rate": 0.003,
+      "loss": 4.1688,
+      "step": 2239
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.7535995244979858,
+      "learning_rate": 0.003,
+      "loss": 4.1562,
+      "step": 2240
+    },
+    {
+      "epoch": 0.02241,
+      "grad_norm": 0.8866783976554871,
+      "learning_rate": 0.003,
+      "loss": 4.1902,
+      "step": 2241
+    },
+    {
+      "epoch": 0.02242,
+      "grad_norm": 1.0102195739746094,
+      "learning_rate": 0.003,
+      "loss": 4.1875,
+      "step": 2242
+    },
+    {
+      "epoch": 0.02243,
+      "grad_norm": 1.013528823852539,
+      "learning_rate": 0.003,
+      "loss": 4.2245,
+      "step": 2243
+    },
+    {
+      "epoch": 0.02244,
+      "grad_norm": 0.8773812055587769,
+      "learning_rate": 0.003,
+      "loss": 4.1919,
+      "step": 2244
+    },
+    {
+      "epoch": 0.02245,
+      "grad_norm": 0.8694789409637451,
+      "learning_rate": 0.003,
+      "loss": 4.182,
+      "step": 2245
+    },
+    {
+      "epoch": 0.02246,
+      "grad_norm": 0.8814519047737122,
+      "learning_rate": 0.003,
+      "loss": 4.2131,
+      "step": 2246
+    },
+    {
+      "epoch": 0.02247,
+      "grad_norm": 0.7133143544197083,
+      "learning_rate": 0.003,
+      "loss": 4.1799,
+      "step": 2247
+    },
+    {
+      "epoch": 0.02248,
+      "grad_norm": 0.6402725577354431,
+      "learning_rate": 0.003,
+      "loss": 4.1839,
+      "step": 2248
+    },
+    {
+      "epoch": 0.02249,
+      "grad_norm": 0.5918011665344238,
+      "learning_rate": 0.003,
+      "loss": 4.1724,
+      "step": 2249
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.5062255859375,
+      "learning_rate": 0.003,
+      "loss": 4.1824,
+      "step": 2250
+    },
+    {
+      "epoch": 0.02251,
+      "grad_norm": 0.48150739073753357,
+      "learning_rate": 0.003,
+      "loss": 4.2037,
+      "step": 2251
+    },
+    {
+      "epoch": 0.02252,
+      "grad_norm": 0.4211646616458893,
+      "learning_rate": 0.003,
+      "loss": 4.1917,
+      "step": 2252
+    },
+    {
+      "epoch": 0.02253,
+      "grad_norm": 0.4155518114566803,
+      "learning_rate": 0.003,
+      "loss": 4.1719,
+      "step": 2253
+    },
+    {
+      "epoch": 0.02254,
+      "grad_norm": 0.3938300311565399,
+      "learning_rate": 0.003,
+      "loss": 4.1747,
+      "step": 2254
+    },
+    {
+      "epoch": 0.02255,
+      "grad_norm": 0.3917562961578369,
+      "learning_rate": 0.003,
+      "loss": 4.1579,
+      "step": 2255
+    },
+    {
+      "epoch": 0.02256,
+      "grad_norm": 0.34900254011154175,
+      "learning_rate": 0.003,
+      "loss": 4.1446,
+      "step": 2256
+    },
+    {
+      "epoch": 0.02257,
+      "grad_norm": 0.46977952122688293,
+      "learning_rate": 0.003,
+      "loss": 4.1641,
+      "step": 2257
+    },
+    {
+      "epoch": 0.02258,
+      "grad_norm": 0.6035822629928589,
+      "learning_rate": 0.003,
+      "loss": 4.1756,
+      "step": 2258
+    },
+    {
+      "epoch": 0.02259,
+      "grad_norm": 0.8085506558418274,
+      "learning_rate": 0.003,
+      "loss": 4.1444,
+      "step": 2259
+    },
+    {
+      "epoch": 0.0226,
+      "grad_norm": 1.0156970024108887,
+      "learning_rate": 0.003,
+      "loss": 4.2107,
+      "step": 2260
+    },
+    {
+      "epoch": 0.02261,
+      "grad_norm": 0.8568210601806641,
+      "learning_rate": 0.003,
+      "loss": 4.1861,
+      "step": 2261
+    },
+    {
+      "epoch": 0.02262,
+      "grad_norm": 0.7774007320404053,
+      "learning_rate": 0.003,
+      "loss": 4.1822,
+      "step": 2262
+    },
+    {
+      "epoch": 0.02263,
+      "grad_norm": 0.9870522022247314,
+      "learning_rate": 0.003,
+      "loss": 4.2046,
+      "step": 2263
+    },
+    {
+      "epoch": 0.02264,
+      "grad_norm": 0.7220656871795654,
+      "learning_rate": 0.003,
+      "loss": 4.1915,
+      "step": 2264
+    },
+    {
+      "epoch": 0.02265,
+      "grad_norm": 0.5900349020957947,
+      "learning_rate": 0.003,
+      "loss": 4.1438,
+      "step": 2265
+    },
+    {
+      "epoch": 0.02266,
+      "grad_norm": 0.6756983995437622,
+      "learning_rate": 0.003,
+      "loss": 4.1642,
+      "step": 2266
+    },
+    {
+      "epoch": 0.02267,
+      "grad_norm": 0.6196969747543335,
+      "learning_rate": 0.003,
+      "loss": 4.1867,
+      "step": 2267
+    },
+    {
+      "epoch": 0.02268,
+      "grad_norm": 0.4943087697029114,
+      "learning_rate": 0.003,
+      "loss": 4.1589,
+      "step": 2268
+    },
+    {
+      "epoch": 0.02269,
+      "grad_norm": 0.45949792861938477,
+      "learning_rate": 0.003,
+      "loss": 4.1791,
+      "step": 2269
+    },
+    {
+      "epoch": 0.0227,
+      "grad_norm": 0.5057196617126465,
+      "learning_rate": 0.003,
+      "loss": 4.176,
+      "step": 2270
+    },
+    {
+      "epoch": 0.02271,
+      "grad_norm": 0.587051510810852,
+      "learning_rate": 0.003,
+      "loss": 4.1796,
+      "step": 2271
+    },
+    {
+      "epoch": 0.02272,
+      "grad_norm": 0.6179952621459961,
+      "learning_rate": 0.003,
+      "loss": 4.1812,
+      "step": 2272
+    },
+    {
+      "epoch": 0.02273,
+      "grad_norm": 0.549956738948822,
+      "learning_rate": 0.003,
+      "loss": 4.1786,
+      "step": 2273
+    },
+    {
+      "epoch": 0.02274,
+      "grad_norm": 0.5753885507583618,
+      "learning_rate": 0.003,
+      "loss": 4.1651,
+      "step": 2274
+    },
+    {
+      "epoch": 0.02275,
+      "grad_norm": 0.5730156898498535,
+      "learning_rate": 0.003,
+      "loss": 4.1582,
+      "step": 2275
+    },
+    {
+      "epoch": 0.02276,
+      "grad_norm": 0.633211076259613,
+      "learning_rate": 0.003,
+      "loss": 4.1545,
+      "step": 2276
+    },
+    {
+      "epoch": 0.02277,
+      "grad_norm": 0.6606276631355286,
+      "learning_rate": 0.003,
+      "loss": 4.1668,
+      "step": 2277
+    },
+    {
+      "epoch": 0.02278,
+      "grad_norm": 0.6212934255599976,
+      "learning_rate": 0.003,
+      "loss": 4.1699,
+      "step": 2278
+    },
+    {
+      "epoch": 0.02279,
+      "grad_norm": 0.5762414932250977,
+      "learning_rate": 0.003,
+      "loss": 4.1803,
+      "step": 2279
+    },
+    {
+      "epoch": 0.0228,
+      "grad_norm": 0.6208809018135071,
+      "learning_rate": 0.003,
+      "loss": 4.1721,
+      "step": 2280
+    },
+    {
+      "epoch": 0.02281,
+      "grad_norm": 0.5790427327156067,
+      "learning_rate": 0.003,
+      "loss": 4.1659,
+      "step": 2281
+    },
+    {
+      "epoch": 0.02282,
+      "grad_norm": 0.5124678015708923,
+      "learning_rate": 0.003,
+      "loss": 4.1737,
+      "step": 2282
+    },
+    {
+      "epoch": 0.02283,
+      "grad_norm": 0.5517794489860535,
+      "learning_rate": 0.003,
+      "loss": 4.1895,
+      "step": 2283
+    },
+    {
+      "epoch": 0.02284,
+      "grad_norm": 0.5746618509292603,
+      "learning_rate": 0.003,
+      "loss": 4.1861,
+      "step": 2284
+    },
+    {
+      "epoch": 0.02285,
+      "grad_norm": 0.6934999823570251,
+      "learning_rate": 0.003,
+      "loss": 4.1284,
+      "step": 2285
+    },
+    {
+      "epoch": 0.02286,
+      "grad_norm": 0.8188515305519104,
+      "learning_rate": 0.003,
+      "loss": 4.1994,
+      "step": 2286
+    },
+    {
+      "epoch": 0.02287,
+      "grad_norm": 0.7512263655662537,
+      "learning_rate": 0.003,
+      "loss": 4.1749,
+      "step": 2287
+    },
+    {
+      "epoch": 0.02288,
+      "grad_norm": 0.7637566328048706,
+      "learning_rate": 0.003,
+      "loss": 4.2039,
+      "step": 2288
+    },
+    {
+      "epoch": 0.02289,
+      "grad_norm": 0.700842559337616,
+      "learning_rate": 0.003,
+      "loss": 4.1993,
+      "step": 2289
+    },
+    {
+      "epoch": 0.0229,
+      "grad_norm": 0.6298280358314514,
+      "learning_rate": 0.003,
+      "loss": 4.1719,
+      "step": 2290
+    },
+    {
+      "epoch": 0.02291,
+      "grad_norm": 0.6435158848762512,
+      "learning_rate": 0.003,
+      "loss": 4.1525,
+      "step": 2291
+    },
+    {
+      "epoch": 0.02292,
+      "grad_norm": 0.5874335169792175,
+      "learning_rate": 0.003,
+      "loss": 4.1459,
+      "step": 2292
+    },
+    {
+      "epoch": 0.02293,
+      "grad_norm": 0.6026025414466858,
+      "learning_rate": 0.003,
+      "loss": 4.1504,
+      "step": 2293
+    },
+    {
+      "epoch": 0.02294,
+      "grad_norm": 0.7460681200027466,
+      "learning_rate": 0.003,
+      "loss": 4.1566,
+      "step": 2294
+    },
+    {
+      "epoch": 0.02295,
+      "grad_norm": 0.7140766978263855,
+      "learning_rate": 0.003,
+      "loss": 4.1531,
+      "step": 2295
+    },
+    {
+      "epoch": 0.02296,
+      "grad_norm": 0.6355901956558228,
+      "learning_rate": 0.003,
+      "loss": 4.1821,
+      "step": 2296
+    },
+    {
+      "epoch": 0.02297,
+      "grad_norm": 0.673784077167511,
+      "learning_rate": 0.003,
+      "loss": 4.1554,
+      "step": 2297
+    },
+    {
+      "epoch": 0.02298,
+      "grad_norm": 0.6826474666595459,
+      "learning_rate": 0.003,
+      "loss": 4.1671,
+      "step": 2298
+    },
+    {
+      "epoch": 0.02299,
+      "grad_norm": 0.6509355306625366,
+      "learning_rate": 0.003,
+      "loss": 4.1693,
+      "step": 2299
+    },
+    {
+      "epoch": 0.023,
+      "grad_norm": 0.6510788202285767,
+      "learning_rate": 0.003,
+      "loss": 4.1909,
+      "step": 2300
+    },
+    {
+      "epoch": 0.02301,
+      "grad_norm": 0.6393023133277893,
+      "learning_rate": 0.003,
+      "loss": 4.1552,
+      "step": 2301
+    },
+    {
+      "epoch": 0.02302,
+      "grad_norm": 0.7208176255226135,
+      "learning_rate": 0.003,
+      "loss": 4.1649,
+      "step": 2302
+    },
+    {
+      "epoch": 0.02303,
+      "grad_norm": 0.8771877884864807,
+      "learning_rate": 0.003,
+      "loss": 4.1613,
+      "step": 2303
+    },
+    {
+      "epoch": 0.02304,
+      "grad_norm": 1.1240605115890503,
+      "learning_rate": 0.003,
+      "loss": 4.2129,
+      "step": 2304
+    },
+    {
+      "epoch": 0.02305,
+      "grad_norm": 1.0586861371994019,
+      "learning_rate": 0.003,
+      "loss": 4.1816,
+      "step": 2305
+    },
+    {
+      "epoch": 0.02306,
+      "grad_norm": 0.7870573401451111,
+      "learning_rate": 0.003,
+      "loss": 4.1572,
+      "step": 2306
+    },
+    {
+      "epoch": 0.02307,
+      "grad_norm": 0.7953494191169739,
+      "learning_rate": 0.003,
+      "loss": 4.1864,
+      "step": 2307
+    },
+    {
+      "epoch": 0.02308,
+      "grad_norm": 0.800268292427063,
+      "learning_rate": 0.003,
+      "loss": 4.1782,
+      "step": 2308
+    },
+    {
+      "epoch": 0.02309,
+      "grad_norm": 0.7769814133644104,
+      "learning_rate": 0.003,
+      "loss": 4.2068,
+      "step": 2309
+    },
+    {
+      "epoch": 0.0231,
+      "grad_norm": 0.8404530882835388,
+      "learning_rate": 0.003,
+      "loss": 4.1894,
+      "step": 2310
+    },
+    {
+      "epoch": 0.02311,
+      "grad_norm": 0.8363377451896667,
+      "learning_rate": 0.003,
+      "loss": 4.1952,
+      "step": 2311
+    },
+    {
+      "epoch": 0.02312,
+      "grad_norm": 0.770296037197113,
+      "learning_rate": 0.003,
+      "loss": 4.1643,
+      "step": 2312
+    },
+    {
+      "epoch": 0.02313,
+      "grad_norm": 0.7528126835823059,
+      "learning_rate": 0.003,
+      "loss": 4.1622,
+      "step": 2313
+    },
+    {
+      "epoch": 0.02314,
+      "grad_norm": 0.74263995885849,
+      "learning_rate": 0.003,
+      "loss": 4.1779,
+      "step": 2314
+    },
+    {
+      "epoch": 0.02315,
+      "grad_norm": 0.6081147789955139,
+      "learning_rate": 0.003,
+      "loss": 4.1766,
+      "step": 2315
+    },
+    {
+      "epoch": 0.02316,
+      "grad_norm": 0.5721566677093506,
+      "learning_rate": 0.003,
+      "loss": 4.1882,
+      "step": 2316
+    },
+    {
+      "epoch": 0.02317,
+      "grad_norm": 0.5732864141464233,
+      "learning_rate": 0.003,
+      "loss": 4.1684,
+      "step": 2317
+    },
+    {
+      "epoch": 0.02318,
+      "grad_norm": 0.5256015062332153,
+      "learning_rate": 0.003,
+      "loss": 4.1551,
+      "step": 2318
+    },
+    {
+      "epoch": 0.02319,
+      "grad_norm": 0.49379223585128784,
+      "learning_rate": 0.003,
+      "loss": 4.1772,
+      "step": 2319
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.5656861662864685,
+      "learning_rate": 0.003,
+      "loss": 4.1857,
+      "step": 2320
+    },
+    {
+      "epoch": 0.02321,
+      "grad_norm": 0.5173032879829407,
+      "learning_rate": 0.003,
+      "loss": 4.1628,
+      "step": 2321
+    },
+    {
+      "epoch": 0.02322,
+      "grad_norm": 0.4566805362701416,
+      "learning_rate": 0.003,
+      "loss": 4.1702,
+      "step": 2322
+    },
+    {
+      "epoch": 0.02323,
+      "grad_norm": 0.42646524310112,
+      "learning_rate": 0.003,
+      "loss": 4.1427,
+      "step": 2323
+    },
+    {
+      "epoch": 0.02324,
+      "grad_norm": 0.46551966667175293,
+      "learning_rate": 0.003,
+      "loss": 4.1415,
+      "step": 2324
+    },
+    {
+      "epoch": 0.02325,
+      "grad_norm": 0.4420718252658844,
+      "learning_rate": 0.003,
+      "loss": 4.1646,
+      "step": 2325
+    },
+    {
+      "epoch": 0.02326,
+      "grad_norm": 0.42437517642974854,
+      "learning_rate": 0.003,
+      "loss": 4.1773,
+      "step": 2326
+    },
+    {
+      "epoch": 0.02327,
+      "grad_norm": 0.457309365272522,
+      "learning_rate": 0.003,
+      "loss": 4.164,
+      "step": 2327
+    },
+    {
+      "epoch": 0.02328,
+      "grad_norm": 0.515424370765686,
+      "learning_rate": 0.003,
+      "loss": 4.1652,
+      "step": 2328
+    },
+    {
+      "epoch": 0.02329,
+      "grad_norm": 0.5420231223106384,
+      "learning_rate": 0.003,
+      "loss": 4.1544,
+      "step": 2329
+    },
+    {
+      "epoch": 0.0233,
+      "grad_norm": 0.5254723429679871,
+      "learning_rate": 0.003,
+      "loss": 4.1502,
+      "step": 2330
+    },
+    {
+      "epoch": 0.02331,
+      "grad_norm": 0.47864091396331787,
+      "learning_rate": 0.003,
+      "loss": 4.1851,
+      "step": 2331
+    },
+    {
+      "epoch": 0.02332,
+      "grad_norm": 0.6108183264732361,
+      "learning_rate": 0.003,
+      "loss": 4.1511,
+      "step": 2332
+    },
+    {
+      "epoch": 0.02333,
+      "grad_norm": 0.7649471759796143,
+      "learning_rate": 0.003,
+      "loss": 4.1613,
+      "step": 2333
+    },
+    {
+      "epoch": 0.02334,
+      "grad_norm": 0.9433549642562866,
+      "learning_rate": 0.003,
+      "loss": 4.1411,
+      "step": 2334
+    },
+    {
+      "epoch": 0.02335,
+      "grad_norm": 1.05660080909729,
+      "learning_rate": 0.003,
+      "loss": 4.1889,
+      "step": 2335
+    },
+    {
+      "epoch": 0.02336,
+      "grad_norm": 0.7674508690834045,
+      "learning_rate": 0.003,
+      "loss": 4.1752,
+      "step": 2336
+    },
+    {
+      "epoch": 0.02337,
+      "grad_norm": 0.6039113402366638,
+      "learning_rate": 0.003,
+      "loss": 4.1744,
+      "step": 2337
+    },
+    {
+      "epoch": 0.02338,
+      "grad_norm": 0.678247332572937,
+      "learning_rate": 0.003,
+      "loss": 4.1635,
+      "step": 2338
+    },
+    {
+      "epoch": 0.02339,
+      "grad_norm": 0.6683263182640076,
+      "learning_rate": 0.003,
+      "loss": 4.1573,
+      "step": 2339
+    },
+    {
+      "epoch": 0.0234,
+      "grad_norm": 0.6628241539001465,
+      "learning_rate": 0.003,
+      "loss": 4.1711,
+      "step": 2340
+    },
+    {
+      "epoch": 0.02341,
+      "grad_norm": 0.6385108232498169,
+      "learning_rate": 0.003,
+      "loss": 4.145,
+      "step": 2341
+    },
+    {
+      "epoch": 0.02342,
+      "grad_norm": 0.7277436852455139,
+      "learning_rate": 0.003,
+      "loss": 4.1673,
+      "step": 2342
+    },
+    {
+      "epoch": 0.02343,
+      "grad_norm": 0.7746118903160095,
+      "learning_rate": 0.003,
+      "loss": 4.1654,
+      "step": 2343
+    },
+    {
+      "epoch": 0.02344,
+      "grad_norm": 0.8185151815414429,
+      "learning_rate": 0.003,
+      "loss": 4.1668,
+      "step": 2344
+    },
+    {
+      "epoch": 0.02345,
+      "grad_norm": 0.8307933211326599,
+      "learning_rate": 0.003,
+      "loss": 4.163,
+      "step": 2345
+    },
+    {
+      "epoch": 0.02346,
+      "grad_norm": 0.7920862436294556,
+      "learning_rate": 0.003,
+      "loss": 4.1608,
+      "step": 2346
+    },
+    {
+      "epoch": 0.02347,
+      "grad_norm": 0.7341541051864624,
+      "learning_rate": 0.003,
+      "loss": 4.1955,
+      "step": 2347
+    },
+    {
+      "epoch": 0.02348,
+      "grad_norm": 0.7185869216918945,
+      "learning_rate": 0.003,
+      "loss": 4.1581,
+      "step": 2348
+    },
+    {
+      "epoch": 0.02349,
+      "grad_norm": 0.7363864183425903,
+      "learning_rate": 0.003,
+      "loss": 4.1781,
+      "step": 2349
+    },
+    {
+      "epoch": 0.0235,
+      "grad_norm": 0.8115167021751404,
+      "learning_rate": 0.003,
+      "loss": 4.1777,
+      "step": 2350
+    },
+    {
+      "epoch": 0.02351,
+      "grad_norm": 0.8712372779846191,
+      "learning_rate": 0.003,
+      "loss": 4.1958,
+      "step": 2351
+    },
+    {
+      "epoch": 0.02352,
+      "grad_norm": 0.7720403075218201,
+      "learning_rate": 0.003,
+      "loss": 4.1512,
+      "step": 2352
+    },
+    {
+      "epoch": 0.02353,
+      "grad_norm": 0.7881413102149963,
+      "learning_rate": 0.003,
+      "loss": 4.1809,
+      "step": 2353
+    },
+    {
+      "epoch": 0.02354,
+      "grad_norm": 0.9158604741096497,
+      "learning_rate": 0.003,
+      "loss": 4.1959,
+      "step": 2354
+    },
+    {
+      "epoch": 0.02355,
+      "grad_norm": 0.8635379076004028,
+      "learning_rate": 0.003,
+      "loss": 4.1914,
+      "step": 2355
+    },
+    {
+      "epoch": 0.02356,
+      "grad_norm": 0.9214644432067871,
+      "learning_rate": 0.003,
+      "loss": 4.182,
+      "step": 2356
+    },
+    {
+      "epoch": 0.02357,
+      "grad_norm": 1.004699945449829,
+      "learning_rate": 0.003,
+      "loss": 4.1873,
+      "step": 2357
+    },
+    {
+      "epoch": 0.02358,
+      "grad_norm": 0.8610268235206604,
+      "learning_rate": 0.003,
+      "loss": 4.1999,
+      "step": 2358
+    },
+    {
+      "epoch": 0.02359,
+      "grad_norm": 0.7828254699707031,
+      "learning_rate": 0.003,
+      "loss": 4.1779,
+      "step": 2359
+    },
+    {
+      "epoch": 0.0236,
+      "grad_norm": 0.6673213839530945,
+      "learning_rate": 0.003,
+      "loss": 4.2078,
+      "step": 2360
+    },
+    {
+      "epoch": 0.02361,
+      "grad_norm": 0.6459137201309204,
+      "learning_rate": 0.003,
+      "loss": 4.1883,
+      "step": 2361
+    },
+    {
+      "epoch": 0.02362,
+      "grad_norm": 0.5431698560714722,
+      "learning_rate": 0.003,
+      "loss": 4.1762,
+      "step": 2362
+    },
+    {
+      "epoch": 0.02363,
+      "grad_norm": 0.551616907119751,
+      "learning_rate": 0.003,
+      "loss": 4.151,
+      "step": 2363
+    },
+    {
+      "epoch": 0.02364,
+      "grad_norm": 0.5682998299598694,
+      "learning_rate": 0.003,
+      "loss": 4.1714,
+      "step": 2364
+    },
+    {
+      "epoch": 0.02365,
+      "grad_norm": 0.651782751083374,
+      "learning_rate": 0.003,
+      "loss": 4.1916,
+      "step": 2365
+    },
+    {
+      "epoch": 0.02366,
+      "grad_norm": 0.653306782245636,
+      "learning_rate": 0.003,
+      "loss": 4.164,
+      "step": 2366
+    },
+    {
+      "epoch": 0.02367,
+      "grad_norm": 0.6368426084518433,
+      "learning_rate": 0.003,
+      "loss": 4.1718,
+      "step": 2367
+    },
+    {
+      "epoch": 0.02368,
+      "grad_norm": 0.713473379611969,
+      "learning_rate": 0.003,
+      "loss": 4.1462,
+      "step": 2368
+    },
+    {
+      "epoch": 0.02369,
+      "grad_norm": 0.8085301518440247,
+      "learning_rate": 0.003,
+      "loss": 4.1615,
+      "step": 2369
+    },
+    {
+      "epoch": 0.0237,
+      "grad_norm": 0.7926542162895203,
+      "learning_rate": 0.003,
+      "loss": 4.1888,
+      "step": 2370
+    },
+    {
+      "epoch": 0.02371,
+      "grad_norm": 0.828406572341919,
+      "learning_rate": 0.003,
+      "loss": 4.1795,
+      "step": 2371
+    },
+    {
+      "epoch": 0.02372,
+      "grad_norm": 0.7606370449066162,
+      "learning_rate": 0.003,
+      "loss": 4.1649,
+      "step": 2372
+    },
+    {
+      "epoch": 0.02373,
+      "grad_norm": 0.6590718626976013,
+      "learning_rate": 0.003,
+      "loss": 4.1489,
+      "step": 2373
+    },
+    {
+      "epoch": 0.02374,
+      "grad_norm": 0.5918402075767517,
+      "learning_rate": 0.003,
+      "loss": 4.1536,
+      "step": 2374
+    },
+    {
+      "epoch": 0.02375,
+      "grad_norm": 0.6583318710327148,
+      "learning_rate": 0.003,
+      "loss": 4.1578,
+      "step": 2375
+    },
+    {
+      "epoch": 0.02376,
+      "grad_norm": 0.6908389329910278,
+      "learning_rate": 0.003,
+      "loss": 4.1806,
+      "step": 2376
+    },
+    {
+      "epoch": 0.02377,
+      "grad_norm": 0.6131519675254822,
+      "learning_rate": 0.003,
+      "loss": 4.1471,
+      "step": 2377
+    },
+    {
+      "epoch": 0.02378,
+      "grad_norm": 0.4756273031234741,
+      "learning_rate": 0.003,
+      "loss": 4.1538,
+      "step": 2378
+    },
+    {
+      "epoch": 0.02379,
+      "grad_norm": 0.5182189345359802,
+      "learning_rate": 0.003,
+      "loss": 4.1425,
+      "step": 2379
+    },
+    {
+      "epoch": 0.0238,
+      "grad_norm": 0.4939170181751251,
+      "learning_rate": 0.003,
+      "loss": 4.1141,
+      "step": 2380
+    },
+    {
+      "epoch": 0.02381,
+      "grad_norm": 0.4250797629356384,
+      "learning_rate": 0.003,
+      "loss": 4.1259,
+      "step": 2381
+    },
+    {
+      "epoch": 0.02382,
+      "grad_norm": 0.44852757453918457,
+      "learning_rate": 0.003,
+      "loss": 4.1818,
+      "step": 2382
+    },
+    {
+      "epoch": 0.02383,
+      "grad_norm": 0.4787205457687378,
+      "learning_rate": 0.003,
+      "loss": 4.1449,
+      "step": 2383
+    },
+    {
+      "epoch": 0.02384,
+      "grad_norm": 0.5154638290405273,
+      "learning_rate": 0.003,
+      "loss": 4.1313,
+      "step": 2384
+    },
+    {
+      "epoch": 0.02385,
+      "grad_norm": 0.5779361128807068,
+      "learning_rate": 0.003,
+      "loss": 4.1594,
+      "step": 2385
+    },
+    {
+      "epoch": 0.02386,
+      "grad_norm": 0.6726837158203125,
+      "learning_rate": 0.003,
+      "loss": 4.1392,
+      "step": 2386
+    },
+    {
+      "epoch": 0.02387,
+      "grad_norm": 0.6705544590950012,
+      "learning_rate": 0.003,
+      "loss": 4.138,
+      "step": 2387
+    },
+    {
+      "epoch": 0.02388,
+      "grad_norm": 0.520875096321106,
+      "learning_rate": 0.003,
+      "loss": 4.1869,
+      "step": 2388
+    },
+    {
+      "epoch": 0.02389,
+      "grad_norm": 0.4918033182621002,
+      "learning_rate": 0.003,
+      "loss": 4.1476,
+      "step": 2389
+    },
+    {
+      "epoch": 0.0239,
+      "grad_norm": 0.5085783004760742,
+      "learning_rate": 0.003,
+      "loss": 4.1404,
+      "step": 2390
+    },
+    {
+      "epoch": 0.02391,
+      "grad_norm": 0.5585421323776245,
+      "learning_rate": 0.003,
+      "loss": 4.1645,
+      "step": 2391
+    },
+    {
+      "epoch": 0.02392,
+      "grad_norm": 0.6892051100730896,
+      "learning_rate": 0.003,
+      "loss": 4.173,
+      "step": 2392
+    },
+    {
+      "epoch": 0.02393,
+      "grad_norm": 0.7382843494415283,
+      "learning_rate": 0.003,
+      "loss": 4.1352,
+      "step": 2393
+    },
+    {
+      "epoch": 0.02394,
+      "grad_norm": 0.6831554770469666,
+      "learning_rate": 0.003,
+      "loss": 4.1493,
+      "step": 2394
+    },
+    {
+      "epoch": 0.02395,
+      "grad_norm": 0.5855069756507874,
+      "learning_rate": 0.003,
+      "loss": 4.1618,
+      "step": 2395
+    },
+    {
+      "epoch": 0.02396,
+      "grad_norm": 0.618294894695282,
+      "learning_rate": 0.003,
+      "loss": 4.1749,
+      "step": 2396
+    },
+    {
+      "epoch": 0.02397,
+      "grad_norm": 0.6775050163269043,
+      "learning_rate": 0.003,
+      "loss": 4.1604,
+      "step": 2397
+    },
+    {
+      "epoch": 0.02398,
+      "grad_norm": 0.6511969566345215,
+      "learning_rate": 0.003,
+      "loss": 4.1663,
+      "step": 2398
+    },
+    {
+      "epoch": 0.02399,
+      "grad_norm": 0.538463294506073,
+      "learning_rate": 0.003,
+      "loss": 4.142,
+      "step": 2399
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.6128776669502258,
+      "learning_rate": 0.003,
+      "loss": 4.1581,
+      "step": 2400
+    },
+    {
+      "epoch": 0.02401,
+      "grad_norm": 0.7419440150260925,
+      "learning_rate": 0.003,
+      "loss": 4.1379,
+      "step": 2401
+    },
+    {
+      "epoch": 0.02402,
+      "grad_norm": 0.8263295888900757,
+      "learning_rate": 0.003,
+      "loss": 4.1752,
+      "step": 2402
+    },
+    {
+      "epoch": 0.02403,
+      "grad_norm": 0.7187276482582092,
+      "learning_rate": 0.003,
+      "loss": 4.143,
+      "step": 2403
+    },
+    {
+      "epoch": 0.02404,
+      "grad_norm": 0.6155825257301331,
+      "learning_rate": 0.003,
+      "loss": 4.1775,
+      "step": 2404
+    },
+    {
+      "epoch": 0.02405,
+      "grad_norm": 0.584798276424408,
+      "learning_rate": 0.003,
+      "loss": 4.1805,
+      "step": 2405
+    },
+    {
+      "epoch": 0.02406,
+      "grad_norm": 0.7314527034759521,
+      "learning_rate": 0.003,
+      "loss": 4.169,
+      "step": 2406
+    },
+    {
+      "epoch": 0.02407,
+      "grad_norm": 0.9434296488761902,
+      "learning_rate": 0.003,
+      "loss": 4.1562,
+      "step": 2407
+    },
+    {
+      "epoch": 0.02408,
+      "grad_norm": 1.003302812576294,
+      "learning_rate": 0.003,
+      "loss": 4.1727,
+      "step": 2408
+    },
+    {
+      "epoch": 0.02409,
+      "grad_norm": 0.9447243809700012,
+      "learning_rate": 0.003,
+      "loss": 4.1773,
+      "step": 2409
+    },
+    {
+      "epoch": 0.0241,
+      "grad_norm": 0.7472692131996155,
+      "learning_rate": 0.003,
+      "loss": 4.1837,
+      "step": 2410
+    },
+    {
+      "epoch": 0.02411,
+      "grad_norm": 0.8666222095489502,
+      "learning_rate": 0.003,
+      "loss": 4.1576,
+      "step": 2411
+    },
+    {
+      "epoch": 0.02412,
+      "grad_norm": 0.9741396307945251,
+      "learning_rate": 0.003,
+      "loss": 4.1698,
+      "step": 2412
+    },
+    {
+      "epoch": 0.02413,
+      "grad_norm": 1.0521008968353271,
+      "learning_rate": 0.003,
+      "loss": 4.1812,
+      "step": 2413
+    },
+    {
+      "epoch": 0.02414,
+      "grad_norm": 1.098198652267456,
+      "learning_rate": 0.003,
+      "loss": 4.198,
+      "step": 2414
+    },
+    {
+      "epoch": 0.02415,
+      "grad_norm": 1.0509668588638306,
+      "learning_rate": 0.003,
+      "loss": 4.1933,
+      "step": 2415
+    },
+    {
+      "epoch": 0.02416,
+      "grad_norm": 0.9923770427703857,
+      "learning_rate": 0.003,
+      "loss": 4.1791,
+      "step": 2416
+    },
+    {
+      "epoch": 0.02417,
+      "grad_norm": 0.9507772922515869,
+      "learning_rate": 0.003,
+      "loss": 4.2103,
+      "step": 2417
+    },
+    {
+      "epoch": 0.02418,
+      "grad_norm": 0.7804825305938721,
+      "learning_rate": 0.003,
+      "loss": 4.1914,
+      "step": 2418
+    },
+    {
+      "epoch": 0.02419,
+      "grad_norm": 0.6353334188461304,
+      "learning_rate": 0.003,
+      "loss": 4.1871,
+      "step": 2419
+    },
+    {
+      "epoch": 0.0242,
+      "grad_norm": 0.7049343585968018,
+      "learning_rate": 0.003,
+      "loss": 4.1989,
+      "step": 2420
+    },
+    {
+      "epoch": 0.02421,
+      "grad_norm": 0.8157212734222412,
+      "learning_rate": 0.003,
+      "loss": 4.1685,
+      "step": 2421
+    },
+    {
+      "epoch": 0.02422,
+      "grad_norm": 0.9302619099617004,
+      "learning_rate": 0.003,
+      "loss": 4.2265,
+      "step": 2422
+    },
+    {
+      "epoch": 0.02423,
+      "grad_norm": 1.0272700786590576,
+      "learning_rate": 0.003,
+      "loss": 4.2247,
+      "step": 2423
+    },
+    {
+      "epoch": 0.02424,
+      "grad_norm": 1.0545587539672852,
+      "learning_rate": 0.003,
+      "loss": 4.2182,
+      "step": 2424
+    },
+    {
+      "epoch": 0.02425,
+      "grad_norm": 0.8540631532669067,
+      "learning_rate": 0.003,
+      "loss": 4.2022,
+      "step": 2425
+    },
+    {
+      "epoch": 0.02426,
+      "grad_norm": 0.7790244817733765,
+      "learning_rate": 0.003,
+      "loss": 4.2053,
+      "step": 2426
+    },
+    {
+      "epoch": 0.02427,
+      "grad_norm": 0.6194785237312317,
+      "learning_rate": 0.003,
+      "loss": 4.2245,
+      "step": 2427
+    },
+    {
+      "epoch": 0.02428,
+      "grad_norm": 0.723381519317627,
+      "learning_rate": 0.003,
+      "loss": 4.1967,
+      "step": 2428
+    },
+    {
+      "epoch": 0.02429,
+      "grad_norm": 0.8042016625404358,
+      "learning_rate": 0.003,
+      "loss": 4.1857,
+      "step": 2429
+    },
+    {
+      "epoch": 0.0243,
+      "grad_norm": 0.8838525414466858,
+      "learning_rate": 0.003,
+      "loss": 4.211,
+      "step": 2430
+    },
+    {
+      "epoch": 0.02431,
+      "grad_norm": 0.9250835180282593,
+      "learning_rate": 0.003,
+      "loss": 4.1912,
+      "step": 2431
+    },
+    {
+      "epoch": 0.02432,
+      "grad_norm": 0.7146485447883606,
+      "learning_rate": 0.003,
+      "loss": 4.2038,
+      "step": 2432
+    },
+    {
+      "epoch": 0.02433,
+      "grad_norm": 0.689758837223053,
+      "learning_rate": 0.003,
+      "loss": 4.2175,
+      "step": 2433
+    },
+    {
+      "epoch": 0.02434,
+      "grad_norm": 0.7109194993972778,
+      "learning_rate": 0.003,
+      "loss": 4.1797,
+      "step": 2434
+    },
+    {
+      "epoch": 0.02435,
+      "grad_norm": 0.6944730281829834,
+      "learning_rate": 0.003,
+      "loss": 4.1786,
+      "step": 2435
+    },
+    {
+      "epoch": 0.02436,
+      "grad_norm": 0.5854509472846985,
+      "learning_rate": 0.003,
+      "loss": 4.1804,
+      "step": 2436
+    },
+    {
+      "epoch": 0.02437,
+      "grad_norm": 0.5410053730010986,
+      "learning_rate": 0.003,
+      "loss": 4.1621,
+      "step": 2437
+    },
+    {
+      "epoch": 0.02438,
+      "grad_norm": 0.4632667303085327,
+      "learning_rate": 0.003,
+      "loss": 4.1763,
+      "step": 2438
+    },
+    {
+      "epoch": 0.02439,
+      "grad_norm": 0.47085174918174744,
+      "learning_rate": 0.003,
+      "loss": 4.143,
+      "step": 2439
+    },
+    {
+      "epoch": 0.0244,
+      "grad_norm": 0.4175461232662201,
+      "learning_rate": 0.003,
+      "loss": 4.1908,
+      "step": 2440
+    },
+    {
+      "epoch": 0.02441,
+      "grad_norm": 0.4487816095352173,
+      "learning_rate": 0.003,
+      "loss": 4.1559,
+      "step": 2441
+    },
+    {
+      "epoch": 0.02442,
+      "grad_norm": 0.3787577450275421,
+      "learning_rate": 0.003,
+      "loss": 4.1845,
+      "step": 2442
+    },
+    {
+      "epoch": 0.02443,
+      "grad_norm": 0.4035150706768036,
+      "learning_rate": 0.003,
+      "loss": 4.1219,
+      "step": 2443
+    },
+    {
+      "epoch": 0.02444,
+      "grad_norm": 0.399797648191452,
+      "learning_rate": 0.003,
+      "loss": 4.1495,
+      "step": 2444
+    },
+    {
+      "epoch": 0.02445,
+      "grad_norm": 0.4260754883289337,
+      "learning_rate": 0.003,
+      "loss": 4.1538,
+      "step": 2445
+    },
+    {
+      "epoch": 0.02446,
+      "grad_norm": 0.5048103928565979,
+      "learning_rate": 0.003,
+      "loss": 4.1567,
+      "step": 2446
+    },
+    {
+      "epoch": 0.02447,
+      "grad_norm": 0.6289964914321899,
+      "learning_rate": 0.003,
+      "loss": 4.1667,
+      "step": 2447
+    },
+    {
+      "epoch": 0.02448,
+      "grad_norm": 0.8102890253067017,
+      "learning_rate": 0.003,
+      "loss": 4.1719,
+      "step": 2448
+    },
+    {
+      "epoch": 0.02449,
+      "grad_norm": 0.8461275100708008,
+      "learning_rate": 0.003,
+      "loss": 4.1705,
+      "step": 2449
+    },
+    {
+      "epoch": 0.0245,
+      "grad_norm": 0.7120311260223389,
+      "learning_rate": 0.003,
+      "loss": 4.1692,
+      "step": 2450
+    },
+    {
+      "epoch": 0.02451,
+      "grad_norm": 0.4928838908672333,
+      "learning_rate": 0.003,
+      "loss": 4.1398,
+      "step": 2451
+    },
+    {
+      "epoch": 0.02452,
+      "grad_norm": 0.517375648021698,
+      "learning_rate": 0.003,
+      "loss": 4.1492,
+      "step": 2452
+    },
+    {
+      "epoch": 0.02453,
+      "grad_norm": 0.5950134992599487,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 2453
+    },
+    {
+      "epoch": 0.02454,
+      "grad_norm": 0.6498395800590515,
+      "learning_rate": 0.003,
+      "loss": 4.1374,
+      "step": 2454
+    },
+    {
+      "epoch": 0.02455,
+      "grad_norm": 0.666580319404602,
+      "learning_rate": 0.003,
+      "loss": 4.1482,
+      "step": 2455
+    },
+    {
+      "epoch": 0.02456,
+      "grad_norm": 0.6841810941696167,
+      "learning_rate": 0.003,
+      "loss": 4.1183,
+      "step": 2456
+    },
+    {
+      "epoch": 0.02457,
+      "grad_norm": 0.6185785531997681,
+      "learning_rate": 0.003,
+      "loss": 4.1655,
+      "step": 2457
+    },
+    {
+      "epoch": 0.02458,
+      "grad_norm": 0.6619728207588196,
+      "learning_rate": 0.003,
+      "loss": 4.1399,
+      "step": 2458
+    },
+    {
+      "epoch": 0.02459,
+      "grad_norm": 0.721472978591919,
+      "learning_rate": 0.003,
+      "loss": 4.1746,
+      "step": 2459
+    },
+    {
+      "epoch": 0.0246,
+      "grad_norm": 0.7174915075302124,
+      "learning_rate": 0.003,
+      "loss": 4.1756,
+      "step": 2460
+    },
+    {
+      "epoch": 0.02461,
+      "grad_norm": 0.7339833378791809,
+      "learning_rate": 0.003,
+      "loss": 4.1527,
+      "step": 2461
+    },
+    {
+      "epoch": 0.02462,
+      "grad_norm": 0.738292932510376,
+      "learning_rate": 0.003,
+      "loss": 4.1817,
+      "step": 2462
+    },
+    {
+      "epoch": 0.02463,
+      "grad_norm": 0.6757227778434753,
+      "learning_rate": 0.003,
+      "loss": 4.1783,
+      "step": 2463
+    },
+    {
+      "epoch": 0.02464,
+      "grad_norm": 0.5629785656929016,
+      "learning_rate": 0.003,
+      "loss": 4.1522,
+      "step": 2464
+    },
+    {
+      "epoch": 0.02465,
+      "grad_norm": 0.619929850101471,
+      "learning_rate": 0.003,
+      "loss": 4.1723,
+      "step": 2465
+    },
+    {
+      "epoch": 0.02466,
+      "grad_norm": 0.5323424339294434,
+      "learning_rate": 0.003,
+      "loss": 4.1498,
+      "step": 2466
+    },
+    {
+      "epoch": 0.02467,
+      "grad_norm": 0.5933912396430969,
+      "learning_rate": 0.003,
+      "loss": 4.1304,
+      "step": 2467
+    },
+    {
+      "epoch": 0.02468,
+      "grad_norm": 0.5487152338027954,
+      "learning_rate": 0.003,
+      "loss": 4.1707,
+      "step": 2468
+    },
+    {
+      "epoch": 0.02469,
+      "grad_norm": 0.5824567675590515,
+      "learning_rate": 0.003,
+      "loss": 4.1523,
+      "step": 2469
+    },
+    {
+      "epoch": 0.0247,
+      "grad_norm": 0.674126148223877,
+      "learning_rate": 0.003,
+      "loss": 4.1498,
+      "step": 2470
+    },
+    {
+      "epoch": 0.02471,
+      "grad_norm": 0.6640561819076538,
+      "learning_rate": 0.003,
+      "loss": 4.1387,
+      "step": 2471
+    },
+    {
+      "epoch": 0.02472,
+      "grad_norm": 0.6775668859481812,
+      "learning_rate": 0.003,
+      "loss": 4.1435,
+      "step": 2472
+    },
+    {
+      "epoch": 0.02473,
+      "grad_norm": 0.668128252029419,
+      "learning_rate": 0.003,
+      "loss": 4.1273,
+      "step": 2473
+    },
+    {
+      "epoch": 0.02474,
+      "grad_norm": 0.6847355961799622,
+      "learning_rate": 0.003,
+      "loss": 4.1625,
+      "step": 2474
+    },
+    {
+      "epoch": 0.02475,
+      "grad_norm": 0.8420220017433167,
+      "learning_rate": 0.003,
+      "loss": 4.1414,
+      "step": 2475
+    },
+    {
+      "epoch": 0.02476,
+      "grad_norm": 0.8548656702041626,
+      "learning_rate": 0.003,
+      "loss": 4.1541,
+      "step": 2476
+    },
+    {
+      "epoch": 0.02477,
+      "grad_norm": 0.6734539866447449,
+      "learning_rate": 0.003,
+      "loss": 4.1705,
+      "step": 2477
+    },
+    {
+      "epoch": 0.02478,
+      "grad_norm": 0.6893376111984253,
+      "learning_rate": 0.003,
+      "loss": 4.1474,
+      "step": 2478
+    },
+    {
+      "epoch": 0.02479,
+      "grad_norm": 0.5793731808662415,
+      "learning_rate": 0.003,
+      "loss": 4.1333,
+      "step": 2479
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.5962329506874084,
+      "learning_rate": 0.003,
+      "loss": 4.1754,
+      "step": 2480
+    },
+    {
+      "epoch": 0.02481,
+      "grad_norm": 0.5665598511695862,
+      "learning_rate": 0.003,
+      "loss": 4.1561,
+      "step": 2481
+    },
+    {
+      "epoch": 0.02482,
+      "grad_norm": 0.6422098875045776,
+      "learning_rate": 0.003,
+      "loss": 4.1658,
+      "step": 2482
+    },
+    {
+      "epoch": 0.02483,
+      "grad_norm": 0.6842206716537476,
+      "learning_rate": 0.003,
+      "loss": 4.1214,
+      "step": 2483
+    },
+    {
+      "epoch": 0.02484,
+      "grad_norm": 0.6730716824531555,
+      "learning_rate": 0.003,
+      "loss": 4.1517,
+      "step": 2484
+    },
+    {
+      "epoch": 0.02485,
+      "grad_norm": 0.7416301369667053,
+      "learning_rate": 0.003,
+      "loss": 4.1761,
+      "step": 2485
+    },
+    {
+      "epoch": 0.02486,
+      "grad_norm": 0.8094339370727539,
+      "learning_rate": 0.003,
+      "loss": 4.2033,
+      "step": 2486
+    },
+    {
+      "epoch": 0.02487,
+      "grad_norm": 0.7401948571205139,
+      "learning_rate": 0.003,
+      "loss": 4.1575,
+      "step": 2487
+    },
+    {
+      "epoch": 0.02488,
+      "grad_norm": 0.7254106402397156,
+      "learning_rate": 0.003,
+      "loss": 4.1347,
+      "step": 2488
+    },
+    {
+      "epoch": 0.02489,
+      "grad_norm": 0.7380350232124329,
+      "learning_rate": 0.003,
+      "loss": 4.1254,
+      "step": 2489
+    },
+    {
+      "epoch": 0.0249,
+      "grad_norm": 0.8402035236358643,
+      "learning_rate": 0.003,
+      "loss": 4.1326,
+      "step": 2490
+    },
+    {
+      "epoch": 0.02491,
+      "grad_norm": 0.9696373343467712,
+      "learning_rate": 0.003,
+      "loss": 4.1667,
+      "step": 2491
+    },
+    {
+      "epoch": 0.02492,
+      "grad_norm": 0.9300618171691895,
+      "learning_rate": 0.003,
+      "loss": 4.1853,
+      "step": 2492
+    },
+    {
+      "epoch": 0.02493,
+      "grad_norm": 0.7866774201393127,
+      "learning_rate": 0.003,
+      "loss": 4.1475,
+      "step": 2493
+    },
+    {
+      "epoch": 0.02494,
+      "grad_norm": 0.7883460521697998,
+      "learning_rate": 0.003,
+      "loss": 4.1631,
+      "step": 2494
+    },
+    {
+      "epoch": 0.02495,
+      "grad_norm": 0.7333797216415405,
+      "learning_rate": 0.003,
+      "loss": 4.1705,
+      "step": 2495
+    },
+    {
+      "epoch": 0.02496,
+      "grad_norm": 0.7359468936920166,
+      "learning_rate": 0.003,
+      "loss": 4.1637,
+      "step": 2496
+    },
+    {
+      "epoch": 0.02497,
+      "grad_norm": 0.7477230429649353,
+      "learning_rate": 0.003,
+      "loss": 4.1701,
+      "step": 2497
+    },
+    {
+      "epoch": 0.02498,
+      "grad_norm": 0.6632980704307556,
+      "learning_rate": 0.003,
+      "loss": 4.1549,
+      "step": 2498
+    },
+    {
+      "epoch": 0.02499,
+      "grad_norm": 0.5524125099182129,
+      "learning_rate": 0.003,
+      "loss": 4.1622,
+      "step": 2499
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.5341687202453613,
+      "learning_rate": 0.003,
+      "loss": 4.1471,
+      "step": 2500
+    },
+    {
+      "epoch": 0.02501,
+      "grad_norm": 0.5111349821090698,
+      "learning_rate": 0.003,
+      "loss": 4.1766,
+      "step": 2501
+    },
+    {
+      "epoch": 0.02502,
+      "grad_norm": 0.5449102520942688,
+      "learning_rate": 0.003,
+      "loss": 4.159,
+      "step": 2502
+    },
+    {
+      "epoch": 0.02503,
+      "grad_norm": 0.5349955558776855,
+      "learning_rate": 0.003,
+      "loss": 4.1642,
+      "step": 2503
+    },
+    {
+      "epoch": 0.02504,
+      "grad_norm": 0.48626482486724854,
+      "learning_rate": 0.003,
+      "loss": 4.1409,
+      "step": 2504
+    },
+    {
+      "epoch": 0.02505,
+      "grad_norm": 0.527682363986969,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 2505
+    },
+    {
+      "epoch": 0.02506,
+      "grad_norm": 0.6103768348693848,
+      "learning_rate": 0.003,
+      "loss": 4.1565,
+      "step": 2506
+    },
+    {
+      "epoch": 0.02507,
+      "grad_norm": 0.6964853405952454,
+      "learning_rate": 0.003,
+      "loss": 4.1472,
+      "step": 2507
+    },
+    {
+      "epoch": 0.02508,
+      "grad_norm": 0.8648868203163147,
+      "learning_rate": 0.003,
+      "loss": 4.1494,
+      "step": 2508
+    },
+    {
+      "epoch": 0.02509,
+      "grad_norm": 0.9269753694534302,
+      "learning_rate": 0.003,
+      "loss": 4.1639,
+      "step": 2509
+    },
+    {
+      "epoch": 0.0251,
+      "grad_norm": 1.022930383682251,
+      "learning_rate": 0.003,
+      "loss": 4.1832,
+      "step": 2510
+    },
+    {
+      "epoch": 0.02511,
+      "grad_norm": 0.7745629549026489,
+      "learning_rate": 0.003,
+      "loss": 4.1837,
+      "step": 2511
+    },
+    {
+      "epoch": 0.02512,
+      "grad_norm": 0.6751075983047485,
+      "learning_rate": 0.003,
+      "loss": 4.1635,
+      "step": 2512
+    },
+    {
+      "epoch": 0.02513,
+      "grad_norm": 0.7629876732826233,
+      "learning_rate": 0.003,
+      "loss": 4.1626,
+      "step": 2513
+    },
+    {
+      "epoch": 0.02514,
+      "grad_norm": 0.7923519015312195,
+      "learning_rate": 0.003,
+      "loss": 4.1521,
+      "step": 2514
+    },
+    {
+      "epoch": 0.02515,
+      "grad_norm": 0.7363923192024231,
+      "learning_rate": 0.003,
+      "loss": 4.1379,
+      "step": 2515
+    },
+    {
+      "epoch": 0.02516,
+      "grad_norm": 0.6523436903953552,
+      "learning_rate": 0.003,
+      "loss": 4.1588,
+      "step": 2516
+    },
+    {
+      "epoch": 0.02517,
+      "grad_norm": 0.669247031211853,
+      "learning_rate": 0.003,
+      "loss": 4.1618,
+      "step": 2517
+    },
+    {
+      "epoch": 0.02518,
+      "grad_norm": 0.5932192802429199,
+      "learning_rate": 0.003,
+      "loss": 4.1656,
+      "step": 2518
+    },
+    {
+      "epoch": 0.02519,
+      "grad_norm": 0.5920613408088684,
+      "learning_rate": 0.003,
+      "loss": 4.1449,
+      "step": 2519
+    },
+    {
+      "epoch": 0.0252,
+      "grad_norm": 0.5889076590538025,
+      "learning_rate": 0.003,
+      "loss": 4.1566,
+      "step": 2520
+    },
+    {
+      "epoch": 0.02521,
+      "grad_norm": 0.650814414024353,
+      "learning_rate": 0.003,
+      "loss": 4.1509,
+      "step": 2521
+    },
+    {
+      "epoch": 0.02522,
+      "grad_norm": 0.5603437423706055,
+      "learning_rate": 0.003,
+      "loss": 4.1446,
+      "step": 2522
+    },
+    {
+      "epoch": 0.02523,
+      "grad_norm": 0.4928959906101227,
+      "learning_rate": 0.003,
+      "loss": 4.146,
+      "step": 2523
+    },
+    {
+      "epoch": 0.02524,
+      "grad_norm": 0.4817637801170349,
+      "learning_rate": 0.003,
+      "loss": 4.1571,
+      "step": 2524
+    },
+    {
+      "epoch": 0.02525,
+      "grad_norm": 0.4819410741329193,
+      "learning_rate": 0.003,
+      "loss": 4.169,
+      "step": 2525
+    },
+    {
+      "epoch": 0.02526,
+      "grad_norm": 0.5578210353851318,
+      "learning_rate": 0.003,
+      "loss": 4.1154,
+      "step": 2526
+    },
+    {
+      "epoch": 0.02527,
+      "grad_norm": 0.7309604287147522,
+      "learning_rate": 0.003,
+      "loss": 4.157,
+      "step": 2527
+    },
+    {
+      "epoch": 0.02528,
+      "grad_norm": 0.9689713716506958,
+      "learning_rate": 0.003,
+      "loss": 4.1882,
+      "step": 2528
+    },
+    {
+      "epoch": 0.02529,
+      "grad_norm": 1.002740502357483,
+      "learning_rate": 0.003,
+      "loss": 4.1697,
+      "step": 2529
+    },
+    {
+      "epoch": 0.0253,
+      "grad_norm": 0.845522940158844,
+      "learning_rate": 0.003,
+      "loss": 4.1791,
+      "step": 2530
+    },
+    {
+      "epoch": 0.02531,
+      "grad_norm": 0.8186164498329163,
+      "learning_rate": 0.003,
+      "loss": 4.181,
+      "step": 2531
+    },
+    {
+      "epoch": 0.02532,
+      "grad_norm": 0.7776713371276855,
+      "learning_rate": 0.003,
+      "loss": 4.15,
+      "step": 2532
+    },
+    {
+      "epoch": 0.02533,
+      "grad_norm": 0.6073544025421143,
+      "learning_rate": 0.003,
+      "loss": 4.1843,
+      "step": 2533
+    },
+    {
+      "epoch": 0.02534,
+      "grad_norm": 0.7059157490730286,
+      "learning_rate": 0.003,
+      "loss": 4.1674,
+      "step": 2534
+    },
+    {
+      "epoch": 0.02535,
+      "grad_norm": 0.6908336281776428,
+      "learning_rate": 0.003,
+      "loss": 4.1601,
+      "step": 2535
+    },
+    {
+      "epoch": 0.02536,
+      "grad_norm": 0.6330790519714355,
+      "learning_rate": 0.003,
+      "loss": 4.1547,
+      "step": 2536
+    },
+    {
+      "epoch": 0.02537,
+      "grad_norm": 0.5009950995445251,
+      "learning_rate": 0.003,
+      "loss": 4.1424,
+      "step": 2537
+    },
+    {
+      "epoch": 0.02538,
+      "grad_norm": 0.5122132897377014,
+      "learning_rate": 0.003,
+      "loss": 4.1182,
+      "step": 2538
+    },
+    {
+      "epoch": 0.02539,
+      "grad_norm": 0.5557868480682373,
+      "learning_rate": 0.003,
+      "loss": 4.1698,
+      "step": 2539
+    },
+    {
+      "epoch": 0.0254,
+      "grad_norm": 0.5659569501876831,
+      "learning_rate": 0.003,
+      "loss": 4.1601,
+      "step": 2540
+    },
+    {
+      "epoch": 0.02541,
+      "grad_norm": 0.4916652739048004,
+      "learning_rate": 0.003,
+      "loss": 4.1476,
+      "step": 2541
+    },
+    {
+      "epoch": 0.02542,
+      "grad_norm": 0.5319804549217224,
+      "learning_rate": 0.003,
+      "loss": 4.1273,
+      "step": 2542
+    },
+    {
+      "epoch": 0.02543,
+      "grad_norm": 0.4849517047405243,
+      "learning_rate": 0.003,
+      "loss": 4.1525,
+      "step": 2543
+    },
+    {
+      "epoch": 0.02544,
+      "grad_norm": 0.46546679735183716,
+      "learning_rate": 0.003,
+      "loss": 4.1407,
+      "step": 2544
+    },
+    {
+      "epoch": 0.02545,
+      "grad_norm": 0.4903736710548401,
+      "learning_rate": 0.003,
+      "loss": 4.1226,
+      "step": 2545
+    },
+    {
+      "epoch": 0.02546,
+      "grad_norm": 0.5320796370506287,
+      "learning_rate": 0.003,
+      "loss": 4.1328,
+      "step": 2546
+    },
+    {
+      "epoch": 0.02547,
+      "grad_norm": 0.6876251101493835,
+      "learning_rate": 0.003,
+      "loss": 4.1736,
+      "step": 2547
+    },
+    {
+      "epoch": 0.02548,
+      "grad_norm": 0.8988205194473267,
+      "learning_rate": 0.003,
+      "loss": 4.1808,
+      "step": 2548
+    },
+    {
+      "epoch": 0.02549,
+      "grad_norm": 0.9427040815353394,
+      "learning_rate": 0.003,
+      "loss": 4.1752,
+      "step": 2549
+    },
+    {
+      "epoch": 0.0255,
+      "grad_norm": 0.8955228924751282,
+      "learning_rate": 0.003,
+      "loss": 4.1429,
+      "step": 2550
+    },
+    {
+      "epoch": 0.02551,
+      "grad_norm": 0.6933260560035706,
+      "learning_rate": 0.003,
+      "loss": 4.1649,
+      "step": 2551
+    },
+    {
+      "epoch": 0.02552,
+      "grad_norm": 0.6283506751060486,
+      "learning_rate": 0.003,
+      "loss": 4.1318,
+      "step": 2552
+    },
+    {
+      "epoch": 0.02553,
+      "grad_norm": 0.6629008054733276,
+      "learning_rate": 0.003,
+      "loss": 4.1984,
+      "step": 2553
+    },
+    {
+      "epoch": 0.02554,
+      "grad_norm": 0.6356085538864136,
+      "learning_rate": 0.003,
+      "loss": 4.1468,
+      "step": 2554
+    },
+    {
+      "epoch": 0.02555,
+      "grad_norm": 0.6418399214744568,
+      "learning_rate": 0.003,
+      "loss": 4.1375,
+      "step": 2555
+    },
+    {
+      "epoch": 0.02556,
+      "grad_norm": 0.689191997051239,
+      "learning_rate": 0.003,
+      "loss": 4.1505,
+      "step": 2556
+    },
+    {
+      "epoch": 0.02557,
+      "grad_norm": 0.6599099636077881,
+      "learning_rate": 0.003,
+      "loss": 4.1536,
+      "step": 2557
+    },
+    {
+      "epoch": 0.02558,
+      "grad_norm": 0.6565999984741211,
+      "learning_rate": 0.003,
+      "loss": 4.1668,
+      "step": 2558
+    },
+    {
+      "epoch": 0.02559,
+      "grad_norm": 0.6738297939300537,
+      "learning_rate": 0.003,
+      "loss": 4.1674,
+      "step": 2559
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.5509478449821472,
+      "learning_rate": 0.003,
+      "loss": 4.1382,
+      "step": 2560
+    },
+    {
+      "epoch": 0.02561,
+      "grad_norm": 0.5239382982254028,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 2561
+    },
+    {
+      "epoch": 0.02562,
+      "grad_norm": 0.5102459788322449,
+      "learning_rate": 0.003,
+      "loss": 4.1531,
+      "step": 2562
+    },
+    {
+      "epoch": 0.02563,
+      "grad_norm": 0.5850344896316528,
+      "learning_rate": 0.003,
+      "loss": 4.1384,
+      "step": 2563
+    },
+    {
+      "epoch": 0.02564,
+      "grad_norm": 0.6901075839996338,
+      "learning_rate": 0.003,
+      "loss": 4.1621,
+      "step": 2564
+    },
+    {
+      "epoch": 0.02565,
+      "grad_norm": 0.8146087527275085,
+      "learning_rate": 0.003,
+      "loss": 4.1471,
+      "step": 2565
+    },
+    {
+      "epoch": 0.02566,
+      "grad_norm": 0.8885420560836792,
+      "learning_rate": 0.003,
+      "loss": 4.1712,
+      "step": 2566
+    },
+    {
+      "epoch": 0.02567,
+      "grad_norm": 0.9885812997817993,
+      "learning_rate": 0.003,
+      "loss": 4.1808,
+      "step": 2567
+    },
+    {
+      "epoch": 0.02568,
+      "grad_norm": 1.0153335332870483,
+      "learning_rate": 0.003,
+      "loss": 4.1801,
+      "step": 2568
+    },
+    {
+      "epoch": 0.02569,
+      "grad_norm": 0.9595790505409241,
+      "learning_rate": 0.003,
+      "loss": 4.1596,
+      "step": 2569
+    },
+    {
+      "epoch": 0.0257,
+      "grad_norm": 1.0054306983947754,
+      "learning_rate": 0.003,
+      "loss": 4.1689,
+      "step": 2570
+    },
+    {
+      "epoch": 0.02571,
+      "grad_norm": 0.997405469417572,
+      "learning_rate": 0.003,
+      "loss": 4.1908,
+      "step": 2571
+    },
+    {
+      "epoch": 0.02572,
+      "grad_norm": 0.9328084588050842,
+      "learning_rate": 0.003,
+      "loss": 4.1687,
+      "step": 2572
+    },
+    {
+      "epoch": 0.02573,
+      "grad_norm": 0.8243227005004883,
+      "learning_rate": 0.003,
+      "loss": 4.1707,
+      "step": 2573
+    },
+    {
+      "epoch": 0.02574,
+      "grad_norm": 0.7659904360771179,
+      "learning_rate": 0.003,
+      "loss": 4.1984,
+      "step": 2574
+    },
+    {
+      "epoch": 0.02575,
+      "grad_norm": 0.7300676107406616,
+      "learning_rate": 0.003,
+      "loss": 4.1334,
+      "step": 2575
+    },
+    {
+      "epoch": 0.02576,
+      "grad_norm": 0.6190217733383179,
+      "learning_rate": 0.003,
+      "loss": 4.1369,
+      "step": 2576
+    },
+    {
+      "epoch": 0.02577,
+      "grad_norm": 0.6222746968269348,
+      "learning_rate": 0.003,
+      "loss": 4.138,
+      "step": 2577
+    },
+    {
+      "epoch": 0.02578,
+      "grad_norm": 0.698691725730896,
+      "learning_rate": 0.003,
+      "loss": 4.1629,
+      "step": 2578
+    },
+    {
+      "epoch": 0.02579,
+      "grad_norm": 0.6583962440490723,
+      "learning_rate": 0.003,
+      "loss": 4.1089,
+      "step": 2579
+    },
+    {
+      "epoch": 0.0258,
+      "grad_norm": 0.6283578872680664,
+      "learning_rate": 0.003,
+      "loss": 4.1658,
+      "step": 2580
+    },
+    {
+      "epoch": 0.02581,
+      "grad_norm": 0.48666515946388245,
+      "learning_rate": 0.003,
+      "loss": 4.1808,
+      "step": 2581
+    },
+    {
+      "epoch": 0.02582,
+      "grad_norm": 0.5155734419822693,
+      "learning_rate": 0.003,
+      "loss": 4.1757,
+      "step": 2582
+    },
+    {
+      "epoch": 0.02583,
+      "grad_norm": 0.46086037158966064,
+      "learning_rate": 0.003,
+      "loss": 4.1498,
+      "step": 2583
+    },
+    {
+      "epoch": 0.02584,
+      "grad_norm": 0.4467635154724121,
+      "learning_rate": 0.003,
+      "loss": 4.1578,
+      "step": 2584
+    },
+    {
+      "epoch": 0.02585,
+      "grad_norm": 0.4732924997806549,
+      "learning_rate": 0.003,
+      "loss": 4.1407,
+      "step": 2585
+    },
+    {
+      "epoch": 0.02586,
+      "grad_norm": 0.40596628189086914,
+      "learning_rate": 0.003,
+      "loss": 4.159,
+      "step": 2586
+    },
+    {
+      "epoch": 0.02587,
+      "grad_norm": 0.41176727414131165,
+      "learning_rate": 0.003,
+      "loss": 4.1467,
+      "step": 2587
+    },
+    {
+      "epoch": 0.02588,
+      "grad_norm": 0.4998745620250702,
+      "learning_rate": 0.003,
+      "loss": 4.1476,
+      "step": 2588
+    },
+    {
+      "epoch": 0.02589,
+      "grad_norm": 0.6378880143165588,
+      "learning_rate": 0.003,
+      "loss": 4.1526,
+      "step": 2589
+    },
+    {
+      "epoch": 0.0259,
+      "grad_norm": 0.9064441919326782,
+      "learning_rate": 0.003,
+      "loss": 4.1707,
+      "step": 2590
+    },
+    {
+      "epoch": 0.02591,
+      "grad_norm": 1.16274094581604,
+      "learning_rate": 0.003,
+      "loss": 4.1263,
+      "step": 2591
+    },
+    {
+      "epoch": 0.02592,
+      "grad_norm": 0.8324390053749084,
+      "learning_rate": 0.003,
+      "loss": 4.1564,
+      "step": 2592
+    },
+    {
+      "epoch": 0.02593,
+      "grad_norm": 0.8290135860443115,
+      "learning_rate": 0.003,
+      "loss": 4.1637,
+      "step": 2593
+    },
+    {
+      "epoch": 0.02594,
+      "grad_norm": 0.8181279897689819,
+      "learning_rate": 0.003,
+      "loss": 4.1811,
+      "step": 2594
+    },
+    {
+      "epoch": 0.02595,
+      "grad_norm": 0.7736456394195557,
+      "learning_rate": 0.003,
+      "loss": 4.1375,
+      "step": 2595
+    },
+    {
+      "epoch": 0.02596,
+      "grad_norm": 0.7409265041351318,
+      "learning_rate": 0.003,
+      "loss": 4.1736,
+      "step": 2596
+    },
+    {
+      "epoch": 0.02597,
+      "grad_norm": 0.7836101651191711,
+      "learning_rate": 0.003,
+      "loss": 4.1438,
+      "step": 2597
+    },
+    {
+      "epoch": 0.02598,
+      "grad_norm": 0.6943440437316895,
+      "learning_rate": 0.003,
+      "loss": 4.1827,
+      "step": 2598
+    },
+    {
+      "epoch": 0.02599,
+      "grad_norm": 0.6729668378829956,
+      "learning_rate": 0.003,
+      "loss": 4.1732,
+      "step": 2599
+    },
+    {
+      "epoch": 0.026,
+      "grad_norm": 0.7131978869438171,
+      "learning_rate": 0.003,
+      "loss": 4.1756,
+      "step": 2600
+    },
+    {
+      "epoch": 0.02601,
+      "grad_norm": 0.7535547018051147,
+      "learning_rate": 0.003,
+      "loss": 4.1785,
+      "step": 2601
+    },
+    {
+      "epoch": 0.02602,
+      "grad_norm": 0.8210467100143433,
+      "learning_rate": 0.003,
+      "loss": 4.1562,
+      "step": 2602
+    },
+    {
+      "epoch": 0.02603,
+      "grad_norm": 0.7161123752593994,
+      "learning_rate": 0.003,
+      "loss": 4.1505,
+      "step": 2603
+    },
+    {
+      "epoch": 0.02604,
+      "grad_norm": 0.6834149956703186,
+      "learning_rate": 0.003,
+      "loss": 4.1371,
+      "step": 2604
+    },
+    {
+      "epoch": 0.02605,
+      "grad_norm": 0.5697470307350159,
+      "learning_rate": 0.003,
+      "loss": 4.1503,
+      "step": 2605
+    },
+    {
+      "epoch": 0.02606,
+      "grad_norm": 0.5395636558532715,
+      "learning_rate": 0.003,
+      "loss": 4.1673,
+      "step": 2606
+    },
+    {
+      "epoch": 0.02607,
+      "grad_norm": 0.47388994693756104,
+      "learning_rate": 0.003,
+      "loss": 4.157,
+      "step": 2607
+    },
+    {
+      "epoch": 0.02608,
+      "grad_norm": 0.435537189245224,
+      "learning_rate": 0.003,
+      "loss": 4.1355,
+      "step": 2608
+    },
+    {
+      "epoch": 0.02609,
+      "grad_norm": 0.4369211792945862,
+      "learning_rate": 0.003,
+      "loss": 4.1788,
+      "step": 2609
+    },
+    {
+      "epoch": 0.0261,
+      "grad_norm": 0.46976083517074585,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 2610
+    },
+    {
+      "epoch": 0.02611,
+      "grad_norm": 0.5262687802314758,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 2611
+    },
+    {
+      "epoch": 0.02612,
+      "grad_norm": 0.7051580548286438,
+      "learning_rate": 0.003,
+      "loss": 4.1591,
+      "step": 2612
+    },
+    {
+      "epoch": 0.02613,
+      "grad_norm": 0.8811540603637695,
+      "learning_rate": 0.003,
+      "loss": 4.1618,
+      "step": 2613
+    },
+    {
+      "epoch": 0.02614,
+      "grad_norm": 0.8296162486076355,
+      "learning_rate": 0.003,
+      "loss": 4.1426,
+      "step": 2614
+    },
+    {
+      "epoch": 0.02615,
+      "grad_norm": 0.5731111764907837,
+      "learning_rate": 0.003,
+      "loss": 4.1239,
+      "step": 2615
+    },
+    {
+      "epoch": 0.02616,
+      "grad_norm": 0.6465540528297424,
+      "learning_rate": 0.003,
+      "loss": 4.1387,
+      "step": 2616
+    },
+    {
+      "epoch": 0.02617,
+      "grad_norm": 0.7469308972358704,
+      "learning_rate": 0.003,
+      "loss": 4.1301,
+      "step": 2617
+    },
+    {
+      "epoch": 0.02618,
+      "grad_norm": 0.7493590712547302,
+      "learning_rate": 0.003,
+      "loss": 4.1589,
+      "step": 2618
+    },
+    {
+      "epoch": 0.02619,
+      "grad_norm": 0.6884176731109619,
+      "learning_rate": 0.003,
+      "loss": 4.1704,
+      "step": 2619
+    },
+    {
+      "epoch": 0.0262,
+      "grad_norm": 0.6231204271316528,
+      "learning_rate": 0.003,
+      "loss": 4.1306,
+      "step": 2620
+    },
+    {
+      "epoch": 0.02621,
+      "grad_norm": 0.5638678669929504,
+      "learning_rate": 0.003,
+      "loss": 4.1548,
+      "step": 2621
+    },
+    {
+      "epoch": 0.02622,
+      "grad_norm": 0.6251258254051208,
+      "learning_rate": 0.003,
+      "loss": 4.1545,
+      "step": 2622
+    },
+    {
+      "epoch": 0.02623,
+      "grad_norm": 0.6526143550872803,
+      "learning_rate": 0.003,
+      "loss": 4.1604,
+      "step": 2623
+    },
+    {
+      "epoch": 0.02624,
+      "grad_norm": 0.7207937836647034,
+      "learning_rate": 0.003,
+      "loss": 4.1973,
+      "step": 2624
+    },
+    {
+      "epoch": 0.02625,
+      "grad_norm": 0.7291254997253418,
+      "learning_rate": 0.003,
+      "loss": 4.1951,
+      "step": 2625
+    },
+    {
+      "epoch": 0.02626,
+      "grad_norm": 0.6461899280548096,
+      "learning_rate": 0.003,
+      "loss": 4.1285,
+      "step": 2626
+    },
+    {
+      "epoch": 0.02627,
+      "grad_norm": 0.7499544620513916,
+      "learning_rate": 0.003,
+      "loss": 4.1449,
+      "step": 2627
+    },
+    {
+      "epoch": 0.02628,
+      "grad_norm": 0.7836819887161255,
+      "learning_rate": 0.003,
+      "loss": 4.1795,
+      "step": 2628
+    },
+    {
+      "epoch": 0.02629,
+      "grad_norm": 0.7647614479064941,
+      "learning_rate": 0.003,
+      "loss": 4.1169,
+      "step": 2629
+    },
+    {
+      "epoch": 0.0263,
+      "grad_norm": 0.6021679639816284,
+      "learning_rate": 0.003,
+      "loss": 4.1405,
+      "step": 2630
+    },
+    {
+      "epoch": 0.02631,
+      "grad_norm": 0.5774012804031372,
+      "learning_rate": 0.003,
+      "loss": 4.1643,
+      "step": 2631
+    },
+    {
+      "epoch": 0.02632,
+      "grad_norm": 0.5787044167518616,
+      "learning_rate": 0.003,
+      "loss": 4.1232,
+      "step": 2632
+    },
+    {
+      "epoch": 0.02633,
+      "grad_norm": 0.6322444677352905,
+      "learning_rate": 0.003,
+      "loss": 4.14,
+      "step": 2633
+    },
+    {
+      "epoch": 0.02634,
+      "grad_norm": 0.6454491019248962,
+      "learning_rate": 0.003,
+      "loss": 4.161,
+      "step": 2634
+    },
+    {
+      "epoch": 0.02635,
+      "grad_norm": 0.6432276368141174,
+      "learning_rate": 0.003,
+      "loss": 4.1552,
+      "step": 2635
+    },
+    {
+      "epoch": 0.02636,
+      "grad_norm": 0.7024194002151489,
+      "learning_rate": 0.003,
+      "loss": 4.1424,
+      "step": 2636
+    },
+    {
+      "epoch": 0.02637,
+      "grad_norm": 0.8076905012130737,
+      "learning_rate": 0.003,
+      "loss": 4.1774,
+      "step": 2637
+    },
+    {
+      "epoch": 0.02638,
+      "grad_norm": 0.7787435054779053,
+      "learning_rate": 0.003,
+      "loss": 4.1404,
+      "step": 2638
+    },
+    {
+      "epoch": 0.02639,
+      "grad_norm": 0.6584726572036743,
+      "learning_rate": 0.003,
+      "loss": 4.1295,
+      "step": 2639
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.6226917505264282,
+      "learning_rate": 0.003,
+      "loss": 4.1266,
+      "step": 2640
+    },
+    {
+      "epoch": 0.02641,
+      "grad_norm": 0.5553937554359436,
+      "learning_rate": 0.003,
+      "loss": 4.1385,
+      "step": 2641
+    },
+    {
+      "epoch": 0.02642,
+      "grad_norm": 0.5754583477973938,
+      "learning_rate": 0.003,
+      "loss": 4.1442,
+      "step": 2642
+    },
+    {
+      "epoch": 0.02643,
+      "grad_norm": 0.6242693662643433,
+      "learning_rate": 0.003,
+      "loss": 4.1325,
+      "step": 2643
+    },
+    {
+      "epoch": 0.02644,
+      "grad_norm": 0.6401628255844116,
+      "learning_rate": 0.003,
+      "loss": 4.1486,
+      "step": 2644
+    },
+    {
+      "epoch": 0.02645,
+      "grad_norm": 0.6735085844993591,
+      "learning_rate": 0.003,
+      "loss": 4.1629,
+      "step": 2645
+    },
+    {
+      "epoch": 0.02646,
+      "grad_norm": 0.7502104640007019,
+      "learning_rate": 0.003,
+      "loss": 4.1325,
+      "step": 2646
+    },
+    {
+      "epoch": 0.02647,
+      "grad_norm": 0.9201660752296448,
+      "learning_rate": 0.003,
+      "loss": 4.1609,
+      "step": 2647
+    },
+    {
+      "epoch": 0.02648,
+      "grad_norm": 0.8611843585968018,
+      "learning_rate": 0.003,
+      "loss": 4.1815,
+      "step": 2648
+    },
+    {
+      "epoch": 0.02649,
+      "grad_norm": 0.6259444355964661,
+      "learning_rate": 0.003,
+      "loss": 4.1591,
+      "step": 2649
+    },
+    {
+      "epoch": 0.0265,
+      "grad_norm": 0.6097885966300964,
+      "learning_rate": 0.003,
+      "loss": 4.1187,
+      "step": 2650
+    },
+    {
+      "epoch": 0.02651,
+      "grad_norm": 0.696152925491333,
+      "learning_rate": 0.003,
+      "loss": 4.1424,
+      "step": 2651
+    },
+    {
+      "epoch": 0.02652,
+      "grad_norm": 0.6412234902381897,
+      "learning_rate": 0.003,
+      "loss": 4.1679,
+      "step": 2652
+    },
+    {
+      "epoch": 0.02653,
+      "grad_norm": 0.6418108940124512,
+      "learning_rate": 0.003,
+      "loss": 4.1255,
+      "step": 2653
+    },
+    {
+      "epoch": 0.02654,
+      "grad_norm": 0.5534683465957642,
+      "learning_rate": 0.003,
+      "loss": 4.1338,
+      "step": 2654
+    },
+    {
+      "epoch": 0.02655,
+      "grad_norm": 0.5514299273490906,
+      "learning_rate": 0.003,
+      "loss": 4.1394,
+      "step": 2655
+    },
+    {
+      "epoch": 0.02656,
+      "grad_norm": 0.48489049077033997,
+      "learning_rate": 0.003,
+      "loss": 4.1108,
+      "step": 2656
+    },
+    {
+      "epoch": 0.02657,
+      "grad_norm": 0.5331873297691345,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 2657
+    },
+    {
+      "epoch": 0.02658,
+      "grad_norm": 0.5732285380363464,
+      "learning_rate": 0.003,
+      "loss": 4.149,
+      "step": 2658
+    },
+    {
+      "epoch": 0.02659,
+      "grad_norm": 0.6610122323036194,
+      "learning_rate": 0.003,
+      "loss": 4.1479,
+      "step": 2659
+    },
+    {
+      "epoch": 0.0266,
+      "grad_norm": 0.7037291526794434,
+      "learning_rate": 0.003,
+      "loss": 4.1333,
+      "step": 2660
+    },
+    {
+      "epoch": 0.02661,
+      "grad_norm": 0.8676835894584656,
+      "learning_rate": 0.003,
+      "loss": 4.161,
+      "step": 2661
+    },
+    {
+      "epoch": 0.02662,
+      "grad_norm": 0.9771322011947632,
+      "learning_rate": 0.003,
+      "loss": 4.1542,
+      "step": 2662
+    },
+    {
+      "epoch": 0.02663,
+      "grad_norm": 1.1501520872116089,
+      "learning_rate": 0.003,
+      "loss": 4.1488,
+      "step": 2663
+    },
+    {
+      "epoch": 0.02664,
+      "grad_norm": 0.8324311971664429,
+      "learning_rate": 0.003,
+      "loss": 4.1441,
+      "step": 2664
+    },
+    {
+      "epoch": 0.02665,
+      "grad_norm": 0.852374255657196,
+      "learning_rate": 0.003,
+      "loss": 4.1931,
+      "step": 2665
+    },
+    {
+      "epoch": 0.02666,
+      "grad_norm": 1.0820196866989136,
+      "learning_rate": 0.003,
+      "loss": 4.198,
+      "step": 2666
+    },
+    {
+      "epoch": 0.02667,
+      "grad_norm": 0.7490083575248718,
+      "learning_rate": 0.003,
+      "loss": 4.1632,
+      "step": 2667
+    },
+    {
+      "epoch": 0.02668,
+      "grad_norm": 0.679391622543335,
+      "learning_rate": 0.003,
+      "loss": 4.1331,
+      "step": 2668
+    },
+    {
+      "epoch": 0.02669,
+      "grad_norm": 0.6063788533210754,
+      "learning_rate": 0.003,
+      "loss": 4.1519,
+      "step": 2669
+    },
+    {
+      "epoch": 0.0267,
+      "grad_norm": 0.6576395034790039,
+      "learning_rate": 0.003,
+      "loss": 4.1517,
+      "step": 2670
+    },
+    {
+      "epoch": 0.02671,
+      "grad_norm": 0.8710551261901855,
+      "learning_rate": 0.003,
+      "loss": 4.1782,
+      "step": 2671
+    },
+    {
+      "epoch": 0.02672,
+      "grad_norm": 1.043360710144043,
+      "learning_rate": 0.003,
+      "loss": 4.179,
+      "step": 2672
+    },
+    {
+      "epoch": 0.02673,
+      "grad_norm": 0.8524636030197144,
+      "learning_rate": 0.003,
+      "loss": 4.1702,
+      "step": 2673
+    },
+    {
+      "epoch": 0.02674,
+      "grad_norm": 0.7167258858680725,
+      "learning_rate": 0.003,
+      "loss": 4.135,
+      "step": 2674
+    },
+    {
+      "epoch": 0.02675,
+      "grad_norm": 0.8229854106903076,
+      "learning_rate": 0.003,
+      "loss": 4.1392,
+      "step": 2675
+    },
+    {
+      "epoch": 0.02676,
+      "grad_norm": 0.8725418448448181,
+      "learning_rate": 0.003,
+      "loss": 4.1608,
+      "step": 2676
+    },
+    {
+      "epoch": 0.02677,
+      "grad_norm": 0.8751718997955322,
+      "learning_rate": 0.003,
+      "loss": 4.189,
+      "step": 2677
+    },
+    {
+      "epoch": 0.02678,
+      "grad_norm": 0.9639006853103638,
+      "learning_rate": 0.003,
+      "loss": 4.1897,
+      "step": 2678
+    },
+    {
+      "epoch": 0.02679,
+      "grad_norm": 0.9028424024581909,
+      "learning_rate": 0.003,
+      "loss": 4.1596,
+      "step": 2679
+    },
+    {
+      "epoch": 0.0268,
+      "grad_norm": 0.8419076204299927,
+      "learning_rate": 0.003,
+      "loss": 4.1719,
+      "step": 2680
+    },
+    {
+      "epoch": 0.02681,
+      "grad_norm": 0.8327796459197998,
+      "learning_rate": 0.003,
+      "loss": 4.1966,
+      "step": 2681
+    },
+    {
+      "epoch": 0.02682,
+      "grad_norm": 0.6412435173988342,
+      "learning_rate": 0.003,
+      "loss": 4.1667,
+      "step": 2682
+    },
+    {
+      "epoch": 0.02683,
+      "grad_norm": 0.5319331288337708,
+      "learning_rate": 0.003,
+      "loss": 4.1829,
+      "step": 2683
+    },
+    {
+      "epoch": 0.02684,
+      "grad_norm": 0.5018413662910461,
+      "learning_rate": 0.003,
+      "loss": 4.1559,
+      "step": 2684
+    },
+    {
+      "epoch": 0.02685,
+      "grad_norm": 0.4933565557003021,
+      "learning_rate": 0.003,
+      "loss": 4.1592,
+      "step": 2685
+    },
+    {
+      "epoch": 0.02686,
+      "grad_norm": 0.5294405221939087,
+      "learning_rate": 0.003,
+      "loss": 4.1404,
+      "step": 2686
+    },
+    {
+      "epoch": 0.02687,
+      "grad_norm": 0.6580759882926941,
+      "learning_rate": 0.003,
+      "loss": 4.1532,
+      "step": 2687
+    },
+    {
+      "epoch": 0.02688,
+      "grad_norm": 0.7823148369789124,
+      "learning_rate": 0.003,
+      "loss": 4.1334,
+      "step": 2688
+    },
+    {
+      "epoch": 0.02689,
+      "grad_norm": 0.8333227038383484,
+      "learning_rate": 0.003,
+      "loss": 4.1677,
+      "step": 2689
+    },
+    {
+      "epoch": 0.0269,
+      "grad_norm": 0.6743582487106323,
+      "learning_rate": 0.003,
+      "loss": 4.1651,
+      "step": 2690
+    },
+    {
+      "epoch": 0.02691,
+      "grad_norm": 0.5275107622146606,
+      "learning_rate": 0.003,
+      "loss": 4.154,
+      "step": 2691
+    },
+    {
+      "epoch": 0.02692,
+      "grad_norm": 0.585838258266449,
+      "learning_rate": 0.003,
+      "loss": 4.1589,
+      "step": 2692
+    },
+    {
+      "epoch": 0.02693,
+      "grad_norm": 0.6433229446411133,
+      "learning_rate": 0.003,
+      "loss": 4.1748,
+      "step": 2693
+    },
+    {
+      "epoch": 0.02694,
+      "grad_norm": 0.5260990858078003,
+      "learning_rate": 0.003,
+      "loss": 4.1297,
+      "step": 2694
+    },
+    {
+      "epoch": 0.02695,
+      "grad_norm": 0.49656516313552856,
+      "learning_rate": 0.003,
+      "loss": 4.1442,
+      "step": 2695
+    },
+    {
+      "epoch": 0.02696,
+      "grad_norm": 0.46913599967956543,
+      "learning_rate": 0.003,
+      "loss": 4.1441,
+      "step": 2696
+    },
+    {
+      "epoch": 0.02697,
+      "grad_norm": 0.4466596841812134,
+      "learning_rate": 0.003,
+      "loss": 4.1534,
+      "step": 2697
+    },
+    {
+      "epoch": 0.02698,
+      "grad_norm": 0.4291961193084717,
+      "learning_rate": 0.003,
+      "loss": 4.1359,
+      "step": 2698
+    },
+    {
+      "epoch": 0.02699,
+      "grad_norm": 0.43313321471214294,
+      "learning_rate": 0.003,
+      "loss": 4.1533,
+      "step": 2699
+    },
+    {
+      "epoch": 0.027,
+      "grad_norm": 0.4425189793109894,
+      "learning_rate": 0.003,
+      "loss": 4.1247,
+      "step": 2700
+    },
+    {
+      "epoch": 0.02701,
+      "grad_norm": 0.41129767894744873,
+      "learning_rate": 0.003,
+      "loss": 4.112,
+      "step": 2701
+    },
+    {
+      "epoch": 0.02702,
+      "grad_norm": 0.3028920888900757,
+      "learning_rate": 0.003,
+      "loss": 4.1613,
+      "step": 2702
+    },
+    {
+      "epoch": 0.02703,
+      "grad_norm": 0.36113715171813965,
+      "learning_rate": 0.003,
+      "loss": 4.1575,
+      "step": 2703
+    },
+    {
+      "epoch": 0.02704,
+      "grad_norm": 0.42614156007766724,
+      "learning_rate": 0.003,
+      "loss": 4.1013,
+      "step": 2704
+    },
+    {
+      "epoch": 0.02705,
+      "grad_norm": 0.4877123534679413,
+      "learning_rate": 0.003,
+      "loss": 4.1578,
+      "step": 2705
+    },
+    {
+      "epoch": 0.02706,
+      "grad_norm": 0.6430928707122803,
+      "learning_rate": 0.003,
+      "loss": 4.1234,
+      "step": 2706
+    },
+    {
+      "epoch": 0.02707,
+      "grad_norm": 0.7538681626319885,
+      "learning_rate": 0.003,
+      "loss": 4.1289,
+      "step": 2707
+    },
+    {
+      "epoch": 0.02708,
+      "grad_norm": 0.9270827770233154,
+      "learning_rate": 0.003,
+      "loss": 4.1612,
+      "step": 2708
+    },
+    {
+      "epoch": 0.02709,
+      "grad_norm": 1.0371726751327515,
+      "learning_rate": 0.003,
+      "loss": 4.1442,
+      "step": 2709
+    },
+    {
+      "epoch": 0.0271,
+      "grad_norm": 0.7252341508865356,
+      "learning_rate": 0.003,
+      "loss": 4.1418,
+      "step": 2710
+    },
+    {
+      "epoch": 0.02711,
+      "grad_norm": 0.6913469433784485,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 2711
+    },
+    {
+      "epoch": 0.02712,
+      "grad_norm": 0.7029024362564087,
+      "learning_rate": 0.003,
+      "loss": 4.142,
+      "step": 2712
+    },
+    {
+      "epoch": 0.02713,
+      "grad_norm": 0.6771293878555298,
+      "learning_rate": 0.003,
+      "loss": 4.1313,
+      "step": 2713
+    },
+    {
+      "epoch": 0.02714,
+      "grad_norm": 0.6258983016014099,
+      "learning_rate": 0.003,
+      "loss": 4.1322,
+      "step": 2714
+    },
+    {
+      "epoch": 0.02715,
+      "grad_norm": 0.7235097289085388,
+      "learning_rate": 0.003,
+      "loss": 4.1319,
+      "step": 2715
+    },
+    {
+      "epoch": 0.02716,
+      "grad_norm": 0.8248934745788574,
+      "learning_rate": 0.003,
+      "loss": 4.1475,
+      "step": 2716
+    },
+    {
+      "epoch": 0.02717,
+      "grad_norm": 0.8002756237983704,
+      "learning_rate": 0.003,
+      "loss": 4.1502,
+      "step": 2717
+    },
+    {
+      "epoch": 0.02718,
+      "grad_norm": 0.7360194325447083,
+      "learning_rate": 0.003,
+      "loss": 4.1647,
+      "step": 2718
+    },
+    {
+      "epoch": 0.02719,
+      "grad_norm": 0.8694934844970703,
+      "learning_rate": 0.003,
+      "loss": 4.145,
+      "step": 2719
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.8528920412063599,
+      "learning_rate": 0.003,
+      "loss": 4.137,
+      "step": 2720
+    },
+    {
+      "epoch": 0.02721,
+      "grad_norm": 1.0115561485290527,
+      "learning_rate": 0.003,
+      "loss": 4.1594,
+      "step": 2721
+    },
+    {
+      "epoch": 0.02722,
+      "grad_norm": 1.050325632095337,
+      "learning_rate": 0.003,
+      "loss": 4.1686,
+      "step": 2722
+    },
+    {
+      "epoch": 0.02723,
+      "grad_norm": 0.8528364896774292,
+      "learning_rate": 0.003,
+      "loss": 4.1629,
+      "step": 2723
+    },
+    {
+      "epoch": 0.02724,
+      "grad_norm": 0.8148915767669678,
+      "learning_rate": 0.003,
+      "loss": 4.1665,
+      "step": 2724
+    },
+    {
+      "epoch": 0.02725,
+      "grad_norm": 0.707343339920044,
+      "learning_rate": 0.003,
+      "loss": 4.1539,
+      "step": 2725
+    },
+    {
+      "epoch": 0.02726,
+      "grad_norm": 0.7041299939155579,
+      "learning_rate": 0.003,
+      "loss": 4.1281,
+      "step": 2726
+    },
+    {
+      "epoch": 0.02727,
+      "grad_norm": 0.5910518765449524,
+      "learning_rate": 0.003,
+      "loss": 4.1551,
+      "step": 2727
+    },
+    {
+      "epoch": 0.02728,
+      "grad_norm": 0.6046881079673767,
+      "learning_rate": 0.003,
+      "loss": 4.1422,
+      "step": 2728
+    },
+    {
+      "epoch": 0.02729,
+      "grad_norm": 0.5683781504631042,
+      "learning_rate": 0.003,
+      "loss": 4.1716,
+      "step": 2729
+    },
+    {
+      "epoch": 0.0273,
+      "grad_norm": 0.5593625903129578,
+      "learning_rate": 0.003,
+      "loss": 4.143,
+      "step": 2730
+    },
+    {
+      "epoch": 0.02731,
+      "grad_norm": 0.5543289184570312,
+      "learning_rate": 0.003,
+      "loss": 4.1417,
+      "step": 2731
+    },
+    {
+      "epoch": 0.02732,
+      "grad_norm": 0.5714956521987915,
+      "learning_rate": 0.003,
+      "loss": 4.135,
+      "step": 2732
+    },
+    {
+      "epoch": 0.02733,
+      "grad_norm": 0.5906261205673218,
+      "learning_rate": 0.003,
+      "loss": 4.1645,
+      "step": 2733
+    },
+    {
+      "epoch": 0.02734,
+      "grad_norm": 0.7874712944030762,
+      "learning_rate": 0.003,
+      "loss": 4.1208,
+      "step": 2734
+    },
+    {
+      "epoch": 0.02735,
+      "grad_norm": 0.9163006544113159,
+      "learning_rate": 0.003,
+      "loss": 4.174,
+      "step": 2735
+    },
+    {
+      "epoch": 0.02736,
+      "grad_norm": 0.8792824745178223,
+      "learning_rate": 0.003,
+      "loss": 4.1495,
+      "step": 2736
+    },
+    {
+      "epoch": 0.02737,
+      "grad_norm": 0.7901833057403564,
+      "learning_rate": 0.003,
+      "loss": 4.1377,
+      "step": 2737
+    },
+    {
+      "epoch": 0.02738,
+      "grad_norm": 0.8548569083213806,
+      "learning_rate": 0.003,
+      "loss": 4.1629,
+      "step": 2738
+    },
+    {
+      "epoch": 0.02739,
+      "grad_norm": 0.7134705185890198,
+      "learning_rate": 0.003,
+      "loss": 4.1544,
+      "step": 2739
+    },
+    {
+      "epoch": 0.0274,
+      "grad_norm": 0.6430677175521851,
+      "learning_rate": 0.003,
+      "loss": 4.139,
+      "step": 2740
+    },
+    {
+      "epoch": 0.02741,
+      "grad_norm": 0.5635836720466614,
+      "learning_rate": 0.003,
+      "loss": 4.1495,
+      "step": 2741
+    },
+    {
+      "epoch": 0.02742,
+      "grad_norm": 0.5489112138748169,
+      "learning_rate": 0.003,
+      "loss": 4.1367,
+      "step": 2742
+    },
+    {
+      "epoch": 0.02743,
+      "grad_norm": 0.5366271734237671,
+      "learning_rate": 0.003,
+      "loss": 4.1572,
+      "step": 2743
+    },
+    {
+      "epoch": 0.02744,
+      "grad_norm": 0.489491730928421,
+      "learning_rate": 0.003,
+      "loss": 4.1372,
+      "step": 2744
+    },
+    {
+      "epoch": 0.02745,
+      "grad_norm": 0.47551363706588745,
+      "learning_rate": 0.003,
+      "loss": 4.1468,
+      "step": 2745
+    },
+    {
+      "epoch": 0.02746,
+      "grad_norm": 0.39822816848754883,
+      "learning_rate": 0.003,
+      "loss": 4.1191,
+      "step": 2746
+    },
+    {
+      "epoch": 0.02747,
+      "grad_norm": 0.3974153399467468,
+      "learning_rate": 0.003,
+      "loss": 4.1538,
+      "step": 2747
+    },
+    {
+      "epoch": 0.02748,
+      "grad_norm": 0.47189366817474365,
+      "learning_rate": 0.003,
+      "loss": 4.107,
+      "step": 2748
+    },
+    {
+      "epoch": 0.02749,
+      "grad_norm": 0.5153442025184631,
+      "learning_rate": 0.003,
+      "loss": 4.1113,
+      "step": 2749
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.565412700176239,
+      "learning_rate": 0.003,
+      "loss": 4.1461,
+      "step": 2750
+    },
+    {
+      "epoch": 0.02751,
+      "grad_norm": 0.6354933977127075,
+      "learning_rate": 0.003,
+      "loss": 4.1638,
+      "step": 2751
+    },
+    {
+      "epoch": 0.02752,
+      "grad_norm": 0.6897895932197571,
+      "learning_rate": 0.003,
+      "loss": 4.1494,
+      "step": 2752
+    },
+    {
+      "epoch": 0.02753,
+      "grad_norm": 0.6377696394920349,
+      "learning_rate": 0.003,
+      "loss": 4.1306,
+      "step": 2753
+    },
+    {
+      "epoch": 0.02754,
+      "grad_norm": 0.5976959466934204,
+      "learning_rate": 0.003,
+      "loss": 4.1139,
+      "step": 2754
+    },
+    {
+      "epoch": 0.02755,
+      "grad_norm": 0.702274739742279,
+      "learning_rate": 0.003,
+      "loss": 4.1146,
+      "step": 2755
+    },
+    {
+      "epoch": 0.02756,
+      "grad_norm": 0.7118279337882996,
+      "learning_rate": 0.003,
+      "loss": 4.1439,
+      "step": 2756
+    },
+    {
+      "epoch": 0.02757,
+      "grad_norm": 0.660001277923584,
+      "learning_rate": 0.003,
+      "loss": 4.1563,
+      "step": 2757
+    },
+    {
+      "epoch": 0.02758,
+      "grad_norm": 0.7159073948860168,
+      "learning_rate": 0.003,
+      "loss": 4.106,
+      "step": 2758
+    },
+    {
+      "epoch": 0.02759,
+      "grad_norm": 0.8859509825706482,
+      "learning_rate": 0.003,
+      "loss": 4.1452,
+      "step": 2759
+    },
+    {
+      "epoch": 0.0276,
+      "grad_norm": 0.9749924540519714,
+      "learning_rate": 0.003,
+      "loss": 4.1562,
+      "step": 2760
+    },
+    {
+      "epoch": 0.02761,
+      "grad_norm": 0.9062779545783997,
+      "learning_rate": 0.003,
+      "loss": 4.1742,
+      "step": 2761
+    },
+    {
+      "epoch": 0.02762,
+      "grad_norm": 0.8129538893699646,
+      "learning_rate": 0.003,
+      "loss": 4.1407,
+      "step": 2762
+    },
+    {
+      "epoch": 0.02763,
+      "grad_norm": 0.7007825970649719,
+      "learning_rate": 0.003,
+      "loss": 4.1479,
+      "step": 2763
+    },
+    {
+      "epoch": 0.02764,
+      "grad_norm": 0.8174701929092407,
+      "learning_rate": 0.003,
+      "loss": 4.1454,
+      "step": 2764
+    },
+    {
+      "epoch": 0.02765,
+      "grad_norm": 0.8394188284873962,
+      "learning_rate": 0.003,
+      "loss": 4.1542,
+      "step": 2765
+    },
+    {
+      "epoch": 0.02766,
+      "grad_norm": 0.8616805672645569,
+      "learning_rate": 0.003,
+      "loss": 4.1254,
+      "step": 2766
+    },
+    {
+      "epoch": 0.02767,
+      "grad_norm": 0.8441250920295715,
+      "learning_rate": 0.003,
+      "loss": 4.1869,
+      "step": 2767
+    },
+    {
+      "epoch": 0.02768,
+      "grad_norm": 0.7080286741256714,
+      "learning_rate": 0.003,
+      "loss": 4.1142,
+      "step": 2768
+    },
+    {
+      "epoch": 0.02769,
+      "grad_norm": 0.708198070526123,
+      "learning_rate": 0.003,
+      "loss": 4.174,
+      "step": 2769
+    },
+    {
+      "epoch": 0.0277,
+      "grad_norm": 0.7906506061553955,
+      "learning_rate": 0.003,
+      "loss": 4.1426,
+      "step": 2770
+    },
+    {
+      "epoch": 0.02771,
+      "grad_norm": 0.7239770889282227,
+      "learning_rate": 0.003,
+      "loss": 4.1314,
+      "step": 2771
+    },
+    {
+      "epoch": 0.02772,
+      "grad_norm": 0.5857943296432495,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 2772
+    },
+    {
+      "epoch": 0.02773,
+      "grad_norm": 0.5903287529945374,
+      "learning_rate": 0.003,
+      "loss": 4.1512,
+      "step": 2773
+    },
+    {
+      "epoch": 0.02774,
+      "grad_norm": 0.6580007672309875,
+      "learning_rate": 0.003,
+      "loss": 4.176,
+      "step": 2774
+    },
+    {
+      "epoch": 0.02775,
+      "grad_norm": 0.6529462933540344,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 2775
+    },
+    {
+      "epoch": 0.02776,
+      "grad_norm": 0.7036746144294739,
+      "learning_rate": 0.003,
+      "loss": 4.138,
+      "step": 2776
+    },
+    {
+      "epoch": 0.02777,
+      "grad_norm": 0.7649575471878052,
+      "learning_rate": 0.003,
+      "loss": 4.1221,
+      "step": 2777
+    },
+    {
+      "epoch": 0.02778,
+      "grad_norm": 0.6718697547912598,
+      "learning_rate": 0.003,
+      "loss": 4.1607,
+      "step": 2778
+    },
+    {
+      "epoch": 0.02779,
+      "grad_norm": 0.587509036064148,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 2779
+    },
+    {
+      "epoch": 0.0278,
+      "grad_norm": 0.5817381143569946,
+      "learning_rate": 0.003,
+      "loss": 4.1256,
+      "step": 2780
+    },
+    {
+      "epoch": 0.02781,
+      "grad_norm": 0.5770044326782227,
+      "learning_rate": 0.003,
+      "loss": 4.1078,
+      "step": 2781
+    },
+    {
+      "epoch": 0.02782,
+      "grad_norm": 0.687736451625824,
+      "learning_rate": 0.003,
+      "loss": 4.1341,
+      "step": 2782
+    },
+    {
+      "epoch": 0.02783,
+      "grad_norm": 0.6473103165626526,
+      "learning_rate": 0.003,
+      "loss": 4.1386,
+      "step": 2783
+    },
+    {
+      "epoch": 0.02784,
+      "grad_norm": 0.5239824056625366,
+      "learning_rate": 0.003,
+      "loss": 4.1293,
+      "step": 2784
+    },
+    {
+      "epoch": 0.02785,
+      "grad_norm": 0.4588249921798706,
+      "learning_rate": 0.003,
+      "loss": 4.1447,
+      "step": 2785
+    },
+    {
+      "epoch": 0.02786,
+      "grad_norm": 0.4880278408527374,
+      "learning_rate": 0.003,
+      "loss": 4.1394,
+      "step": 2786
+    },
+    {
+      "epoch": 0.02787,
+      "grad_norm": 0.502045750617981,
+      "learning_rate": 0.003,
+      "loss": 4.1396,
+      "step": 2787
+    },
+    {
+      "epoch": 0.02788,
+      "grad_norm": 0.6067155003547668,
+      "learning_rate": 0.003,
+      "loss": 4.1294,
+      "step": 2788
+    },
+    {
+      "epoch": 0.02789,
+      "grad_norm": 0.7814184427261353,
+      "learning_rate": 0.003,
+      "loss": 4.1409,
+      "step": 2789
+    },
+    {
+      "epoch": 0.0279,
+      "grad_norm": 1.1954095363616943,
+      "learning_rate": 0.003,
+      "loss": 4.132,
+      "step": 2790
+    },
+    {
+      "epoch": 0.02791,
+      "grad_norm": 0.7128247022628784,
+      "learning_rate": 0.003,
+      "loss": 4.1352,
+      "step": 2791
+    },
+    {
+      "epoch": 0.02792,
+      "grad_norm": 0.6396022439002991,
+      "learning_rate": 0.003,
+      "loss": 4.1495,
+      "step": 2792
+    },
+    {
+      "epoch": 0.02793,
+      "grad_norm": 0.8452949523925781,
+      "learning_rate": 0.003,
+      "loss": 4.1513,
+      "step": 2793
+    },
+    {
+      "epoch": 0.02794,
+      "grad_norm": 0.7837446928024292,
+      "learning_rate": 0.003,
+      "loss": 4.1464,
+      "step": 2794
+    },
+    {
+      "epoch": 0.02795,
+      "grad_norm": 0.7633419036865234,
+      "learning_rate": 0.003,
+      "loss": 4.1621,
+      "step": 2795
+    },
+    {
+      "epoch": 0.02796,
+      "grad_norm": 0.7906386852264404,
+      "learning_rate": 0.003,
+      "loss": 4.1576,
+      "step": 2796
+    },
+    {
+      "epoch": 0.02797,
+      "grad_norm": 0.7294182777404785,
+      "learning_rate": 0.003,
+      "loss": 4.1672,
+      "step": 2797
+    },
+    {
+      "epoch": 0.02798,
+      "grad_norm": 0.7420005798339844,
+      "learning_rate": 0.003,
+      "loss": 4.1646,
+      "step": 2798
+    },
+    {
+      "epoch": 0.02799,
+      "grad_norm": 0.722224235534668,
+      "learning_rate": 0.003,
+      "loss": 4.1492,
+      "step": 2799
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.677977442741394,
+      "learning_rate": 0.003,
+      "loss": 4.1596,
+      "step": 2800
+    },
+    {
+      "epoch": 0.02801,
+      "grad_norm": 0.6493065357208252,
+      "learning_rate": 0.003,
+      "loss": 4.1339,
+      "step": 2801
+    },
+    {
+      "epoch": 0.02802,
+      "grad_norm": 0.6755994558334351,
+      "learning_rate": 0.003,
+      "loss": 4.1363,
+      "step": 2802
+    },
+    {
+      "epoch": 0.02803,
+      "grad_norm": 0.6203683614730835,
+      "learning_rate": 0.003,
+      "loss": 4.1536,
+      "step": 2803
+    },
+    {
+      "epoch": 0.02804,
+      "grad_norm": 0.5352902412414551,
+      "learning_rate": 0.003,
+      "loss": 4.1231,
+      "step": 2804
+    },
+    {
+      "epoch": 0.02805,
+      "grad_norm": 0.4998801350593567,
+      "learning_rate": 0.003,
+      "loss": 4.1579,
+      "step": 2805
+    },
+    {
+      "epoch": 0.02806,
+      "grad_norm": 0.43145909905433655,
+      "learning_rate": 0.003,
+      "loss": 4.13,
+      "step": 2806
+    },
+    {
+      "epoch": 0.02807,
+      "grad_norm": 0.4628302752971649,
+      "learning_rate": 0.003,
+      "loss": 4.1337,
+      "step": 2807
+    },
+    {
+      "epoch": 0.02808,
+      "grad_norm": 0.4608153998851776,
+      "learning_rate": 0.003,
+      "loss": 4.1137,
+      "step": 2808
+    },
+    {
+      "epoch": 0.02809,
+      "grad_norm": 0.47452715039253235,
+      "learning_rate": 0.003,
+      "loss": 4.1551,
+      "step": 2809
+    },
+    {
+      "epoch": 0.0281,
+      "grad_norm": 0.5173685550689697,
+      "learning_rate": 0.003,
+      "loss": 4.1083,
+      "step": 2810
+    },
+    {
+      "epoch": 0.02811,
+      "grad_norm": 0.5137503147125244,
+      "learning_rate": 0.003,
+      "loss": 4.1165,
+      "step": 2811
+    },
+    {
+      "epoch": 0.02812,
+      "grad_norm": 0.5507237315177917,
+      "learning_rate": 0.003,
+      "loss": 4.1344,
+      "step": 2812
+    },
+    {
+      "epoch": 0.02813,
+      "grad_norm": 0.667034387588501,
+      "learning_rate": 0.003,
+      "loss": 4.1267,
+      "step": 2813
+    },
+    {
+      "epoch": 0.02814,
+      "grad_norm": 0.8353255987167358,
+      "learning_rate": 0.003,
+      "loss": 4.1329,
+      "step": 2814
+    },
+    {
+      "epoch": 0.02815,
+      "grad_norm": 0.8815486431121826,
+      "learning_rate": 0.003,
+      "loss": 4.1736,
+      "step": 2815
+    },
+    {
+      "epoch": 0.02816,
+      "grad_norm": 0.7531535029411316,
+      "learning_rate": 0.003,
+      "loss": 4.1477,
+      "step": 2816
+    },
+    {
+      "epoch": 0.02817,
+      "grad_norm": 0.6786168217658997,
+      "learning_rate": 0.003,
+      "loss": 4.1043,
+      "step": 2817
+    },
+    {
+      "epoch": 0.02818,
+      "grad_norm": 0.5562347173690796,
+      "learning_rate": 0.003,
+      "loss": 4.1127,
+      "step": 2818
+    },
+    {
+      "epoch": 0.02819,
+      "grad_norm": 0.6471225619316101,
+      "learning_rate": 0.003,
+      "loss": 4.1416,
+      "step": 2819
+    },
+    {
+      "epoch": 0.0282,
+      "grad_norm": 0.7505398392677307,
+      "learning_rate": 0.003,
+      "loss": 4.1422,
+      "step": 2820
+    },
+    {
+      "epoch": 0.02821,
+      "grad_norm": 0.845917820930481,
+      "learning_rate": 0.003,
+      "loss": 4.1632,
+      "step": 2821
+    },
+    {
+      "epoch": 0.02822,
+      "grad_norm": 0.8887820839881897,
+      "learning_rate": 0.003,
+      "loss": 4.1419,
+      "step": 2822
+    },
+    {
+      "epoch": 0.02823,
+      "grad_norm": 0.953726589679718,
+      "learning_rate": 0.003,
+      "loss": 4.1686,
+      "step": 2823
+    },
+    {
+      "epoch": 0.02824,
+      "grad_norm": 0.9745505452156067,
+      "learning_rate": 0.003,
+      "loss": 4.1627,
+      "step": 2824
+    },
+    {
+      "epoch": 0.02825,
+      "grad_norm": 0.9194844961166382,
+      "learning_rate": 0.003,
+      "loss": 4.1655,
+      "step": 2825
+    },
+    {
+      "epoch": 0.02826,
+      "grad_norm": 0.9426382184028625,
+      "learning_rate": 0.003,
+      "loss": 4.1439,
+      "step": 2826
+    },
+    {
+      "epoch": 0.02827,
+      "grad_norm": 0.8329664468765259,
+      "learning_rate": 0.003,
+      "loss": 4.1305,
+      "step": 2827
+    },
+    {
+      "epoch": 0.02828,
+      "grad_norm": 0.7950755953788757,
+      "learning_rate": 0.003,
+      "loss": 4.1685,
+      "step": 2828
+    },
+    {
+      "epoch": 0.02829,
+      "grad_norm": 0.7973052263259888,
+      "learning_rate": 0.003,
+      "loss": 4.1943,
+      "step": 2829
+    },
+    {
+      "epoch": 0.0283,
+      "grad_norm": 0.7938761115074158,
+      "learning_rate": 0.003,
+      "loss": 4.1561,
+      "step": 2830
+    },
+    {
+      "epoch": 0.02831,
+      "grad_norm": 0.900173008441925,
+      "learning_rate": 0.003,
+      "loss": 4.1604,
+      "step": 2831
+    },
+    {
+      "epoch": 0.02832,
+      "grad_norm": 0.9947631359100342,
+      "learning_rate": 0.003,
+      "loss": 4.1755,
+      "step": 2832
+    },
+    {
+      "epoch": 0.02833,
+      "grad_norm": 0.8765170574188232,
+      "learning_rate": 0.003,
+      "loss": 4.1539,
+      "step": 2833
+    },
+    {
+      "epoch": 0.02834,
+      "grad_norm": 0.8053457140922546,
+      "learning_rate": 0.003,
+      "loss": 4.1358,
+      "step": 2834
+    },
+    {
+      "epoch": 0.02835,
+      "grad_norm": 0.728701651096344,
+      "learning_rate": 0.003,
+      "loss": 4.147,
+      "step": 2835
+    },
+    {
+      "epoch": 0.02836,
+      "grad_norm": 0.6180861592292786,
+      "learning_rate": 0.003,
+      "loss": 4.1615,
+      "step": 2836
+    },
+    {
+      "epoch": 0.02837,
+      "grad_norm": 0.5929837226867676,
+      "learning_rate": 0.003,
+      "loss": 4.1342,
+      "step": 2837
+    },
+    {
+      "epoch": 0.02838,
+      "grad_norm": 0.5861077308654785,
+      "learning_rate": 0.003,
+      "loss": 4.0992,
+      "step": 2838
+    },
+    {
+      "epoch": 0.02839,
+      "grad_norm": 0.5293443202972412,
+      "learning_rate": 0.003,
+      "loss": 4.0976,
+      "step": 2839
+    },
+    {
+      "epoch": 0.0284,
+      "grad_norm": 0.5219327211380005,
+      "learning_rate": 0.003,
+      "loss": 4.1106,
+      "step": 2840
+    },
+    {
+      "epoch": 0.02841,
+      "grad_norm": 0.494223028421402,
+      "learning_rate": 0.003,
+      "loss": 4.1465,
+      "step": 2841
+    },
+    {
+      "epoch": 0.02842,
+      "grad_norm": 0.5202710628509521,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 2842
+    },
+    {
+      "epoch": 0.02843,
+      "grad_norm": 0.6077046990394592,
+      "learning_rate": 0.003,
+      "loss": 4.1525,
+      "step": 2843
+    },
+    {
+      "epoch": 0.02844,
+      "grad_norm": 0.6248916983604431,
+      "learning_rate": 0.003,
+      "loss": 4.1488,
+      "step": 2844
+    },
+    {
+      "epoch": 0.02845,
+      "grad_norm": 0.5447239875793457,
+      "learning_rate": 0.003,
+      "loss": 4.124,
+      "step": 2845
+    },
+    {
+      "epoch": 0.02846,
+      "grad_norm": 0.5102211236953735,
+      "learning_rate": 0.003,
+      "loss": 4.1178,
+      "step": 2846
+    },
+    {
+      "epoch": 0.02847,
+      "grad_norm": 0.47447827458381653,
+      "learning_rate": 0.003,
+      "loss": 4.171,
+      "step": 2847
+    },
+    {
+      "epoch": 0.02848,
+      "grad_norm": 0.5006018877029419,
+      "learning_rate": 0.003,
+      "loss": 4.1279,
+      "step": 2848
+    },
+    {
+      "epoch": 0.02849,
+      "grad_norm": 0.6120002865791321,
+      "learning_rate": 0.003,
+      "loss": 4.1185,
+      "step": 2849
+    },
+    {
+      "epoch": 0.0285,
+      "grad_norm": 0.6846702098846436,
+      "learning_rate": 0.003,
+      "loss": 4.1242,
+      "step": 2850
+    },
+    {
+      "epoch": 0.02851,
+      "grad_norm": 0.7738217115402222,
+      "learning_rate": 0.003,
+      "loss": 4.1646,
+      "step": 2851
+    },
+    {
+      "epoch": 0.02852,
+      "grad_norm": 0.8055019974708557,
+      "learning_rate": 0.003,
+      "loss": 4.1502,
+      "step": 2852
+    },
+    {
+      "epoch": 0.02853,
+      "grad_norm": 0.8890880942344666,
+      "learning_rate": 0.003,
+      "loss": 4.1216,
+      "step": 2853
+    },
+    {
+      "epoch": 0.02854,
+      "grad_norm": 0.8885093927383423,
+      "learning_rate": 0.003,
+      "loss": 4.1639,
+      "step": 2854
+    },
+    {
+      "epoch": 0.02855,
+      "grad_norm": 0.8279180526733398,
+      "learning_rate": 0.003,
+      "loss": 4.1584,
+      "step": 2855
+    },
+    {
+      "epoch": 0.02856,
+      "grad_norm": 0.7752195596694946,
+      "learning_rate": 0.003,
+      "loss": 4.1614,
+      "step": 2856
+    },
+    {
+      "epoch": 0.02857,
+      "grad_norm": 0.8694232702255249,
+      "learning_rate": 0.003,
+      "loss": 4.1419,
+      "step": 2857
+    },
+    {
+      "epoch": 0.02858,
+      "grad_norm": 0.9445026516914368,
+      "learning_rate": 0.003,
+      "loss": 4.1366,
+      "step": 2858
+    },
+    {
+      "epoch": 0.02859,
+      "grad_norm": 0.8522076606750488,
+      "learning_rate": 0.003,
+      "loss": 4.1462,
+      "step": 2859
+    },
+    {
+      "epoch": 0.0286,
+      "grad_norm": 0.8931873440742493,
+      "learning_rate": 0.003,
+      "loss": 4.1628,
+      "step": 2860
+    },
+    {
+      "epoch": 0.02861,
+      "grad_norm": 0.8646109700202942,
+      "learning_rate": 0.003,
+      "loss": 4.1343,
+      "step": 2861
+    },
+    {
+      "epoch": 0.02862,
+      "grad_norm": 0.7844035625457764,
+      "learning_rate": 0.003,
+      "loss": 4.1497,
+      "step": 2862
+    },
+    {
+      "epoch": 0.02863,
+      "grad_norm": 0.7147074341773987,
+      "learning_rate": 0.003,
+      "loss": 4.169,
+      "step": 2863
+    },
+    {
+      "epoch": 0.02864,
+      "grad_norm": 0.7001998424530029,
+      "learning_rate": 0.003,
+      "loss": 4.1435,
+      "step": 2864
+    },
+    {
+      "epoch": 0.02865,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.003,
+      "loss": 4.1756,
+      "step": 2865
+    },
+    {
+      "epoch": 0.02866,
+      "grad_norm": 0.6479020714759827,
+      "learning_rate": 0.003,
+      "loss": 4.1532,
+      "step": 2866
+    },
+    {
+      "epoch": 0.02867,
+      "grad_norm": 0.720973014831543,
+      "learning_rate": 0.003,
+      "loss": 4.1431,
+      "step": 2867
+    },
+    {
+      "epoch": 0.02868,
+      "grad_norm": 0.6694818735122681,
+      "learning_rate": 0.003,
+      "loss": 4.1474,
+      "step": 2868
+    },
+    {
+      "epoch": 0.02869,
+      "grad_norm": 0.5998570919036865,
+      "learning_rate": 0.003,
+      "loss": 4.1334,
+      "step": 2869
+    },
+    {
+      "epoch": 0.0287,
+      "grad_norm": 0.5683268308639526,
+      "learning_rate": 0.003,
+      "loss": 4.1536,
+      "step": 2870
+    },
+    {
+      "epoch": 0.02871,
+      "grad_norm": 0.5723297595977783,
+      "learning_rate": 0.003,
+      "loss": 4.1376,
+      "step": 2871
+    },
+    {
+      "epoch": 0.02872,
+      "grad_norm": 0.5388701558113098,
+      "learning_rate": 0.003,
+      "loss": 4.1708,
+      "step": 2872
+    },
+    {
+      "epoch": 0.02873,
+      "grad_norm": 0.46427324414253235,
+      "learning_rate": 0.003,
+      "loss": 4.1389,
+      "step": 2873
+    },
+    {
+      "epoch": 0.02874,
+      "grad_norm": 0.4973464012145996,
+      "learning_rate": 0.003,
+      "loss": 4.1539,
+      "step": 2874
+    },
+    {
+      "epoch": 0.02875,
+      "grad_norm": 0.5175668001174927,
+      "learning_rate": 0.003,
+      "loss": 4.1568,
+      "step": 2875
+    },
+    {
+      "epoch": 0.02876,
+      "grad_norm": 0.5334407091140747,
+      "learning_rate": 0.003,
+      "loss": 4.1386,
+      "step": 2876
+    },
+    {
+      "epoch": 0.02877,
+      "grad_norm": 0.5334872007369995,
+      "learning_rate": 0.003,
+      "loss": 4.1197,
+      "step": 2877
+    },
+    {
+      "epoch": 0.02878,
+      "grad_norm": 0.5646567344665527,
+      "learning_rate": 0.003,
+      "loss": 4.1174,
+      "step": 2878
+    },
+    {
+      "epoch": 0.02879,
+      "grad_norm": 0.560286819934845,
+      "learning_rate": 0.003,
+      "loss": 4.1083,
+      "step": 2879
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.6549745202064514,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 2880
+    },
+    {
+      "epoch": 0.02881,
+      "grad_norm": 0.8416467905044556,
+      "learning_rate": 0.003,
+      "loss": 4.1365,
+      "step": 2881
+    },
+    {
+      "epoch": 0.02882,
+      "grad_norm": 0.7693246603012085,
+      "learning_rate": 0.003,
+      "loss": 4.1523,
+      "step": 2882
+    },
+    {
+      "epoch": 0.02883,
+      "grad_norm": 0.6049852967262268,
+      "learning_rate": 0.003,
+      "loss": 4.1296,
+      "step": 2883
+    },
+    {
+      "epoch": 0.02884,
+      "grad_norm": 0.561359703540802,
+      "learning_rate": 0.003,
+      "loss": 4.1649,
+      "step": 2884
+    },
+    {
+      "epoch": 0.02885,
+      "grad_norm": 0.630006730556488,
+      "learning_rate": 0.003,
+      "loss": 4.1114,
+      "step": 2885
+    },
+    {
+      "epoch": 0.02886,
+      "grad_norm": 0.6453554630279541,
+      "learning_rate": 0.003,
+      "loss": 4.1416,
+      "step": 2886
+    },
+    {
+      "epoch": 0.02887,
+      "grad_norm": 0.5455800890922546,
+      "learning_rate": 0.003,
+      "loss": 4.1255,
+      "step": 2887
+    },
+    {
+      "epoch": 0.02888,
+      "grad_norm": 0.4928850531578064,
+      "learning_rate": 0.003,
+      "loss": 4.1194,
+      "step": 2888
+    },
+    {
+      "epoch": 0.02889,
+      "grad_norm": 0.4772005081176758,
+      "learning_rate": 0.003,
+      "loss": 4.1074,
+      "step": 2889
+    },
+    {
+      "epoch": 0.0289,
+      "grad_norm": 0.6117534041404724,
+      "learning_rate": 0.003,
+      "loss": 4.1446,
+      "step": 2890
+    },
+    {
+      "epoch": 0.02891,
+      "grad_norm": 0.8205873370170593,
+      "learning_rate": 0.003,
+      "loss": 4.1647,
+      "step": 2891
+    },
+    {
+      "epoch": 0.02892,
+      "grad_norm": 0.8640782833099365,
+      "learning_rate": 0.003,
+      "loss": 4.1212,
+      "step": 2892
+    },
+    {
+      "epoch": 0.02893,
+      "grad_norm": 0.8006044626235962,
+      "learning_rate": 0.003,
+      "loss": 4.13,
+      "step": 2893
+    },
+    {
+      "epoch": 0.02894,
+      "grad_norm": 0.8116544485092163,
+      "learning_rate": 0.003,
+      "loss": 4.1376,
+      "step": 2894
+    },
+    {
+      "epoch": 0.02895,
+      "grad_norm": 0.8600641489028931,
+      "learning_rate": 0.003,
+      "loss": 4.1864,
+      "step": 2895
+    },
+    {
+      "epoch": 0.02896,
+      "grad_norm": 0.6702893376350403,
+      "learning_rate": 0.003,
+      "loss": 4.1344,
+      "step": 2896
+    },
+    {
+      "epoch": 0.02897,
+      "grad_norm": 0.6657308340072632,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 2897
+    },
+    {
+      "epoch": 0.02898,
+      "grad_norm": 0.662657618522644,
+      "learning_rate": 0.003,
+      "loss": 4.1623,
+      "step": 2898
+    },
+    {
+      "epoch": 0.02899,
+      "grad_norm": 0.7254528999328613,
+      "learning_rate": 0.003,
+      "loss": 4.148,
+      "step": 2899
+    },
+    {
+      "epoch": 0.029,
+      "grad_norm": 0.7490177750587463,
+      "learning_rate": 0.003,
+      "loss": 4.1372,
+      "step": 2900
+    },
+    {
+      "epoch": 0.02901,
+      "grad_norm": 0.7398015856742859,
+      "learning_rate": 0.003,
+      "loss": 4.1513,
+      "step": 2901
+    },
+    {
+      "epoch": 0.02902,
+      "grad_norm": 0.8604675531387329,
+      "learning_rate": 0.003,
+      "loss": 4.1725,
+      "step": 2902
+    },
+    {
+      "epoch": 0.02903,
+      "grad_norm": 0.9788482785224915,
+      "learning_rate": 0.003,
+      "loss": 4.1436,
+      "step": 2903
+    },
+    {
+      "epoch": 0.02904,
+      "grad_norm": 0.9012083411216736,
+      "learning_rate": 0.003,
+      "loss": 4.1596,
+      "step": 2904
+    },
+    {
+      "epoch": 0.02905,
+      "grad_norm": 0.6638503074645996,
+      "learning_rate": 0.003,
+      "loss": 4.1651,
+      "step": 2905
+    },
+    {
+      "epoch": 0.02906,
+      "grad_norm": 0.640064001083374,
+      "learning_rate": 0.003,
+      "loss": 4.1474,
+      "step": 2906
+    },
+    {
+      "epoch": 0.02907,
+      "grad_norm": 0.6573886871337891,
+      "learning_rate": 0.003,
+      "loss": 4.1195,
+      "step": 2907
+    },
+    {
+      "epoch": 0.02908,
+      "grad_norm": 0.6549690961837769,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 2908
+    },
+    {
+      "epoch": 0.02909,
+      "grad_norm": 0.6671884059906006,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 2909
+    },
+    {
+      "epoch": 0.0291,
+      "grad_norm": 0.6334506273269653,
+      "learning_rate": 0.003,
+      "loss": 4.1462,
+      "step": 2910
+    },
+    {
+      "epoch": 0.02911,
+      "grad_norm": 0.6618163585662842,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 2911
+    },
+    {
+      "epoch": 0.02912,
+      "grad_norm": 0.7299465537071228,
+      "learning_rate": 0.003,
+      "loss": 4.1166,
+      "step": 2912
+    },
+    {
+      "epoch": 0.02913,
+      "grad_norm": 0.7096904516220093,
+      "learning_rate": 0.003,
+      "loss": 4.1481,
+      "step": 2913
+    },
+    {
+      "epoch": 0.02914,
+      "grad_norm": 0.7228156924247742,
+      "learning_rate": 0.003,
+      "loss": 4.1341,
+      "step": 2914
+    },
+    {
+      "epoch": 0.02915,
+      "grad_norm": 0.6965159177780151,
+      "learning_rate": 0.003,
+      "loss": 4.1301,
+      "step": 2915
+    },
+    {
+      "epoch": 0.02916,
+      "grad_norm": 0.5864951014518738,
+      "learning_rate": 0.003,
+      "loss": 4.1326,
+      "step": 2916
+    },
+    {
+      "epoch": 0.02917,
+      "grad_norm": 0.682627260684967,
+      "learning_rate": 0.003,
+      "loss": 4.1647,
+      "step": 2917
+    },
+    {
+      "epoch": 0.02918,
+      "grad_norm": 0.7035491466522217,
+      "learning_rate": 0.003,
+      "loss": 4.1198,
+      "step": 2918
+    },
+    {
+      "epoch": 0.02919,
+      "grad_norm": 0.6258479952812195,
+      "learning_rate": 0.003,
+      "loss": 4.1613,
+      "step": 2919
+    },
+    {
+      "epoch": 0.0292,
+      "grad_norm": 0.7004368305206299,
+      "learning_rate": 0.003,
+      "loss": 4.1034,
+      "step": 2920
+    },
+    {
+      "epoch": 0.02921,
+      "grad_norm": 0.7259970903396606,
+      "learning_rate": 0.003,
+      "loss": 4.1461,
+      "step": 2921
+    },
+    {
+      "epoch": 0.02922,
+      "grad_norm": 0.8203588724136353,
+      "learning_rate": 0.003,
+      "loss": 4.1484,
+      "step": 2922
+    },
+    {
+      "epoch": 0.02923,
+      "grad_norm": 0.7598622441291809,
+      "learning_rate": 0.003,
+      "loss": 4.1197,
+      "step": 2923
+    },
+    {
+      "epoch": 0.02924,
+      "grad_norm": 0.713235080242157,
+      "learning_rate": 0.003,
+      "loss": 4.1518,
+      "step": 2924
+    },
+    {
+      "epoch": 0.02925,
+      "grad_norm": 0.6576932668685913,
+      "learning_rate": 0.003,
+      "loss": 4.1548,
+      "step": 2925
+    },
+    {
+      "epoch": 0.02926,
+      "grad_norm": 0.5912261009216309,
+      "learning_rate": 0.003,
+      "loss": 4.1321,
+      "step": 2926
+    },
+    {
+      "epoch": 0.02927,
+      "grad_norm": 0.6221820712089539,
+      "learning_rate": 0.003,
+      "loss": 4.1364,
+      "step": 2927
+    },
+    {
+      "epoch": 0.02928,
+      "grad_norm": 0.5222737193107605,
+      "learning_rate": 0.003,
+      "loss": 4.1557,
+      "step": 2928
+    },
+    {
+      "epoch": 0.02929,
+      "grad_norm": 0.448812872171402,
+      "learning_rate": 0.003,
+      "loss": 4.1384,
+      "step": 2929
+    },
+    {
+      "epoch": 0.0293,
+      "grad_norm": 0.4934072196483612,
+      "learning_rate": 0.003,
+      "loss": 4.1309,
+      "step": 2930
+    },
+    {
+      "epoch": 0.02931,
+      "grad_norm": 0.6587796211242676,
+      "learning_rate": 0.003,
+      "loss": 4.136,
+      "step": 2931
+    },
+    {
+      "epoch": 0.02932,
+      "grad_norm": 0.8658855557441711,
+      "learning_rate": 0.003,
+      "loss": 4.1325,
+      "step": 2932
+    },
+    {
+      "epoch": 0.02933,
+      "grad_norm": 1.1174193620681763,
+      "learning_rate": 0.003,
+      "loss": 4.1685,
+      "step": 2933
+    },
+    {
+      "epoch": 0.02934,
+      "grad_norm": 0.7016493678092957,
+      "learning_rate": 0.003,
+      "loss": 4.145,
+      "step": 2934
+    },
+    {
+      "epoch": 0.02935,
+      "grad_norm": 0.543066680431366,
+      "learning_rate": 0.003,
+      "loss": 4.1272,
+      "step": 2935
+    },
+    {
+      "epoch": 0.02936,
+      "grad_norm": 0.6172811388969421,
+      "learning_rate": 0.003,
+      "loss": 4.1458,
+      "step": 2936
+    },
+    {
+      "epoch": 0.02937,
+      "grad_norm": 0.7183346152305603,
+      "learning_rate": 0.003,
+      "loss": 4.1303,
+      "step": 2937
+    },
+    {
+      "epoch": 0.02938,
+      "grad_norm": 0.7085850834846497,
+      "learning_rate": 0.003,
+      "loss": 4.1452,
+      "step": 2938
+    },
+    {
+      "epoch": 0.02939,
+      "grad_norm": 0.5843592882156372,
+      "learning_rate": 0.003,
+      "loss": 4.1302,
+      "step": 2939
+    },
+    {
+      "epoch": 0.0294,
+      "grad_norm": 0.5836790800094604,
+      "learning_rate": 0.003,
+      "loss": 4.1531,
+      "step": 2940
+    },
+    {
+      "epoch": 0.02941,
+      "grad_norm": 0.6397603154182434,
+      "learning_rate": 0.003,
+      "loss": 4.1546,
+      "step": 2941
+    },
+    {
+      "epoch": 0.02942,
+      "grad_norm": 0.6955589056015015,
+      "learning_rate": 0.003,
+      "loss": 4.1404,
+      "step": 2942
+    },
+    {
+      "epoch": 0.02943,
+      "grad_norm": 0.754485011100769,
+      "learning_rate": 0.003,
+      "loss": 4.1328,
+      "step": 2943
+    },
+    {
+      "epoch": 0.02944,
+      "grad_norm": 0.8499577045440674,
+      "learning_rate": 0.003,
+      "loss": 4.1566,
+      "step": 2944
+    },
+    {
+      "epoch": 0.02945,
+      "grad_norm": 0.8511131405830383,
+      "learning_rate": 0.003,
+      "loss": 4.1375,
+      "step": 2945
+    },
+    {
+      "epoch": 0.02946,
+      "grad_norm": 0.7081684470176697,
+      "learning_rate": 0.003,
+      "loss": 4.1304,
+      "step": 2946
+    },
+    {
+      "epoch": 0.02947,
+      "grad_norm": 0.6551468968391418,
+      "learning_rate": 0.003,
+      "loss": 4.1627,
+      "step": 2947
+    },
+    {
+      "epoch": 0.02948,
+      "grad_norm": 0.6386260390281677,
+      "learning_rate": 0.003,
+      "loss": 4.1399,
+      "step": 2948
+    },
+    {
+      "epoch": 0.02949,
+      "grad_norm": 0.5764749050140381,
+      "learning_rate": 0.003,
+      "loss": 4.113,
+      "step": 2949
+    },
+    {
+      "epoch": 0.0295,
+      "grad_norm": 0.5615102648735046,
+      "learning_rate": 0.003,
+      "loss": 4.1347,
+      "step": 2950
+    },
+    {
+      "epoch": 0.02951,
+      "grad_norm": 0.5903642773628235,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 2951
+    },
+    {
+      "epoch": 0.02952,
+      "grad_norm": 0.6850778460502625,
+      "learning_rate": 0.003,
+      "loss": 4.1243,
+      "step": 2952
+    },
+    {
+      "epoch": 0.02953,
+      "grad_norm": 0.6652215123176575,
+      "learning_rate": 0.003,
+      "loss": 4.1312,
+      "step": 2953
+    },
+    {
+      "epoch": 0.02954,
+      "grad_norm": 0.6463000178337097,
+      "learning_rate": 0.003,
+      "loss": 4.1725,
+      "step": 2954
+    },
+    {
+      "epoch": 0.02955,
+      "grad_norm": 0.7008712291717529,
+      "learning_rate": 0.003,
+      "loss": 4.1435,
+      "step": 2955
+    },
+    {
+      "epoch": 0.02956,
+      "grad_norm": 0.7221593260765076,
+      "learning_rate": 0.003,
+      "loss": 4.1396,
+      "step": 2956
+    },
+    {
+      "epoch": 0.02957,
+      "grad_norm": 0.8538970351219177,
+      "learning_rate": 0.003,
+      "loss": 4.1508,
+      "step": 2957
+    },
+    {
+      "epoch": 0.02958,
+      "grad_norm": 0.9920998811721802,
+      "learning_rate": 0.003,
+      "loss": 4.1749,
+      "step": 2958
+    },
+    {
+      "epoch": 0.02959,
+      "grad_norm": 1.0068269968032837,
+      "learning_rate": 0.003,
+      "loss": 4.1654,
+      "step": 2959
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.9310505986213684,
+      "learning_rate": 0.003,
+      "loss": 4.1729,
+      "step": 2960
+    },
+    {
+      "epoch": 0.02961,
+      "grad_norm": 0.878132164478302,
+      "learning_rate": 0.003,
+      "loss": 4.1458,
+      "step": 2961
+    },
+    {
+      "epoch": 0.02962,
+      "grad_norm": 0.953723132610321,
+      "learning_rate": 0.003,
+      "loss": 4.1532,
+      "step": 2962
+    },
+    {
+      "epoch": 0.02963,
+      "grad_norm": 0.9499011039733887,
+      "learning_rate": 0.003,
+      "loss": 4.1333,
+      "step": 2963
+    },
+    {
+      "epoch": 0.02964,
+      "grad_norm": 0.8707635402679443,
+      "learning_rate": 0.003,
+      "loss": 4.1169,
+      "step": 2964
+    },
+    {
+      "epoch": 0.02965,
+      "grad_norm": 0.8199546337127686,
+      "learning_rate": 0.003,
+      "loss": 4.1628,
+      "step": 2965
+    },
+    {
+      "epoch": 0.02966,
+      "grad_norm": 0.7343395948410034,
+      "learning_rate": 0.003,
+      "loss": 4.1728,
+      "step": 2966
+    },
+    {
+      "epoch": 0.02967,
+      "grad_norm": 0.6565076112747192,
+      "learning_rate": 0.003,
+      "loss": 4.1547,
+      "step": 2967
+    },
+    {
+      "epoch": 0.02968,
+      "grad_norm": 0.6714766025543213,
+      "learning_rate": 0.003,
+      "loss": 4.1518,
+      "step": 2968
+    },
+    {
+      "epoch": 0.02969,
+      "grad_norm": 0.6478135585784912,
+      "learning_rate": 0.003,
+      "loss": 4.1489,
+      "step": 2969
+    },
+    {
+      "epoch": 0.0297,
+      "grad_norm": 0.5846438407897949,
+      "learning_rate": 0.003,
+      "loss": 4.1316,
+      "step": 2970
+    },
+    {
+      "epoch": 0.02971,
+      "grad_norm": 0.5542798042297363,
+      "learning_rate": 0.003,
+      "loss": 4.1482,
+      "step": 2971
+    },
+    {
+      "epoch": 0.02972,
+      "grad_norm": 0.5505024790763855,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 2972
+    },
+    {
+      "epoch": 0.02973,
+      "grad_norm": 0.5356234908103943,
+      "learning_rate": 0.003,
+      "loss": 4.1011,
+      "step": 2973
+    },
+    {
+      "epoch": 0.02974,
+      "grad_norm": 0.6036863327026367,
+      "learning_rate": 0.003,
+      "loss": 4.1403,
+      "step": 2974
+    },
+    {
+      "epoch": 0.02975,
+      "grad_norm": 0.48086002469062805,
+      "learning_rate": 0.003,
+      "loss": 4.1216,
+      "step": 2975
+    },
+    {
+      "epoch": 0.02976,
+      "grad_norm": 0.43674689531326294,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 2976
+    },
+    {
+      "epoch": 0.02977,
+      "grad_norm": 0.4434605836868286,
+      "learning_rate": 0.003,
+      "loss": 4.1165,
+      "step": 2977
+    },
+    {
+      "epoch": 0.02978,
+      "grad_norm": 0.5155782103538513,
+      "learning_rate": 0.003,
+      "loss": 4.1373,
+      "step": 2978
+    },
+    {
+      "epoch": 0.02979,
+      "grad_norm": 0.6957648396492004,
+      "learning_rate": 0.003,
+      "loss": 4.1259,
+      "step": 2979
+    },
+    {
+      "epoch": 0.0298,
+      "grad_norm": 0.8462650179862976,
+      "learning_rate": 0.003,
+      "loss": 4.139,
+      "step": 2980
+    },
+    {
+      "epoch": 0.02981,
+      "grad_norm": 0.7909703254699707,
+      "learning_rate": 0.003,
+      "loss": 4.146,
+      "step": 2981
+    },
+    {
+      "epoch": 0.02982,
+      "grad_norm": 0.5594244003295898,
+      "learning_rate": 0.003,
+      "loss": 4.1346,
+      "step": 2982
+    },
+    {
+      "epoch": 0.02983,
+      "grad_norm": 0.5850778222084045,
+      "learning_rate": 0.003,
+      "loss": 4.1518,
+      "step": 2983
+    },
+    {
+      "epoch": 0.02984,
+      "grad_norm": 0.6843441128730774,
+      "learning_rate": 0.003,
+      "loss": 4.1276,
+      "step": 2984
+    },
+    {
+      "epoch": 0.02985,
+      "grad_norm": 0.6832881569862366,
+      "learning_rate": 0.003,
+      "loss": 4.1591,
+      "step": 2985
+    },
+    {
+      "epoch": 0.02986,
+      "grad_norm": 0.5421028733253479,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 2986
+    },
+    {
+      "epoch": 0.02987,
+      "grad_norm": 0.551857590675354,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 2987
+    },
+    {
+      "epoch": 0.02988,
+      "grad_norm": 0.6235337853431702,
+      "learning_rate": 0.003,
+      "loss": 4.1248,
+      "step": 2988
+    },
+    {
+      "epoch": 0.02989,
+      "grad_norm": 0.7048717737197876,
+      "learning_rate": 0.003,
+      "loss": 4.1396,
+      "step": 2989
+    },
+    {
+      "epoch": 0.0299,
+      "grad_norm": 0.7494181990623474,
+      "learning_rate": 0.003,
+      "loss": 4.1016,
+      "step": 2990
+    },
+    {
+      "epoch": 0.02991,
+      "grad_norm": 0.7830327749252319,
+      "learning_rate": 0.003,
+      "loss": 4.1547,
+      "step": 2991
+    },
+    {
+      "epoch": 0.02992,
+      "grad_norm": 0.8855887651443481,
+      "learning_rate": 0.003,
+      "loss": 4.1523,
+      "step": 2992
+    },
+    {
+      "epoch": 0.02993,
+      "grad_norm": 0.888844907283783,
+      "learning_rate": 0.003,
+      "loss": 4.1255,
+      "step": 2993
+    },
+    {
+      "epoch": 0.02994,
+      "grad_norm": 0.822543203830719,
+      "learning_rate": 0.003,
+      "loss": 4.1363,
+      "step": 2994
+    },
+    {
+      "epoch": 0.02995,
+      "grad_norm": 0.8102987408638,
+      "learning_rate": 0.003,
+      "loss": 4.1743,
+      "step": 2995
+    },
+    {
+      "epoch": 0.02996,
+      "grad_norm": 0.8138471841812134,
+      "learning_rate": 0.003,
+      "loss": 4.1624,
+      "step": 2996
+    },
+    {
+      "epoch": 0.02997,
+      "grad_norm": 0.7990952134132385,
+      "learning_rate": 0.003,
+      "loss": 4.1243,
+      "step": 2997
+    },
+    {
+      "epoch": 0.02998,
+      "grad_norm": 0.7419268488883972,
+      "learning_rate": 0.003,
+      "loss": 4.124,
+      "step": 2998
+    },
+    {
+      "epoch": 0.02999,
+      "grad_norm": 0.6637421250343323,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 2999
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.5517862439155579,
+      "learning_rate": 0.003,
+      "loss": 4.1563,
+      "step": 3000
+    },
+    {
+      "epoch": 0.03001,
+      "grad_norm": 0.5322889089584351,
+      "learning_rate": 0.003,
+      "loss": 4.1217,
+      "step": 3001
+    },
+    {
+      "epoch": 0.03002,
+      "grad_norm": 0.5043152570724487,
+      "learning_rate": 0.003,
+      "loss": 4.1356,
+      "step": 3002
+    },
+    {
+      "epoch": 0.03003,
+      "grad_norm": 0.5661661028862,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 3003
+    },
+    {
+      "epoch": 0.03004,
+      "grad_norm": 0.6002155542373657,
+      "learning_rate": 0.003,
+      "loss": 4.0956,
+      "step": 3004
+    },
+    {
+      "epoch": 0.03005,
+      "grad_norm": 0.5543268918991089,
+      "learning_rate": 0.003,
+      "loss": 4.1446,
+      "step": 3005
+    },
+    {
+      "epoch": 0.03006,
+      "grad_norm": 0.6193807721138,
+      "learning_rate": 0.003,
+      "loss": 4.1405,
+      "step": 3006
+    },
+    {
+      "epoch": 0.03007,
+      "grad_norm": 0.7129995822906494,
+      "learning_rate": 0.003,
+      "loss": 4.1141,
+      "step": 3007
+    },
+    {
+      "epoch": 0.03008,
+      "grad_norm": 0.7921411991119385,
+      "learning_rate": 0.003,
+      "loss": 4.1465,
+      "step": 3008
+    },
+    {
+      "epoch": 0.03009,
+      "grad_norm": 0.6867934465408325,
+      "learning_rate": 0.003,
+      "loss": 4.1179,
+      "step": 3009
+    },
+    {
+      "epoch": 0.0301,
+      "grad_norm": 0.630652666091919,
+      "learning_rate": 0.003,
+      "loss": 4.1398,
+      "step": 3010
+    },
+    {
+      "epoch": 0.03011,
+      "grad_norm": 0.7961447834968567,
+      "learning_rate": 0.003,
+      "loss": 4.1298,
+      "step": 3011
+    },
+    {
+      "epoch": 0.03012,
+      "grad_norm": 0.6278183460235596,
+      "learning_rate": 0.003,
+      "loss": 4.1307,
+      "step": 3012
+    },
+    {
+      "epoch": 0.03013,
+      "grad_norm": 0.5435069799423218,
+      "learning_rate": 0.003,
+      "loss": 4.087,
+      "step": 3013
+    },
+    {
+      "epoch": 0.03014,
+      "grad_norm": 0.5863070487976074,
+      "learning_rate": 0.003,
+      "loss": 4.1185,
+      "step": 3014
+    },
+    {
+      "epoch": 0.03015,
+      "grad_norm": 0.6343687176704407,
+      "learning_rate": 0.003,
+      "loss": 4.1167,
+      "step": 3015
+    },
+    {
+      "epoch": 0.03016,
+      "grad_norm": 0.72650146484375,
+      "learning_rate": 0.003,
+      "loss": 4.1264,
+      "step": 3016
+    },
+    {
+      "epoch": 0.03017,
+      "grad_norm": 0.719595193862915,
+      "learning_rate": 0.003,
+      "loss": 4.126,
+      "step": 3017
+    },
+    {
+      "epoch": 0.03018,
+      "grad_norm": 0.5911171436309814,
+      "learning_rate": 0.003,
+      "loss": 4.1164,
+      "step": 3018
+    },
+    {
+      "epoch": 0.03019,
+      "grad_norm": 0.5423696637153625,
+      "learning_rate": 0.003,
+      "loss": 4.1307,
+      "step": 3019
+    },
+    {
+      "epoch": 0.0302,
+      "grad_norm": 0.5470551252365112,
+      "learning_rate": 0.003,
+      "loss": 4.1232,
+      "step": 3020
+    },
+    {
+      "epoch": 0.03021,
+      "grad_norm": 0.7124951481819153,
+      "learning_rate": 0.003,
+      "loss": 4.1496,
+      "step": 3021
+    },
+    {
+      "epoch": 0.03022,
+      "grad_norm": 0.8745585680007935,
+      "learning_rate": 0.003,
+      "loss": 4.1587,
+      "step": 3022
+    },
+    {
+      "epoch": 0.03023,
+      "grad_norm": 0.866553008556366,
+      "learning_rate": 0.003,
+      "loss": 4.1734,
+      "step": 3023
+    },
+    {
+      "epoch": 0.03024,
+      "grad_norm": 0.7500194907188416,
+      "learning_rate": 0.003,
+      "loss": 4.1646,
+      "step": 3024
+    },
+    {
+      "epoch": 0.03025,
+      "grad_norm": 0.7470998764038086,
+      "learning_rate": 0.003,
+      "loss": 4.1428,
+      "step": 3025
+    },
+    {
+      "epoch": 0.03026,
+      "grad_norm": 0.8869830369949341,
+      "learning_rate": 0.003,
+      "loss": 4.1211,
+      "step": 3026
+    },
+    {
+      "epoch": 0.03027,
+      "grad_norm": 0.7815242409706116,
+      "learning_rate": 0.003,
+      "loss": 4.1415,
+      "step": 3027
+    },
+    {
+      "epoch": 0.03028,
+      "grad_norm": 0.6851547956466675,
+      "learning_rate": 0.003,
+      "loss": 4.098,
+      "step": 3028
+    },
+    {
+      "epoch": 0.03029,
+      "grad_norm": 0.6831088066101074,
+      "learning_rate": 0.003,
+      "loss": 4.1417,
+      "step": 3029
+    },
+    {
+      "epoch": 0.0303,
+      "grad_norm": 0.6287125945091248,
+      "learning_rate": 0.003,
+      "loss": 4.1228,
+      "step": 3030
+    },
+    {
+      "epoch": 0.03031,
+      "grad_norm": 0.5370122194290161,
+      "learning_rate": 0.003,
+      "loss": 4.1032,
+      "step": 3031
+    },
+    {
+      "epoch": 0.03032,
+      "grad_norm": 0.5675404071807861,
+      "learning_rate": 0.003,
+      "loss": 4.1322,
+      "step": 3032
+    },
+    {
+      "epoch": 0.03033,
+      "grad_norm": 0.7573124766349792,
+      "learning_rate": 0.003,
+      "loss": 4.1348,
+      "step": 3033
+    },
+    {
+      "epoch": 0.03034,
+      "grad_norm": 0.8983549475669861,
+      "learning_rate": 0.003,
+      "loss": 4.1466,
+      "step": 3034
+    },
+    {
+      "epoch": 0.03035,
+      "grad_norm": 1.0109939575195312,
+      "learning_rate": 0.003,
+      "loss": 4.1821,
+      "step": 3035
+    },
+    {
+      "epoch": 0.03036,
+      "grad_norm": 0.9054379463195801,
+      "learning_rate": 0.003,
+      "loss": 4.1495,
+      "step": 3036
+    },
+    {
+      "epoch": 0.03037,
+      "grad_norm": 0.9631518125534058,
+      "learning_rate": 0.003,
+      "loss": 4.1395,
+      "step": 3037
+    },
+    {
+      "epoch": 0.03038,
+      "grad_norm": 0.9668822288513184,
+      "learning_rate": 0.003,
+      "loss": 4.1401,
+      "step": 3038
+    },
+    {
+      "epoch": 0.03039,
+      "grad_norm": 0.9021666049957275,
+      "learning_rate": 0.003,
+      "loss": 4.1497,
+      "step": 3039
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.8186821937561035,
+      "learning_rate": 0.003,
+      "loss": 4.1515,
+      "step": 3040
+    },
+    {
+      "epoch": 0.03041,
+      "grad_norm": 0.7148540616035461,
+      "learning_rate": 0.003,
+      "loss": 4.1558,
+      "step": 3041
+    },
+    {
+      "epoch": 0.03042,
+      "grad_norm": 0.7441778182983398,
+      "learning_rate": 0.003,
+      "loss": 4.18,
+      "step": 3042
+    },
+    {
+      "epoch": 0.03043,
+      "grad_norm": 0.7269161343574524,
+      "learning_rate": 0.003,
+      "loss": 4.1303,
+      "step": 3043
+    },
+    {
+      "epoch": 0.03044,
+      "grad_norm": 0.7291993498802185,
+      "learning_rate": 0.003,
+      "loss": 4.1294,
+      "step": 3044
+    },
+    {
+      "epoch": 0.03045,
+      "grad_norm": 0.8323465585708618,
+      "learning_rate": 0.003,
+      "loss": 4.1387,
+      "step": 3045
+    },
+    {
+      "epoch": 0.03046,
+      "grad_norm": 0.9850740432739258,
+      "learning_rate": 0.003,
+      "loss": 4.1535,
+      "step": 3046
+    },
+    {
+      "epoch": 0.03047,
+      "grad_norm": 1.0652803182601929,
+      "learning_rate": 0.003,
+      "loss": 4.1672,
+      "step": 3047
+    },
+    {
+      "epoch": 0.03048,
+      "grad_norm": 0.8505216836929321,
+      "learning_rate": 0.003,
+      "loss": 4.145,
+      "step": 3048
+    },
+    {
+      "epoch": 0.03049,
+      "grad_norm": 0.7812585234642029,
+      "learning_rate": 0.003,
+      "loss": 4.13,
+      "step": 3049
+    },
+    {
+      "epoch": 0.0305,
+      "grad_norm": 0.8137039542198181,
+      "learning_rate": 0.003,
+      "loss": 4.148,
+      "step": 3050
+    },
+    {
+      "epoch": 0.03051,
+      "grad_norm": 0.8009527325630188,
+      "learning_rate": 0.003,
+      "loss": 4.157,
+      "step": 3051
+    },
+    {
+      "epoch": 0.03052,
+      "grad_norm": 0.6827675104141235,
+      "learning_rate": 0.003,
+      "loss": 4.1565,
+      "step": 3052
+    },
+    {
+      "epoch": 0.03053,
+      "grad_norm": 0.5847889184951782,
+      "learning_rate": 0.003,
+      "loss": 4.1564,
+      "step": 3053
+    },
+    {
+      "epoch": 0.03054,
+      "grad_norm": 0.5359497666358948,
+      "learning_rate": 0.003,
+      "loss": 4.1362,
+      "step": 3054
+    },
+    {
+      "epoch": 0.03055,
+      "grad_norm": 0.5127337574958801,
+      "learning_rate": 0.003,
+      "loss": 4.1349,
+      "step": 3055
+    },
+    {
+      "epoch": 0.03056,
+      "grad_norm": 0.4672796130180359,
+      "learning_rate": 0.003,
+      "loss": 4.1421,
+      "step": 3056
+    },
+    {
+      "epoch": 0.03057,
+      "grad_norm": 0.4908005893230438,
+      "learning_rate": 0.003,
+      "loss": 4.141,
+      "step": 3057
+    },
+    {
+      "epoch": 0.03058,
+      "grad_norm": 0.4413350522518158,
+      "learning_rate": 0.003,
+      "loss": 4.1103,
+      "step": 3058
+    },
+    {
+      "epoch": 0.03059,
+      "grad_norm": 0.42543938755989075,
+      "learning_rate": 0.003,
+      "loss": 4.1211,
+      "step": 3059
+    },
+    {
+      "epoch": 0.0306,
+      "grad_norm": 0.43946075439453125,
+      "learning_rate": 0.003,
+      "loss": 4.107,
+      "step": 3060
+    },
+    {
+      "epoch": 0.03061,
+      "grad_norm": 0.4698311388492584,
+      "learning_rate": 0.003,
+      "loss": 4.1318,
+      "step": 3061
+    },
+    {
+      "epoch": 0.03062,
+      "grad_norm": 0.44352906942367554,
+      "learning_rate": 0.003,
+      "loss": 4.154,
+      "step": 3062
+    },
+    {
+      "epoch": 0.03063,
+      "grad_norm": 0.4036288857460022,
+      "learning_rate": 0.003,
+      "loss": 4.0965,
+      "step": 3063
+    },
+    {
+      "epoch": 0.03064,
+      "grad_norm": 0.34206268191337585,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 3064
+    },
+    {
+      "epoch": 0.03065,
+      "grad_norm": 0.31049665808677673,
+      "learning_rate": 0.003,
+      "loss": 4.094,
+      "step": 3065
+    },
+    {
+      "epoch": 0.03066,
+      "grad_norm": 0.309965580701828,
+      "learning_rate": 0.003,
+      "loss": 4.137,
+      "step": 3066
+    },
+    {
+      "epoch": 0.03067,
+      "grad_norm": 0.31760501861572266,
+      "learning_rate": 0.003,
+      "loss": 4.1421,
+      "step": 3067
+    },
+    {
+      "epoch": 0.03068,
+      "grad_norm": 0.34679949283599854,
+      "learning_rate": 0.003,
+      "loss": 4.1118,
+      "step": 3068
+    },
+    {
+      "epoch": 0.03069,
+      "grad_norm": 0.491832435131073,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 3069
+    },
+    {
+      "epoch": 0.0307,
+      "grad_norm": 0.7506191730499268,
+      "learning_rate": 0.003,
+      "loss": 4.1154,
+      "step": 3070
+    },
+    {
+      "epoch": 0.03071,
+      "grad_norm": 0.9722976684570312,
+      "learning_rate": 0.003,
+      "loss": 4.1252,
+      "step": 3071
+    },
+    {
+      "epoch": 0.03072,
+      "grad_norm": 0.9531710147857666,
+      "learning_rate": 0.003,
+      "loss": 4.1347,
+      "step": 3072
+    },
+    {
+      "epoch": 0.03073,
+      "grad_norm": 0.7240149974822998,
+      "learning_rate": 0.003,
+      "loss": 4.1264,
+      "step": 3073
+    },
+    {
+      "epoch": 0.03074,
+      "grad_norm": 0.8381486535072327,
+      "learning_rate": 0.003,
+      "loss": 4.1328,
+      "step": 3074
+    },
+    {
+      "epoch": 0.03075,
+      "grad_norm": 1.0160551071166992,
+      "learning_rate": 0.003,
+      "loss": 4.163,
+      "step": 3075
+    },
+    {
+      "epoch": 0.03076,
+      "grad_norm": 0.9568767547607422,
+      "learning_rate": 0.003,
+      "loss": 4.1535,
+      "step": 3076
+    },
+    {
+      "epoch": 0.03077,
+      "grad_norm": 0.855299174785614,
+      "learning_rate": 0.003,
+      "loss": 4.152,
+      "step": 3077
+    },
+    {
+      "epoch": 0.03078,
+      "grad_norm": 0.9153781533241272,
+      "learning_rate": 0.003,
+      "loss": 4.1418,
+      "step": 3078
+    },
+    {
+      "epoch": 0.03079,
+      "grad_norm": 0.8517905473709106,
+      "learning_rate": 0.003,
+      "loss": 4.1284,
+      "step": 3079
+    },
+    {
+      "epoch": 0.0308,
+      "grad_norm": 0.7446357607841492,
+      "learning_rate": 0.003,
+      "loss": 4.1544,
+      "step": 3080
+    },
+    {
+      "epoch": 0.03081,
+      "grad_norm": 0.8622949719429016,
+      "learning_rate": 0.003,
+      "loss": 4.1305,
+      "step": 3081
+    },
+    {
+      "epoch": 0.03082,
+      "grad_norm": 0.923214316368103,
+      "learning_rate": 0.003,
+      "loss": 4.1406,
+      "step": 3082
+    },
+    {
+      "epoch": 0.03083,
+      "grad_norm": 0.7624456882476807,
+      "learning_rate": 0.003,
+      "loss": 4.1375,
+      "step": 3083
+    },
+    {
+      "epoch": 0.03084,
+      "grad_norm": 0.7506299018859863,
+      "learning_rate": 0.003,
+      "loss": 4.1539,
+      "step": 3084
+    },
+    {
+      "epoch": 0.03085,
+      "grad_norm": 0.7482979893684387,
+      "learning_rate": 0.003,
+      "loss": 4.149,
+      "step": 3085
+    },
+    {
+      "epoch": 0.03086,
+      "grad_norm": 0.8742256164550781,
+      "learning_rate": 0.003,
+      "loss": 4.1735,
+      "step": 3086
+    },
+    {
+      "epoch": 0.03087,
+      "grad_norm": 0.8986655473709106,
+      "learning_rate": 0.003,
+      "loss": 4.1492,
+      "step": 3087
+    },
+    {
+      "epoch": 0.03088,
+      "grad_norm": 0.6525835990905762,
+      "learning_rate": 0.003,
+      "loss": 4.1581,
+      "step": 3088
+    },
+    {
+      "epoch": 0.03089,
+      "grad_norm": 0.5235843062400818,
+      "learning_rate": 0.003,
+      "loss": 4.119,
+      "step": 3089
+    },
+    {
+      "epoch": 0.0309,
+      "grad_norm": 0.5896126627922058,
+      "learning_rate": 0.003,
+      "loss": 4.1262,
+      "step": 3090
+    },
+    {
+      "epoch": 0.03091,
+      "grad_norm": 0.6240640878677368,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 3091
+    },
+    {
+      "epoch": 0.03092,
+      "grad_norm": 0.7089477777481079,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 3092
+    },
+    {
+      "epoch": 0.03093,
+      "grad_norm": 0.7679318785667419,
+      "learning_rate": 0.003,
+      "loss": 4.1512,
+      "step": 3093
+    },
+    {
+      "epoch": 0.03094,
+      "grad_norm": 0.829313337802887,
+      "learning_rate": 0.003,
+      "loss": 4.1391,
+      "step": 3094
+    },
+    {
+      "epoch": 0.03095,
+      "grad_norm": 0.858989953994751,
+      "learning_rate": 0.003,
+      "loss": 4.1537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.03096,
+      "grad_norm": 0.8490680456161499,
+      "learning_rate": 0.003,
+      "loss": 4.1348,
+      "step": 3096
+    },
+    {
+      "epoch": 0.03097,
+      "grad_norm": 0.7456005811691284,
+      "learning_rate": 0.003,
+      "loss": 4.1564,
+      "step": 3097
+    },
+    {
+      "epoch": 0.03098,
+      "grad_norm": 0.6156277656555176,
+      "learning_rate": 0.003,
+      "loss": 4.1447,
+      "step": 3098
+    },
+    {
+      "epoch": 0.03099,
+      "grad_norm": 0.7040721774101257,
+      "learning_rate": 0.003,
+      "loss": 4.1403,
+      "step": 3099
+    },
+    {
+      "epoch": 0.031,
+      "grad_norm": 0.7396160960197449,
+      "learning_rate": 0.003,
+      "loss": 4.1364,
+      "step": 3100
+    },
+    {
+      "epoch": 0.03101,
+      "grad_norm": 0.6194980144500732,
+      "learning_rate": 0.003,
+      "loss": 4.1244,
+      "step": 3101
+    },
+    {
+      "epoch": 0.03102,
+      "grad_norm": 0.5160166025161743,
+      "learning_rate": 0.003,
+      "loss": 4.1235,
+      "step": 3102
+    },
+    {
+      "epoch": 0.03103,
+      "grad_norm": 0.4513045847415924,
+      "learning_rate": 0.003,
+      "loss": 4.1085,
+      "step": 3103
+    },
+    {
+      "epoch": 0.03104,
+      "grad_norm": 0.46098387241363525,
+      "learning_rate": 0.003,
+      "loss": 4.1146,
+      "step": 3104
+    },
+    {
+      "epoch": 0.03105,
+      "grad_norm": 0.46439290046691895,
+      "learning_rate": 0.003,
+      "loss": 4.0932,
+      "step": 3105
+    },
+    {
+      "epoch": 0.03106,
+      "grad_norm": 0.48921629786491394,
+      "learning_rate": 0.003,
+      "loss": 4.112,
+      "step": 3106
+    },
+    {
+      "epoch": 0.03107,
+      "grad_norm": 0.4456900954246521,
+      "learning_rate": 0.003,
+      "loss": 4.0905,
+      "step": 3107
+    },
+    {
+      "epoch": 0.03108,
+      "grad_norm": 0.3602052330970764,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 3108
+    },
+    {
+      "epoch": 0.03109,
+      "grad_norm": 0.37874293327331543,
+      "learning_rate": 0.003,
+      "loss": 4.1048,
+      "step": 3109
+    },
+    {
+      "epoch": 0.0311,
+      "grad_norm": 0.34511056542396545,
+      "learning_rate": 0.003,
+      "loss": 4.1247,
+      "step": 3110
+    },
+    {
+      "epoch": 0.03111,
+      "grad_norm": 0.31971776485443115,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 3111
+    },
+    {
+      "epoch": 0.03112,
+      "grad_norm": 0.38327884674072266,
+      "learning_rate": 0.003,
+      "loss": 4.123,
+      "step": 3112
+    },
+    {
+      "epoch": 0.03113,
+      "grad_norm": 0.4181249141693115,
+      "learning_rate": 0.003,
+      "loss": 4.1094,
+      "step": 3113
+    },
+    {
+      "epoch": 0.03114,
+      "grad_norm": 0.5446284413337708,
+      "learning_rate": 0.003,
+      "loss": 4.1157,
+      "step": 3114
+    },
+    {
+      "epoch": 0.03115,
+      "grad_norm": 0.7353031635284424,
+      "learning_rate": 0.003,
+      "loss": 4.1229,
+      "step": 3115
+    },
+    {
+      "epoch": 0.03116,
+      "grad_norm": 0.9837905168533325,
+      "learning_rate": 0.003,
+      "loss": 4.1171,
+      "step": 3116
+    },
+    {
+      "epoch": 0.03117,
+      "grad_norm": 1.0615694522857666,
+      "learning_rate": 0.003,
+      "loss": 4.1341,
+      "step": 3117
+    },
+    {
+      "epoch": 0.03118,
+      "grad_norm": 0.7411212921142578,
+      "learning_rate": 0.003,
+      "loss": 4.122,
+      "step": 3118
+    },
+    {
+      "epoch": 0.03119,
+      "grad_norm": 0.6940107941627502,
+      "learning_rate": 0.003,
+      "loss": 4.1372,
+      "step": 3119
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.6913151144981384,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 3120
+    },
+    {
+      "epoch": 0.03121,
+      "grad_norm": 0.6097826361656189,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 3121
+    },
+    {
+      "epoch": 0.03122,
+      "grad_norm": 0.6615216732025146,
+      "learning_rate": 0.003,
+      "loss": 4.1389,
+      "step": 3122
+    },
+    {
+      "epoch": 0.03123,
+      "grad_norm": 0.799041211605072,
+      "learning_rate": 0.003,
+      "loss": 4.1385,
+      "step": 3123
+    },
+    {
+      "epoch": 0.03124,
+      "grad_norm": 0.8251768350601196,
+      "learning_rate": 0.003,
+      "loss": 4.1199,
+      "step": 3124
+    },
+    {
+      "epoch": 0.03125,
+      "grad_norm": 0.7816169857978821,
+      "learning_rate": 0.003,
+      "loss": 4.1194,
+      "step": 3125
+    },
+    {
+      "epoch": 0.03126,
+      "grad_norm": 0.7771303057670593,
+      "learning_rate": 0.003,
+      "loss": 4.1237,
+      "step": 3126
+    },
+    {
+      "epoch": 0.03127,
+      "grad_norm": 0.7820833921432495,
+      "learning_rate": 0.003,
+      "loss": 4.1321,
+      "step": 3127
+    },
+    {
+      "epoch": 0.03128,
+      "grad_norm": 0.7861484885215759,
+      "learning_rate": 0.003,
+      "loss": 4.1231,
+      "step": 3128
+    },
+    {
+      "epoch": 0.03129,
+      "grad_norm": 0.7761921286582947,
+      "learning_rate": 0.003,
+      "loss": 4.1391,
+      "step": 3129
+    },
+    {
+      "epoch": 0.0313,
+      "grad_norm": 0.7497063875198364,
+      "learning_rate": 0.003,
+      "loss": 4.1467,
+      "step": 3130
+    },
+    {
+      "epoch": 0.03131,
+      "grad_norm": 0.673405647277832,
+      "learning_rate": 0.003,
+      "loss": 4.1248,
+      "step": 3131
+    },
+    {
+      "epoch": 0.03132,
+      "grad_norm": 0.6740285158157349,
+      "learning_rate": 0.003,
+      "loss": 4.1467,
+      "step": 3132
+    },
+    {
+      "epoch": 0.03133,
+      "grad_norm": 0.7402722835540771,
+      "learning_rate": 0.003,
+      "loss": 4.1395,
+      "step": 3133
+    },
+    {
+      "epoch": 0.03134,
+      "grad_norm": 0.8292396068572998,
+      "learning_rate": 0.003,
+      "loss": 4.1408,
+      "step": 3134
+    },
+    {
+      "epoch": 0.03135,
+      "grad_norm": 0.8309870958328247,
+      "learning_rate": 0.003,
+      "loss": 4.1144,
+      "step": 3135
+    },
+    {
+      "epoch": 0.03136,
+      "grad_norm": 0.9130949378013611,
+      "learning_rate": 0.003,
+      "loss": 4.1334,
+      "step": 3136
+    },
+    {
+      "epoch": 0.03137,
+      "grad_norm": 0.8444306254386902,
+      "learning_rate": 0.003,
+      "loss": 4.1331,
+      "step": 3137
+    },
+    {
+      "epoch": 0.03138,
+      "grad_norm": 0.8266807198524475,
+      "learning_rate": 0.003,
+      "loss": 4.1348,
+      "step": 3138
+    },
+    {
+      "epoch": 0.03139,
+      "grad_norm": 0.9213491082191467,
+      "learning_rate": 0.003,
+      "loss": 4.1459,
+      "step": 3139
+    },
+    {
+      "epoch": 0.0314,
+      "grad_norm": 0.8733784556388855,
+      "learning_rate": 0.003,
+      "loss": 4.1429,
+      "step": 3140
+    },
+    {
+      "epoch": 0.03141,
+      "grad_norm": 0.848698079586029,
+      "learning_rate": 0.003,
+      "loss": 4.1763,
+      "step": 3141
+    },
+    {
+      "epoch": 0.03142,
+      "grad_norm": 0.8152028918266296,
+      "learning_rate": 0.003,
+      "loss": 4.1512,
+      "step": 3142
+    },
+    {
+      "epoch": 0.03143,
+      "grad_norm": 0.690168023109436,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 3143
+    },
+    {
+      "epoch": 0.03144,
+      "grad_norm": 0.6327980160713196,
+      "learning_rate": 0.003,
+      "loss": 4.1047,
+      "step": 3144
+    },
+    {
+      "epoch": 0.03145,
+      "grad_norm": 0.5650879144668579,
+      "learning_rate": 0.003,
+      "loss": 4.1275,
+      "step": 3145
+    },
+    {
+      "epoch": 0.03146,
+      "grad_norm": 0.5615544319152832,
+      "learning_rate": 0.003,
+      "loss": 4.1349,
+      "step": 3146
+    },
+    {
+      "epoch": 0.03147,
+      "grad_norm": 0.5901345014572144,
+      "learning_rate": 0.003,
+      "loss": 4.1335,
+      "step": 3147
+    },
+    {
+      "epoch": 0.03148,
+      "grad_norm": 0.5519311428070068,
+      "learning_rate": 0.003,
+      "loss": 4.1346,
+      "step": 3148
+    },
+    {
+      "epoch": 0.03149,
+      "grad_norm": 0.5862347483634949,
+      "learning_rate": 0.003,
+      "loss": 4.1404,
+      "step": 3149
+    },
+    {
+      "epoch": 0.0315,
+      "grad_norm": 0.48593467473983765,
+      "learning_rate": 0.003,
+      "loss": 4.1245,
+      "step": 3150
+    },
+    {
+      "epoch": 0.03151,
+      "grad_norm": 0.44726112484931946,
+      "learning_rate": 0.003,
+      "loss": 4.1298,
+      "step": 3151
+    },
+    {
+      "epoch": 0.03152,
+      "grad_norm": 0.5725032687187195,
+      "learning_rate": 0.003,
+      "loss": 4.1316,
+      "step": 3152
+    },
+    {
+      "epoch": 0.03153,
+      "grad_norm": 0.7204610109329224,
+      "learning_rate": 0.003,
+      "loss": 4.1445,
+      "step": 3153
+    },
+    {
+      "epoch": 0.03154,
+      "grad_norm": 0.7772231101989746,
+      "learning_rate": 0.003,
+      "loss": 4.1406,
+      "step": 3154
+    },
+    {
+      "epoch": 0.03155,
+      "grad_norm": 0.7544244527816772,
+      "learning_rate": 0.003,
+      "loss": 4.1298,
+      "step": 3155
+    },
+    {
+      "epoch": 0.03156,
+      "grad_norm": 0.6843472719192505,
+      "learning_rate": 0.003,
+      "loss": 4.1127,
+      "step": 3156
+    },
+    {
+      "epoch": 0.03157,
+      "grad_norm": 0.5659379363059998,
+      "learning_rate": 0.003,
+      "loss": 4.1338,
+      "step": 3157
+    },
+    {
+      "epoch": 0.03158,
+      "grad_norm": 0.560870885848999,
+      "learning_rate": 0.003,
+      "loss": 4.1158,
+      "step": 3158
+    },
+    {
+      "epoch": 0.03159,
+      "grad_norm": 0.6483632922172546,
+      "learning_rate": 0.003,
+      "loss": 4.1202,
+      "step": 3159
+    },
+    {
+      "epoch": 0.0316,
+      "grad_norm": 0.7656121253967285,
+      "learning_rate": 0.003,
+      "loss": 4.1378,
+      "step": 3160
+    },
+    {
+      "epoch": 0.03161,
+      "grad_norm": 0.7832038998603821,
+      "learning_rate": 0.003,
+      "loss": 4.148,
+      "step": 3161
+    },
+    {
+      "epoch": 0.03162,
+      "grad_norm": 0.5725152492523193,
+      "learning_rate": 0.003,
+      "loss": 4.1453,
+      "step": 3162
+    },
+    {
+      "epoch": 0.03163,
+      "grad_norm": 0.5066990256309509,
+      "learning_rate": 0.003,
+      "loss": 4.1262,
+      "step": 3163
+    },
+    {
+      "epoch": 0.03164,
+      "grad_norm": 0.5056422352790833,
+      "learning_rate": 0.003,
+      "loss": 4.1234,
+      "step": 3164
+    },
+    {
+      "epoch": 0.03165,
+      "grad_norm": 0.5810751914978027,
+      "learning_rate": 0.003,
+      "loss": 4.0986,
+      "step": 3165
+    },
+    {
+      "epoch": 0.03166,
+      "grad_norm": 0.6706964373588562,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 3166
+    },
+    {
+      "epoch": 0.03167,
+      "grad_norm": 0.8062258958816528,
+      "learning_rate": 0.003,
+      "loss": 4.1086,
+      "step": 3167
+    },
+    {
+      "epoch": 0.03168,
+      "grad_norm": 0.8774738311767578,
+      "learning_rate": 0.003,
+      "loss": 4.1414,
+      "step": 3168
+    },
+    {
+      "epoch": 0.03169,
+      "grad_norm": 0.817283034324646,
+      "learning_rate": 0.003,
+      "loss": 4.1495,
+      "step": 3169
+    },
+    {
+      "epoch": 0.0317,
+      "grad_norm": 0.8635371327400208,
+      "learning_rate": 0.003,
+      "loss": 4.1057,
+      "step": 3170
+    },
+    {
+      "epoch": 0.03171,
+      "grad_norm": 0.7711028456687927,
+      "learning_rate": 0.003,
+      "loss": 4.1463,
+      "step": 3171
+    },
+    {
+      "epoch": 0.03172,
+      "grad_norm": 0.754484236240387,
+      "learning_rate": 0.003,
+      "loss": 4.1332,
+      "step": 3172
+    },
+    {
+      "epoch": 0.03173,
+      "grad_norm": 0.6678244471549988,
+      "learning_rate": 0.003,
+      "loss": 4.1048,
+      "step": 3173
+    },
+    {
+      "epoch": 0.03174,
+      "grad_norm": 0.588426411151886,
+      "learning_rate": 0.003,
+      "loss": 4.1179,
+      "step": 3174
+    },
+    {
+      "epoch": 0.03175,
+      "grad_norm": 0.5396115779876709,
+      "learning_rate": 0.003,
+      "loss": 4.1398,
+      "step": 3175
+    },
+    {
+      "epoch": 0.03176,
+      "grad_norm": 0.5115644931793213,
+      "learning_rate": 0.003,
+      "loss": 4.1025,
+      "step": 3176
+    },
+    {
+      "epoch": 0.03177,
+      "grad_norm": 0.5066182017326355,
+      "learning_rate": 0.003,
+      "loss": 4.1086,
+      "step": 3177
+    },
+    {
+      "epoch": 0.03178,
+      "grad_norm": 0.6152738928794861,
+      "learning_rate": 0.003,
+      "loss": 4.1574,
+      "step": 3178
+    },
+    {
+      "epoch": 0.03179,
+      "grad_norm": 0.7892536520957947,
+      "learning_rate": 0.003,
+      "loss": 4.1254,
+      "step": 3179
+    },
+    {
+      "epoch": 0.0318,
+      "grad_norm": 0.9117100834846497,
+      "learning_rate": 0.003,
+      "loss": 4.1312,
+      "step": 3180
+    },
+    {
+      "epoch": 0.03181,
+      "grad_norm": 0.8305923938751221,
+      "learning_rate": 0.003,
+      "loss": 4.1452,
+      "step": 3181
+    },
+    {
+      "epoch": 0.03182,
+      "grad_norm": 0.7435977458953857,
+      "learning_rate": 0.003,
+      "loss": 4.1356,
+      "step": 3182
+    },
+    {
+      "epoch": 0.03183,
+      "grad_norm": 0.9169098138809204,
+      "learning_rate": 0.003,
+      "loss": 4.1455,
+      "step": 3183
+    },
+    {
+      "epoch": 0.03184,
+      "grad_norm": 1.1370512247085571,
+      "learning_rate": 0.003,
+      "loss": 4.1304,
+      "step": 3184
+    },
+    {
+      "epoch": 0.03185,
+      "grad_norm": 0.7811375856399536,
+      "learning_rate": 0.003,
+      "loss": 4.1419,
+      "step": 3185
+    },
+    {
+      "epoch": 0.03186,
+      "grad_norm": 0.6801838278770447,
+      "learning_rate": 0.003,
+      "loss": 4.1382,
+      "step": 3186
+    },
+    {
+      "epoch": 0.03187,
+      "grad_norm": 0.7519158124923706,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 3187
+    },
+    {
+      "epoch": 0.03188,
+      "grad_norm": 0.7719635963439941,
+      "learning_rate": 0.003,
+      "loss": 4.1382,
+      "step": 3188
+    },
+    {
+      "epoch": 0.03189,
+      "grad_norm": 0.7892667055130005,
+      "learning_rate": 0.003,
+      "loss": 4.1273,
+      "step": 3189
+    },
+    {
+      "epoch": 0.0319,
+      "grad_norm": 0.7617167234420776,
+      "learning_rate": 0.003,
+      "loss": 4.1219,
+      "step": 3190
+    },
+    {
+      "epoch": 0.03191,
+      "grad_norm": 0.729390561580658,
+      "learning_rate": 0.003,
+      "loss": 4.1513,
+      "step": 3191
+    },
+    {
+      "epoch": 0.03192,
+      "grad_norm": 0.6866948008537292,
+      "learning_rate": 0.003,
+      "loss": 4.1252,
+      "step": 3192
+    },
+    {
+      "epoch": 0.03193,
+      "grad_norm": 0.8149189352989197,
+      "learning_rate": 0.003,
+      "loss": 4.1294,
+      "step": 3193
+    },
+    {
+      "epoch": 0.03194,
+      "grad_norm": 0.7727823257446289,
+      "learning_rate": 0.003,
+      "loss": 4.1567,
+      "step": 3194
+    },
+    {
+      "epoch": 0.03195,
+      "grad_norm": 0.6907305121421814,
+      "learning_rate": 0.003,
+      "loss": 4.1187,
+      "step": 3195
+    },
+    {
+      "epoch": 0.03196,
+      "grad_norm": 0.6480019688606262,
+      "learning_rate": 0.003,
+      "loss": 4.129,
+      "step": 3196
+    },
+    {
+      "epoch": 0.03197,
+      "grad_norm": 0.6377413272857666,
+      "learning_rate": 0.003,
+      "loss": 4.1009,
+      "step": 3197
+    },
+    {
+      "epoch": 0.03198,
+      "grad_norm": 0.5232990384101868,
+      "learning_rate": 0.003,
+      "loss": 4.1265,
+      "step": 3198
+    },
+    {
+      "epoch": 0.03199,
+      "grad_norm": 0.5117045044898987,
+      "learning_rate": 0.003,
+      "loss": 4.1001,
+      "step": 3199
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.49118801951408386,
+      "learning_rate": 0.003,
+      "loss": 4.1204,
+      "step": 3200
+    },
+    {
+      "epoch": 0.03201,
+      "grad_norm": 0.39169371128082275,
+      "learning_rate": 0.003,
+      "loss": 4.0803,
+      "step": 3201
+    },
+    {
+      "epoch": 0.03202,
+      "grad_norm": 0.4503386616706848,
+      "learning_rate": 0.003,
+      "loss": 4.1192,
+      "step": 3202
+    },
+    {
+      "epoch": 0.03203,
+      "grad_norm": 0.44623667001724243,
+      "learning_rate": 0.003,
+      "loss": 4.1033,
+      "step": 3203
+    },
+    {
+      "epoch": 0.03204,
+      "grad_norm": 0.4461396634578705,
+      "learning_rate": 0.003,
+      "loss": 4.1193,
+      "step": 3204
+    },
+    {
+      "epoch": 0.03205,
+      "grad_norm": 0.47087642550468445,
+      "learning_rate": 0.003,
+      "loss": 4.1152,
+      "step": 3205
+    },
+    {
+      "epoch": 0.03206,
+      "grad_norm": 0.4962460696697235,
+      "learning_rate": 0.003,
+      "loss": 4.1085,
+      "step": 3206
+    },
+    {
+      "epoch": 0.03207,
+      "grad_norm": 0.6064698100090027,
+      "learning_rate": 0.003,
+      "loss": 4.0901,
+      "step": 3207
+    },
+    {
+      "epoch": 0.03208,
+      "grad_norm": 0.759096622467041,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 3208
+    },
+    {
+      "epoch": 0.03209,
+      "grad_norm": 0.7810276746749878,
+      "learning_rate": 0.003,
+      "loss": 4.1579,
+      "step": 3209
+    },
+    {
+      "epoch": 0.0321,
+      "grad_norm": 0.7179496884346008,
+      "learning_rate": 0.003,
+      "loss": 4.1363,
+      "step": 3210
+    },
+    {
+      "epoch": 0.03211,
+      "grad_norm": 0.7148532867431641,
+      "learning_rate": 0.003,
+      "loss": 4.1099,
+      "step": 3211
+    },
+    {
+      "epoch": 0.03212,
+      "grad_norm": 0.7214945554733276,
+      "learning_rate": 0.003,
+      "loss": 4.1198,
+      "step": 3212
+    },
+    {
+      "epoch": 0.03213,
+      "grad_norm": 0.7809195518493652,
+      "learning_rate": 0.003,
+      "loss": 4.1251,
+      "step": 3213
+    },
+    {
+      "epoch": 0.03214,
+      "grad_norm": 0.9291434288024902,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 3214
+    },
+    {
+      "epoch": 0.03215,
+      "grad_norm": 0.9452233910560608,
+      "learning_rate": 0.003,
+      "loss": 4.1164,
+      "step": 3215
+    },
+    {
+      "epoch": 0.03216,
+      "grad_norm": 0.8493017554283142,
+      "learning_rate": 0.003,
+      "loss": 4.125,
+      "step": 3216
+    },
+    {
+      "epoch": 0.03217,
+      "grad_norm": 0.7973296046257019,
+      "learning_rate": 0.003,
+      "loss": 4.1215,
+      "step": 3217
+    },
+    {
+      "epoch": 0.03218,
+      "grad_norm": 0.7930925488471985,
+      "learning_rate": 0.003,
+      "loss": 4.1166,
+      "step": 3218
+    },
+    {
+      "epoch": 0.03219,
+      "grad_norm": 0.7060784101486206,
+      "learning_rate": 0.003,
+      "loss": 4.1215,
+      "step": 3219
+    },
+    {
+      "epoch": 0.0322,
+      "grad_norm": 0.6504276394844055,
+      "learning_rate": 0.003,
+      "loss": 4.1329,
+      "step": 3220
+    },
+    {
+      "epoch": 0.03221,
+      "grad_norm": 0.7263512015342712,
+      "learning_rate": 0.003,
+      "loss": 4.1414,
+      "step": 3221
+    },
+    {
+      "epoch": 0.03222,
+      "grad_norm": 0.7812563180923462,
+      "learning_rate": 0.003,
+      "loss": 4.1338,
+      "step": 3222
+    },
+    {
+      "epoch": 0.03223,
+      "grad_norm": 0.718917727470398,
+      "learning_rate": 0.003,
+      "loss": 4.1374,
+      "step": 3223
+    },
+    {
+      "epoch": 0.03224,
+      "grad_norm": 0.6486620306968689,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 3224
+    },
+    {
+      "epoch": 0.03225,
+      "grad_norm": 0.633409857749939,
+      "learning_rate": 0.003,
+      "loss": 4.1275,
+      "step": 3225
+    },
+    {
+      "epoch": 0.03226,
+      "grad_norm": 0.6219311356544495,
+      "learning_rate": 0.003,
+      "loss": 4.1074,
+      "step": 3226
+    },
+    {
+      "epoch": 0.03227,
+      "grad_norm": 0.6718069911003113,
+      "learning_rate": 0.003,
+      "loss": 4.1299,
+      "step": 3227
+    },
+    {
+      "epoch": 0.03228,
+      "grad_norm": 0.6600261926651001,
+      "learning_rate": 0.003,
+      "loss": 4.1046,
+      "step": 3228
+    },
+    {
+      "epoch": 0.03229,
+      "grad_norm": 0.7300699353218079,
+      "learning_rate": 0.003,
+      "loss": 4.1237,
+      "step": 3229
+    },
+    {
+      "epoch": 0.0323,
+      "grad_norm": 0.8185812830924988,
+      "learning_rate": 0.003,
+      "loss": 4.1184,
+      "step": 3230
+    },
+    {
+      "epoch": 0.03231,
+      "grad_norm": 0.8448948860168457,
+      "learning_rate": 0.003,
+      "loss": 4.1359,
+      "step": 3231
+    },
+    {
+      "epoch": 0.03232,
+      "grad_norm": 0.8037069439888,
+      "learning_rate": 0.003,
+      "loss": 4.1226,
+      "step": 3232
+    },
+    {
+      "epoch": 0.03233,
+      "grad_norm": 0.6786398887634277,
+      "learning_rate": 0.003,
+      "loss": 4.1384,
+      "step": 3233
+    },
+    {
+      "epoch": 0.03234,
+      "grad_norm": 0.6006938219070435,
+      "learning_rate": 0.003,
+      "loss": 4.1167,
+      "step": 3234
+    },
+    {
+      "epoch": 0.03235,
+      "grad_norm": 0.651039183139801,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 3235
+    },
+    {
+      "epoch": 0.03236,
+      "grad_norm": 0.635384202003479,
+      "learning_rate": 0.003,
+      "loss": 4.1167,
+      "step": 3236
+    },
+    {
+      "epoch": 0.03237,
+      "grad_norm": 0.6635240912437439,
+      "learning_rate": 0.003,
+      "loss": 4.1401,
+      "step": 3237
+    },
+    {
+      "epoch": 0.03238,
+      "grad_norm": 0.6811038851737976,
+      "learning_rate": 0.003,
+      "loss": 4.1468,
+      "step": 3238
+    },
+    {
+      "epoch": 0.03239,
+      "grad_norm": 0.7615686655044556,
+      "learning_rate": 0.003,
+      "loss": 4.1295,
+      "step": 3239
+    },
+    {
+      "epoch": 0.0324,
+      "grad_norm": 0.8041428923606873,
+      "learning_rate": 0.003,
+      "loss": 4.151,
+      "step": 3240
+    },
+    {
+      "epoch": 0.03241,
+      "grad_norm": 0.8027405738830566,
+      "learning_rate": 0.003,
+      "loss": 4.1432,
+      "step": 3241
+    },
+    {
+      "epoch": 0.03242,
+      "grad_norm": 0.9180432558059692,
+      "learning_rate": 0.003,
+      "loss": 4.1256,
+      "step": 3242
+    },
+    {
+      "epoch": 0.03243,
+      "grad_norm": 0.9836052060127258,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 3243
+    },
+    {
+      "epoch": 0.03244,
+      "grad_norm": 0.9347205758094788,
+      "learning_rate": 0.003,
+      "loss": 4.1385,
+      "step": 3244
+    },
+    {
+      "epoch": 0.03245,
+      "grad_norm": 1.1407207250595093,
+      "learning_rate": 0.003,
+      "loss": 4.1692,
+      "step": 3245
+    },
+    {
+      "epoch": 0.03246,
+      "grad_norm": 0.7863346934318542,
+      "learning_rate": 0.003,
+      "loss": 4.1416,
+      "step": 3246
+    },
+    {
+      "epoch": 0.03247,
+      "grad_norm": 0.6830831170082092,
+      "learning_rate": 0.003,
+      "loss": 4.1458,
+      "step": 3247
+    },
+    {
+      "epoch": 0.03248,
+      "grad_norm": 0.6007001996040344,
+      "learning_rate": 0.003,
+      "loss": 4.1605,
+      "step": 3248
+    },
+    {
+      "epoch": 0.03249,
+      "grad_norm": 0.5720266103744507,
+      "learning_rate": 0.003,
+      "loss": 4.1242,
+      "step": 3249
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.5476762056350708,
+      "learning_rate": 0.003,
+      "loss": 4.1536,
+      "step": 3250
+    },
+    {
+      "epoch": 0.03251,
+      "grad_norm": 0.5844308137893677,
+      "learning_rate": 0.003,
+      "loss": 4.1498,
+      "step": 3251
+    },
+    {
+      "epoch": 0.03252,
+      "grad_norm": 0.6554009318351746,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 3252
+    },
+    {
+      "epoch": 0.03253,
+      "grad_norm": 0.7417848110198975,
+      "learning_rate": 0.003,
+      "loss": 4.1203,
+      "step": 3253
+    },
+    {
+      "epoch": 0.03254,
+      "grad_norm": 0.7940112352371216,
+      "learning_rate": 0.003,
+      "loss": 4.1189,
+      "step": 3254
+    },
+    {
+      "epoch": 0.03255,
+      "grad_norm": 0.7129653096199036,
+      "learning_rate": 0.003,
+      "loss": 4.1292,
+      "step": 3255
+    },
+    {
+      "epoch": 0.03256,
+      "grad_norm": 0.5819328427314758,
+      "learning_rate": 0.003,
+      "loss": 4.1213,
+      "step": 3256
+    },
+    {
+      "epoch": 0.03257,
+      "grad_norm": 0.4663737118244171,
+      "learning_rate": 0.003,
+      "loss": 4.1081,
+      "step": 3257
+    },
+    {
+      "epoch": 0.03258,
+      "grad_norm": 0.423995703458786,
+      "learning_rate": 0.003,
+      "loss": 4.1155,
+      "step": 3258
+    },
+    {
+      "epoch": 0.03259,
+      "grad_norm": 0.43535032868385315,
+      "learning_rate": 0.003,
+      "loss": 4.1178,
+      "step": 3259
+    },
+    {
+      "epoch": 0.0326,
+      "grad_norm": 0.39455822110176086,
+      "learning_rate": 0.003,
+      "loss": 4.1154,
+      "step": 3260
+    },
+    {
+      "epoch": 0.03261,
+      "grad_norm": 0.40695109963417053,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 3261
+    },
+    {
+      "epoch": 0.03262,
+      "grad_norm": 0.3774632513523102,
+      "learning_rate": 0.003,
+      "loss": 4.113,
+      "step": 3262
+    },
+    {
+      "epoch": 0.03263,
+      "grad_norm": 0.46376833319664,
+      "learning_rate": 0.003,
+      "loss": 4.1225,
+      "step": 3263
+    },
+    {
+      "epoch": 0.03264,
+      "grad_norm": 0.5567134618759155,
+      "learning_rate": 0.003,
+      "loss": 4.136,
+      "step": 3264
+    },
+    {
+      "epoch": 0.03265,
+      "grad_norm": 0.6502901315689087,
+      "learning_rate": 0.003,
+      "loss": 4.1336,
+      "step": 3265
+    },
+    {
+      "epoch": 0.03266,
+      "grad_norm": 0.8112798929214478,
+      "learning_rate": 0.003,
+      "loss": 4.0905,
+      "step": 3266
+    },
+    {
+      "epoch": 0.03267,
+      "grad_norm": 0.9981924295425415,
+      "learning_rate": 0.003,
+      "loss": 4.1139,
+      "step": 3267
+    },
+    {
+      "epoch": 0.03268,
+      "grad_norm": 0.9973412752151489,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 3268
+    },
+    {
+      "epoch": 0.03269,
+      "grad_norm": 0.8641186356544495,
+      "learning_rate": 0.003,
+      "loss": 4.1296,
+      "step": 3269
+    },
+    {
+      "epoch": 0.0327,
+      "grad_norm": 0.7888898253440857,
+      "learning_rate": 0.003,
+      "loss": 4.1558,
+      "step": 3270
+    },
+    {
+      "epoch": 0.03271,
+      "grad_norm": 0.8556457757949829,
+      "learning_rate": 0.003,
+      "loss": 4.1218,
+      "step": 3271
+    },
+    {
+      "epoch": 0.03272,
+      "grad_norm": 0.8728947639465332,
+      "learning_rate": 0.003,
+      "loss": 4.1311,
+      "step": 3272
+    },
+    {
+      "epoch": 0.03273,
+      "grad_norm": 0.8285180926322937,
+      "learning_rate": 0.003,
+      "loss": 4.133,
+      "step": 3273
+    },
+    {
+      "epoch": 0.03274,
+      "grad_norm": 0.6249201893806458,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 3274
+    },
+    {
+      "epoch": 0.03275,
+      "grad_norm": 0.5604618787765503,
+      "learning_rate": 0.003,
+      "loss": 4.1388,
+      "step": 3275
+    },
+    {
+      "epoch": 0.03276,
+      "grad_norm": 0.6792795062065125,
+      "learning_rate": 0.003,
+      "loss": 4.1194,
+      "step": 3276
+    },
+    {
+      "epoch": 0.03277,
+      "grad_norm": 0.7676330804824829,
+      "learning_rate": 0.003,
+      "loss": 4.1239,
+      "step": 3277
+    },
+    {
+      "epoch": 0.03278,
+      "grad_norm": 0.8335491418838501,
+      "learning_rate": 0.003,
+      "loss": 4.138,
+      "step": 3278
+    },
+    {
+      "epoch": 0.03279,
+      "grad_norm": 0.7494176626205444,
+      "learning_rate": 0.003,
+      "loss": 4.1244,
+      "step": 3279
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.5933589935302734,
+      "learning_rate": 0.003,
+      "loss": 4.1027,
+      "step": 3280
+    },
+    {
+      "epoch": 0.03281,
+      "grad_norm": 0.49470627307891846,
+      "learning_rate": 0.003,
+      "loss": 4.1456,
+      "step": 3281
+    },
+    {
+      "epoch": 0.03282,
+      "grad_norm": 0.5273627638816833,
+      "learning_rate": 0.003,
+      "loss": 4.1014,
+      "step": 3282
+    },
+    {
+      "epoch": 0.03283,
+      "grad_norm": 0.5509713292121887,
+      "learning_rate": 0.003,
+      "loss": 4.0972,
+      "step": 3283
+    },
+    {
+      "epoch": 0.03284,
+      "grad_norm": 0.544580340385437,
+      "learning_rate": 0.003,
+      "loss": 4.1383,
+      "step": 3284
+    },
+    {
+      "epoch": 0.03285,
+      "grad_norm": 0.5744161009788513,
+      "learning_rate": 0.003,
+      "loss": 4.0849,
+      "step": 3285
+    },
+    {
+      "epoch": 0.03286,
+      "grad_norm": 0.531225860118866,
+      "learning_rate": 0.003,
+      "loss": 4.1237,
+      "step": 3286
+    },
+    {
+      "epoch": 0.03287,
+      "grad_norm": 0.49495846033096313,
+      "learning_rate": 0.003,
+      "loss": 4.13,
+      "step": 3287
+    },
+    {
+      "epoch": 0.03288,
+      "grad_norm": 0.5203666687011719,
+      "learning_rate": 0.003,
+      "loss": 4.1111,
+      "step": 3288
+    },
+    {
+      "epoch": 0.03289,
+      "grad_norm": 0.5833369493484497,
+      "learning_rate": 0.003,
+      "loss": 4.1168,
+      "step": 3289
+    },
+    {
+      "epoch": 0.0329,
+      "grad_norm": 0.6156966686248779,
+      "learning_rate": 0.003,
+      "loss": 4.1476,
+      "step": 3290
+    },
+    {
+      "epoch": 0.03291,
+      "grad_norm": 0.7284314036369324,
+      "learning_rate": 0.003,
+      "loss": 4.1346,
+      "step": 3291
+    },
+    {
+      "epoch": 0.03292,
+      "grad_norm": 0.8301785588264465,
+      "learning_rate": 0.003,
+      "loss": 4.1017,
+      "step": 3292
+    },
+    {
+      "epoch": 0.03293,
+      "grad_norm": 0.8880192041397095,
+      "learning_rate": 0.003,
+      "loss": 4.1255,
+      "step": 3293
+    },
+    {
+      "epoch": 0.03294,
+      "grad_norm": 0.7777594923973083,
+      "learning_rate": 0.003,
+      "loss": 4.1268,
+      "step": 3294
+    },
+    {
+      "epoch": 0.03295,
+      "grad_norm": 0.7415675520896912,
+      "learning_rate": 0.003,
+      "loss": 4.1178,
+      "step": 3295
+    },
+    {
+      "epoch": 0.03296,
+      "grad_norm": 0.5945268273353577,
+      "learning_rate": 0.003,
+      "loss": 4.1056,
+      "step": 3296
+    },
+    {
+      "epoch": 0.03297,
+      "grad_norm": 0.6904322504997253,
+      "learning_rate": 0.003,
+      "loss": 4.1182,
+      "step": 3297
+    },
+    {
+      "epoch": 0.03298,
+      "grad_norm": 0.8207030892372131,
+      "learning_rate": 0.003,
+      "loss": 4.1412,
+      "step": 3298
+    },
+    {
+      "epoch": 0.03299,
+      "grad_norm": 1.0657931566238403,
+      "learning_rate": 0.003,
+      "loss": 4.1346,
+      "step": 3299
+    },
+    {
+      "epoch": 0.033,
+      "grad_norm": 0.9290387034416199,
+      "learning_rate": 0.003,
+      "loss": 4.1222,
+      "step": 3300
+    },
+    {
+      "epoch": 0.03301,
+      "grad_norm": 0.6823791861534119,
+      "learning_rate": 0.003,
+      "loss": 4.1464,
+      "step": 3301
+    },
+    {
+      "epoch": 0.03302,
+      "grad_norm": 0.5951451659202576,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 3302
+    },
+    {
+      "epoch": 0.03303,
+      "grad_norm": 0.5956482887268066,
+      "learning_rate": 0.003,
+      "loss": 4.1233,
+      "step": 3303
+    },
+    {
+      "epoch": 0.03304,
+      "grad_norm": 0.5632296204566956,
+      "learning_rate": 0.003,
+      "loss": 4.1161,
+      "step": 3304
+    },
+    {
+      "epoch": 0.03305,
+      "grad_norm": 0.5975555181503296,
+      "learning_rate": 0.003,
+      "loss": 4.1217,
+      "step": 3305
+    },
+    {
+      "epoch": 0.03306,
+      "grad_norm": 0.5866029262542725,
+      "learning_rate": 0.003,
+      "loss": 4.1387,
+      "step": 3306
+    },
+    {
+      "epoch": 0.03307,
+      "grad_norm": 0.586901843547821,
+      "learning_rate": 0.003,
+      "loss": 4.1345,
+      "step": 3307
+    },
+    {
+      "epoch": 0.03308,
+      "grad_norm": 0.5834401249885559,
+      "learning_rate": 0.003,
+      "loss": 4.1153,
+      "step": 3308
+    },
+    {
+      "epoch": 0.03309,
+      "grad_norm": 0.6751212477684021,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 3309
+    },
+    {
+      "epoch": 0.0331,
+      "grad_norm": 0.5830533504486084,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 3310
+    },
+    {
+      "epoch": 0.03311,
+      "grad_norm": 0.6124199628829956,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 3311
+    },
+    {
+      "epoch": 0.03312,
+      "grad_norm": 0.5900564193725586,
+      "learning_rate": 0.003,
+      "loss": 4.0968,
+      "step": 3312
+    },
+    {
+      "epoch": 0.03313,
+      "grad_norm": 0.6442069411277771,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 3313
+    },
+    {
+      "epoch": 0.03314,
+      "grad_norm": 0.7764161229133606,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 3314
+    },
+    {
+      "epoch": 0.03315,
+      "grad_norm": 0.8021034002304077,
+      "learning_rate": 0.003,
+      "loss": 4.13,
+      "step": 3315
+    },
+    {
+      "epoch": 0.03316,
+      "grad_norm": 0.8197445869445801,
+      "learning_rate": 0.003,
+      "loss": 4.1343,
+      "step": 3316
+    },
+    {
+      "epoch": 0.03317,
+      "grad_norm": 0.7437990307807922,
+      "learning_rate": 0.003,
+      "loss": 4.1238,
+      "step": 3317
+    },
+    {
+      "epoch": 0.03318,
+      "grad_norm": 0.7505268454551697,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 3318
+    },
+    {
+      "epoch": 0.03319,
+      "grad_norm": 0.7677083611488342,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 3319
+    },
+    {
+      "epoch": 0.0332,
+      "grad_norm": 0.8221232891082764,
+      "learning_rate": 0.003,
+      "loss": 4.1277,
+      "step": 3320
+    },
+    {
+      "epoch": 0.03321,
+      "grad_norm": 0.8141359090805054,
+      "learning_rate": 0.003,
+      "loss": 4.1433,
+      "step": 3321
+    },
+    {
+      "epoch": 0.03322,
+      "grad_norm": 0.7189428806304932,
+      "learning_rate": 0.003,
+      "loss": 4.0938,
+      "step": 3322
+    },
+    {
+      "epoch": 0.03323,
+      "grad_norm": 0.6044638156890869,
+      "learning_rate": 0.003,
+      "loss": 4.1329,
+      "step": 3323
+    },
+    {
+      "epoch": 0.03324,
+      "grad_norm": 0.5276358127593994,
+      "learning_rate": 0.003,
+      "loss": 4.1001,
+      "step": 3324
+    },
+    {
+      "epoch": 0.03325,
+      "grad_norm": 0.5766726136207581,
+      "learning_rate": 0.003,
+      "loss": 4.134,
+      "step": 3325
+    },
+    {
+      "epoch": 0.03326,
+      "grad_norm": 0.6931579113006592,
+      "learning_rate": 0.003,
+      "loss": 4.0949,
+      "step": 3326
+    },
+    {
+      "epoch": 0.03327,
+      "grad_norm": 0.8419184684753418,
+      "learning_rate": 0.003,
+      "loss": 4.1308,
+      "step": 3327
+    },
+    {
+      "epoch": 0.03328,
+      "grad_norm": 0.82012540102005,
+      "learning_rate": 0.003,
+      "loss": 4.1529,
+      "step": 3328
+    },
+    {
+      "epoch": 0.03329,
+      "grad_norm": 0.6669595837593079,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 3329
+    },
+    {
+      "epoch": 0.0333,
+      "grad_norm": 0.6745646595954895,
+      "learning_rate": 0.003,
+      "loss": 4.1079,
+      "step": 3330
+    },
+    {
+      "epoch": 0.03331,
+      "grad_norm": 0.7668302655220032,
+      "learning_rate": 0.003,
+      "loss": 4.1109,
+      "step": 3331
+    },
+    {
+      "epoch": 0.03332,
+      "grad_norm": 0.7806735634803772,
+      "learning_rate": 0.003,
+      "loss": 4.1058,
+      "step": 3332
+    },
+    {
+      "epoch": 0.03333,
+      "grad_norm": 0.8616372346878052,
+      "learning_rate": 0.003,
+      "loss": 4.0992,
+      "step": 3333
+    },
+    {
+      "epoch": 0.03334,
+      "grad_norm": 0.8143258690834045,
+      "learning_rate": 0.003,
+      "loss": 4.1291,
+      "step": 3334
+    },
+    {
+      "epoch": 0.03335,
+      "grad_norm": 0.7264704704284668,
+      "learning_rate": 0.003,
+      "loss": 4.1332,
+      "step": 3335
+    },
+    {
+      "epoch": 0.03336,
+      "grad_norm": 0.7028628587722778,
+      "learning_rate": 0.003,
+      "loss": 4.134,
+      "step": 3336
+    },
+    {
+      "epoch": 0.03337,
+      "grad_norm": 0.7172381281852722,
+      "learning_rate": 0.003,
+      "loss": 4.1099,
+      "step": 3337
+    },
+    {
+      "epoch": 0.03338,
+      "grad_norm": 0.7452136278152466,
+      "learning_rate": 0.003,
+      "loss": 4.1223,
+      "step": 3338
+    },
+    {
+      "epoch": 0.03339,
+      "grad_norm": 0.6151822209358215,
+      "learning_rate": 0.003,
+      "loss": 4.122,
+      "step": 3339
+    },
+    {
+      "epoch": 0.0334,
+      "grad_norm": 0.48978814482688904,
+      "learning_rate": 0.003,
+      "loss": 4.0957,
+      "step": 3340
+    },
+    {
+      "epoch": 0.03341,
+      "grad_norm": 0.5616298317909241,
+      "learning_rate": 0.003,
+      "loss": 4.1166,
+      "step": 3341
+    },
+    {
+      "epoch": 0.03342,
+      "grad_norm": 0.5989934802055359,
+      "learning_rate": 0.003,
+      "loss": 4.1047,
+      "step": 3342
+    },
+    {
+      "epoch": 0.03343,
+      "grad_norm": 0.6791746616363525,
+      "learning_rate": 0.003,
+      "loss": 4.0819,
+      "step": 3343
+    },
+    {
+      "epoch": 0.03344,
+      "grad_norm": 0.8041501641273499,
+      "learning_rate": 0.003,
+      "loss": 4.139,
+      "step": 3344
+    },
+    {
+      "epoch": 0.03345,
+      "grad_norm": 0.9025998711585999,
+      "learning_rate": 0.003,
+      "loss": 4.1226,
+      "step": 3345
+    },
+    {
+      "epoch": 0.03346,
+      "grad_norm": 0.8547161221504211,
+      "learning_rate": 0.003,
+      "loss": 4.1382,
+      "step": 3346
+    },
+    {
+      "epoch": 0.03347,
+      "grad_norm": 0.8230910301208496,
+      "learning_rate": 0.003,
+      "loss": 4.1211,
+      "step": 3347
+    },
+    {
+      "epoch": 0.03348,
+      "grad_norm": 0.6764378547668457,
+      "learning_rate": 0.003,
+      "loss": 4.1086,
+      "step": 3348
+    },
+    {
+      "epoch": 0.03349,
+      "grad_norm": 0.6756121516227722,
+      "learning_rate": 0.003,
+      "loss": 4.1537,
+      "step": 3349
+    },
+    {
+      "epoch": 0.0335,
+      "grad_norm": 0.90619957447052,
+      "learning_rate": 0.003,
+      "loss": 4.1174,
+      "step": 3350
+    },
+    {
+      "epoch": 0.03351,
+      "grad_norm": 0.995084822177887,
+      "learning_rate": 0.003,
+      "loss": 4.1296,
+      "step": 3351
+    },
+    {
+      "epoch": 0.03352,
+      "grad_norm": 0.9932165741920471,
+      "learning_rate": 0.003,
+      "loss": 4.1476,
+      "step": 3352
+    },
+    {
+      "epoch": 0.03353,
+      "grad_norm": 0.7695693373680115,
+      "learning_rate": 0.003,
+      "loss": 4.144,
+      "step": 3353
+    },
+    {
+      "epoch": 0.03354,
+      "grad_norm": 0.757735013961792,
+      "learning_rate": 0.003,
+      "loss": 4.1066,
+      "step": 3354
+    },
+    {
+      "epoch": 0.03355,
+      "grad_norm": 0.7958679795265198,
+      "learning_rate": 0.003,
+      "loss": 4.1313,
+      "step": 3355
+    },
+    {
+      "epoch": 0.03356,
+      "grad_norm": 0.8041818141937256,
+      "learning_rate": 0.003,
+      "loss": 4.1556,
+      "step": 3356
+    },
+    {
+      "epoch": 0.03357,
+      "grad_norm": 0.6908081769943237,
+      "learning_rate": 0.003,
+      "loss": 4.1258,
+      "step": 3357
+    },
+    {
+      "epoch": 0.03358,
+      "grad_norm": 0.5655892491340637,
+      "learning_rate": 0.003,
+      "loss": 4.1399,
+      "step": 3358
+    },
+    {
+      "epoch": 0.03359,
+      "grad_norm": 0.6353917717933655,
+      "learning_rate": 0.003,
+      "loss": 4.1288,
+      "step": 3359
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.6901819705963135,
+      "learning_rate": 0.003,
+      "loss": 4.1421,
+      "step": 3360
+    },
+    {
+      "epoch": 0.03361,
+      "grad_norm": 0.6260518431663513,
+      "learning_rate": 0.003,
+      "loss": 4.1414,
+      "step": 3361
+    },
+    {
+      "epoch": 0.03362,
+      "grad_norm": 0.5238988399505615,
+      "learning_rate": 0.003,
+      "loss": 4.132,
+      "step": 3362
+    },
+    {
+      "epoch": 0.03363,
+      "grad_norm": 0.4695539176464081,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 3363
+    },
+    {
+      "epoch": 0.03364,
+      "grad_norm": 0.4180772602558136,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 3364
+    },
+    {
+      "epoch": 0.03365,
+      "grad_norm": 0.46931684017181396,
+      "learning_rate": 0.003,
+      "loss": 4.098,
+      "step": 3365
+    },
+    {
+      "epoch": 0.03366,
+      "grad_norm": 0.4340416491031647,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 3366
+    },
+    {
+      "epoch": 0.03367,
+      "grad_norm": 0.3824833929538727,
+      "learning_rate": 0.003,
+      "loss": 4.0882,
+      "step": 3367
+    },
+    {
+      "epoch": 0.03368,
+      "grad_norm": 0.391565203666687,
+      "learning_rate": 0.003,
+      "loss": 4.1353,
+      "step": 3368
+    },
+    {
+      "epoch": 0.03369,
+      "grad_norm": 0.3979616165161133,
+      "learning_rate": 0.003,
+      "loss": 4.093,
+      "step": 3369
+    },
+    {
+      "epoch": 0.0337,
+      "grad_norm": 0.41400259733200073,
+      "learning_rate": 0.003,
+      "loss": 4.1266,
+      "step": 3370
+    },
+    {
+      "epoch": 0.03371,
+      "grad_norm": 0.4535299241542816,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 3371
+    },
+    {
+      "epoch": 0.03372,
+      "grad_norm": 0.4900164008140564,
+      "learning_rate": 0.003,
+      "loss": 4.091,
+      "step": 3372
+    },
+    {
+      "epoch": 0.03373,
+      "grad_norm": 0.48760801553726196,
+      "learning_rate": 0.003,
+      "loss": 4.124,
+      "step": 3373
+    },
+    {
+      "epoch": 0.03374,
+      "grad_norm": 0.42620474100112915,
+      "learning_rate": 0.003,
+      "loss": 4.1035,
+      "step": 3374
+    },
+    {
+      "epoch": 0.03375,
+      "grad_norm": 0.539411187171936,
+      "learning_rate": 0.003,
+      "loss": 4.0937,
+      "step": 3375
+    },
+    {
+      "epoch": 0.03376,
+      "grad_norm": 0.7004280686378479,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 3376
+    },
+    {
+      "epoch": 0.03377,
+      "grad_norm": 0.9512114524841309,
+      "learning_rate": 0.003,
+      "loss": 4.1273,
+      "step": 3377
+    },
+    {
+      "epoch": 0.03378,
+      "grad_norm": 1.173850178718567,
+      "learning_rate": 0.003,
+      "loss": 4.1444,
+      "step": 3378
+    },
+    {
+      "epoch": 0.03379,
+      "grad_norm": 0.8208590149879456,
+      "learning_rate": 0.003,
+      "loss": 4.1088,
+      "step": 3379
+    },
+    {
+      "epoch": 0.0338,
+      "grad_norm": 0.9167894721031189,
+      "learning_rate": 0.003,
+      "loss": 4.138,
+      "step": 3380
+    },
+    {
+      "epoch": 0.03381,
+      "grad_norm": 0.8736410140991211,
+      "learning_rate": 0.003,
+      "loss": 4.1428,
+      "step": 3381
+    },
+    {
+      "epoch": 0.03382,
+      "grad_norm": 0.9413759112358093,
+      "learning_rate": 0.003,
+      "loss": 4.1405,
+      "step": 3382
+    },
+    {
+      "epoch": 0.03383,
+      "grad_norm": 0.9034369587898254,
+      "learning_rate": 0.003,
+      "loss": 4.102,
+      "step": 3383
+    },
+    {
+      "epoch": 0.03384,
+      "grad_norm": 0.7863962650299072,
+      "learning_rate": 0.003,
+      "loss": 4.1256,
+      "step": 3384
+    },
+    {
+      "epoch": 0.03385,
+      "grad_norm": 0.6559106707572937,
+      "learning_rate": 0.003,
+      "loss": 4.1243,
+      "step": 3385
+    },
+    {
+      "epoch": 0.03386,
+      "grad_norm": 0.5466395020484924,
+      "learning_rate": 0.003,
+      "loss": 4.116,
+      "step": 3386
+    },
+    {
+      "epoch": 0.03387,
+      "grad_norm": 0.5657241344451904,
+      "learning_rate": 0.003,
+      "loss": 4.1195,
+      "step": 3387
+    },
+    {
+      "epoch": 0.03388,
+      "grad_norm": 0.6148836016654968,
+      "learning_rate": 0.003,
+      "loss": 4.1344,
+      "step": 3388
+    },
+    {
+      "epoch": 0.03389,
+      "grad_norm": 0.6511346101760864,
+      "learning_rate": 0.003,
+      "loss": 4.1482,
+      "step": 3389
+    },
+    {
+      "epoch": 0.0339,
+      "grad_norm": 0.7301871180534363,
+      "learning_rate": 0.003,
+      "loss": 4.14,
+      "step": 3390
+    },
+    {
+      "epoch": 0.03391,
+      "grad_norm": 0.7425968647003174,
+      "learning_rate": 0.003,
+      "loss": 4.1178,
+      "step": 3391
+    },
+    {
+      "epoch": 0.03392,
+      "grad_norm": 0.8300238251686096,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 3392
+    },
+    {
+      "epoch": 0.03393,
+      "grad_norm": 0.9328773617744446,
+      "learning_rate": 0.003,
+      "loss": 4.1432,
+      "step": 3393
+    },
+    {
+      "epoch": 0.03394,
+      "grad_norm": 0.8949936628341675,
+      "learning_rate": 0.003,
+      "loss": 4.145,
+      "step": 3394
+    },
+    {
+      "epoch": 0.03395,
+      "grad_norm": 0.9411712884902954,
+      "learning_rate": 0.003,
+      "loss": 4.1549,
+      "step": 3395
+    },
+    {
+      "epoch": 0.03396,
+      "grad_norm": 0.9026573896408081,
+      "learning_rate": 0.003,
+      "loss": 4.1395,
+      "step": 3396
+    },
+    {
+      "epoch": 0.03397,
+      "grad_norm": 0.8574052453041077,
+      "learning_rate": 0.003,
+      "loss": 4.1065,
+      "step": 3397
+    },
+    {
+      "epoch": 0.03398,
+      "grad_norm": 0.9310076236724854,
+      "learning_rate": 0.003,
+      "loss": 4.1626,
+      "step": 3398
+    },
+    {
+      "epoch": 0.03399,
+      "grad_norm": 0.9323184490203857,
+      "learning_rate": 0.003,
+      "loss": 4.1821,
+      "step": 3399
+    },
+    {
+      "epoch": 0.034,
+      "grad_norm": 0.996589720249176,
+      "learning_rate": 0.003,
+      "loss": 4.1643,
+      "step": 3400
+    },
+    {
+      "epoch": 0.03401,
+      "grad_norm": 0.8762171268463135,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 3401
+    },
+    {
+      "epoch": 0.03402,
+      "grad_norm": 0.7612597942352295,
+      "learning_rate": 0.003,
+      "loss": 4.162,
+      "step": 3402
+    },
+    {
+      "epoch": 0.03403,
+      "grad_norm": 0.7073224782943726,
+      "learning_rate": 0.003,
+      "loss": 4.1193,
+      "step": 3403
+    },
+    {
+      "epoch": 0.03404,
+      "grad_norm": 0.8644006848335266,
+      "learning_rate": 0.003,
+      "loss": 4.1576,
+      "step": 3404
+    },
+    {
+      "epoch": 0.03405,
+      "grad_norm": 0.9565165638923645,
+      "learning_rate": 0.003,
+      "loss": 4.1456,
+      "step": 3405
+    },
+    {
+      "epoch": 0.03406,
+      "grad_norm": 0.9732779264450073,
+      "learning_rate": 0.003,
+      "loss": 4.1351,
+      "step": 3406
+    },
+    {
+      "epoch": 0.03407,
+      "grad_norm": 0.8747183084487915,
+      "learning_rate": 0.003,
+      "loss": 4.1492,
+      "step": 3407
+    },
+    {
+      "epoch": 0.03408,
+      "grad_norm": 0.8171975612640381,
+      "learning_rate": 0.003,
+      "loss": 4.1301,
+      "step": 3408
+    },
+    {
+      "epoch": 0.03409,
+      "grad_norm": 0.8472719788551331,
+      "learning_rate": 0.003,
+      "loss": 4.139,
+      "step": 3409
+    },
+    {
+      "epoch": 0.0341,
+      "grad_norm": 0.772420346736908,
+      "learning_rate": 0.003,
+      "loss": 4.1296,
+      "step": 3410
+    },
+    {
+      "epoch": 0.03411,
+      "grad_norm": 0.7122489809989929,
+      "learning_rate": 0.003,
+      "loss": 4.1412,
+      "step": 3411
+    },
+    {
+      "epoch": 0.03412,
+      "grad_norm": 0.7397792339324951,
+      "learning_rate": 0.003,
+      "loss": 4.1595,
+      "step": 3412
+    },
+    {
+      "epoch": 0.03413,
+      "grad_norm": 0.5002322793006897,
+      "learning_rate": 0.003,
+      "loss": 4.1603,
+      "step": 3413
+    },
+    {
+      "epoch": 0.03414,
+      "grad_norm": 0.533718466758728,
+      "learning_rate": 0.003,
+      "loss": 4.1493,
+      "step": 3414
+    },
+    {
+      "epoch": 0.03415,
+      "grad_norm": 0.5009657144546509,
+      "learning_rate": 0.003,
+      "loss": 4.1281,
+      "step": 3415
+    },
+    {
+      "epoch": 0.03416,
+      "grad_norm": 0.49424099922180176,
+      "learning_rate": 0.003,
+      "loss": 4.1289,
+      "step": 3416
+    },
+    {
+      "epoch": 0.03417,
+      "grad_norm": 0.5071776509284973,
+      "learning_rate": 0.003,
+      "loss": 4.1225,
+      "step": 3417
+    },
+    {
+      "epoch": 0.03418,
+      "grad_norm": 0.600383996963501,
+      "learning_rate": 0.003,
+      "loss": 4.0956,
+      "step": 3418
+    },
+    {
+      "epoch": 0.03419,
+      "grad_norm": 0.5728192329406738,
+      "learning_rate": 0.003,
+      "loss": 4.1155,
+      "step": 3419
+    },
+    {
+      "epoch": 0.0342,
+      "grad_norm": 0.5360076427459717,
+      "learning_rate": 0.003,
+      "loss": 4.1246,
+      "step": 3420
+    },
+    {
+      "epoch": 0.03421,
+      "grad_norm": 0.5283966660499573,
+      "learning_rate": 0.003,
+      "loss": 4.1131,
+      "step": 3421
+    },
+    {
+      "epoch": 0.03422,
+      "grad_norm": 0.5383856892585754,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 3422
+    },
+    {
+      "epoch": 0.03423,
+      "grad_norm": 0.6316571831703186,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 3423
+    },
+    {
+      "epoch": 0.03424,
+      "grad_norm": 0.7047085762023926,
+      "learning_rate": 0.003,
+      "loss": 4.1375,
+      "step": 3424
+    },
+    {
+      "epoch": 0.03425,
+      "grad_norm": 0.7236632704734802,
+      "learning_rate": 0.003,
+      "loss": 4.1018,
+      "step": 3425
+    },
+    {
+      "epoch": 0.03426,
+      "grad_norm": 0.6646005511283875,
+      "learning_rate": 0.003,
+      "loss": 4.1234,
+      "step": 3426
+    },
+    {
+      "epoch": 0.03427,
+      "grad_norm": 0.5403919816017151,
+      "learning_rate": 0.003,
+      "loss": 4.0894,
+      "step": 3427
+    },
+    {
+      "epoch": 0.03428,
+      "grad_norm": 0.5291489958763123,
+      "learning_rate": 0.003,
+      "loss": 4.0887,
+      "step": 3428
+    },
+    {
+      "epoch": 0.03429,
+      "grad_norm": 0.5223709940910339,
+      "learning_rate": 0.003,
+      "loss": 4.1048,
+      "step": 3429
+    },
+    {
+      "epoch": 0.0343,
+      "grad_norm": 0.5112771987915039,
+      "learning_rate": 0.003,
+      "loss": 4.0996,
+      "step": 3430
+    },
+    {
+      "epoch": 0.03431,
+      "grad_norm": 0.4984135031700134,
+      "learning_rate": 0.003,
+      "loss": 4.0918,
+      "step": 3431
+    },
+    {
+      "epoch": 0.03432,
+      "grad_norm": 0.6651791334152222,
+      "learning_rate": 0.003,
+      "loss": 4.1044,
+      "step": 3432
+    },
+    {
+      "epoch": 0.03433,
+      "grad_norm": 0.8698033094406128,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 3433
+    },
+    {
+      "epoch": 0.03434,
+      "grad_norm": 0.9080641865730286,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 3434
+    },
+    {
+      "epoch": 0.03435,
+      "grad_norm": 0.6624755263328552,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 3435
+    },
+    {
+      "epoch": 0.03436,
+      "grad_norm": 0.5985552668571472,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 3436
+    },
+    {
+      "epoch": 0.03437,
+      "grad_norm": 0.74169921875,
+      "learning_rate": 0.003,
+      "loss": 4.1061,
+      "step": 3437
+    },
+    {
+      "epoch": 0.03438,
+      "grad_norm": 0.6739313006401062,
+      "learning_rate": 0.003,
+      "loss": 4.1204,
+      "step": 3438
+    },
+    {
+      "epoch": 0.03439,
+      "grad_norm": 0.6183229684829712,
+      "learning_rate": 0.003,
+      "loss": 4.1164,
+      "step": 3439
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.6372612118721008,
+      "learning_rate": 0.003,
+      "loss": 4.0928,
+      "step": 3440
+    },
+    {
+      "epoch": 0.03441,
+      "grad_norm": 0.5561286211013794,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 3441
+    },
+    {
+      "epoch": 0.03442,
+      "grad_norm": 0.5146679282188416,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 3442
+    },
+    {
+      "epoch": 0.03443,
+      "grad_norm": 0.5086862444877625,
+      "learning_rate": 0.003,
+      "loss": 4.108,
+      "step": 3443
+    },
+    {
+      "epoch": 0.03444,
+      "grad_norm": 0.5029706358909607,
+      "learning_rate": 0.003,
+      "loss": 4.117,
+      "step": 3444
+    },
+    {
+      "epoch": 0.03445,
+      "grad_norm": 0.5076051950454712,
+      "learning_rate": 0.003,
+      "loss": 4.0821,
+      "step": 3445
+    },
+    {
+      "epoch": 0.03446,
+      "grad_norm": 0.5356073379516602,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 3446
+    },
+    {
+      "epoch": 0.03447,
+      "grad_norm": 0.5907533764839172,
+      "learning_rate": 0.003,
+      "loss": 4.1219,
+      "step": 3447
+    },
+    {
+      "epoch": 0.03448,
+      "grad_norm": 0.6191877722740173,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 3448
+    },
+    {
+      "epoch": 0.03449,
+      "grad_norm": 0.6186185479164124,
+      "learning_rate": 0.003,
+      "loss": 4.0808,
+      "step": 3449
+    },
+    {
+      "epoch": 0.0345,
+      "grad_norm": 0.6868272423744202,
+      "learning_rate": 0.003,
+      "loss": 4.119,
+      "step": 3450
+    },
+    {
+      "epoch": 0.03451,
+      "grad_norm": 0.5868905186653137,
+      "learning_rate": 0.003,
+      "loss": 4.0896,
+      "step": 3451
+    },
+    {
+      "epoch": 0.03452,
+      "grad_norm": 0.5602003931999207,
+      "learning_rate": 0.003,
+      "loss": 4.1187,
+      "step": 3452
+    },
+    {
+      "epoch": 0.03453,
+      "grad_norm": 0.6827725172042847,
+      "learning_rate": 0.003,
+      "loss": 4.0989,
+      "step": 3453
+    },
+    {
+      "epoch": 0.03454,
+      "grad_norm": 0.6988645792007446,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 3454
+    },
+    {
+      "epoch": 0.03455,
+      "grad_norm": 0.7984029650688171,
+      "learning_rate": 0.003,
+      "loss": 4.0883,
+      "step": 3455
+    },
+    {
+      "epoch": 0.03456,
+      "grad_norm": 0.8589499592781067,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 3456
+    },
+    {
+      "epoch": 0.03457,
+      "grad_norm": 0.8572336435317993,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 3457
+    },
+    {
+      "epoch": 0.03458,
+      "grad_norm": 0.7515364289283752,
+      "learning_rate": 0.003,
+      "loss": 4.1174,
+      "step": 3458
+    },
+    {
+      "epoch": 0.03459,
+      "grad_norm": 0.7732160091400146,
+      "learning_rate": 0.003,
+      "loss": 4.1453,
+      "step": 3459
+    },
+    {
+      "epoch": 0.0346,
+      "grad_norm": 1.0194605588912964,
+      "learning_rate": 0.003,
+      "loss": 4.1595,
+      "step": 3460
+    },
+    {
+      "epoch": 0.03461,
+      "grad_norm": 1.2320705652236938,
+      "learning_rate": 0.003,
+      "loss": 4.1536,
+      "step": 3461
+    },
+    {
+      "epoch": 0.03462,
+      "grad_norm": 0.8935246467590332,
+      "learning_rate": 0.003,
+      "loss": 4.134,
+      "step": 3462
+    },
+    {
+      "epoch": 0.03463,
+      "grad_norm": 0.8400885462760925,
+      "learning_rate": 0.003,
+      "loss": 4.1144,
+      "step": 3463
+    },
+    {
+      "epoch": 0.03464,
+      "grad_norm": 0.8417510390281677,
+      "learning_rate": 0.003,
+      "loss": 4.1325,
+      "step": 3464
+    },
+    {
+      "epoch": 0.03465,
+      "grad_norm": 0.9221370220184326,
+      "learning_rate": 0.003,
+      "loss": 4.1429,
+      "step": 3465
+    },
+    {
+      "epoch": 0.03466,
+      "grad_norm": 0.9339236617088318,
+      "learning_rate": 0.003,
+      "loss": 4.1417,
+      "step": 3466
+    },
+    {
+      "epoch": 0.03467,
+      "grad_norm": 1.027878999710083,
+      "learning_rate": 0.003,
+      "loss": 4.1322,
+      "step": 3467
+    },
+    {
+      "epoch": 0.03468,
+      "grad_norm": 0.9010937213897705,
+      "learning_rate": 0.003,
+      "loss": 4.1478,
+      "step": 3468
+    },
+    {
+      "epoch": 0.03469,
+      "grad_norm": 1.0625158548355103,
+      "learning_rate": 0.003,
+      "loss": 4.1472,
+      "step": 3469
+    },
+    {
+      "epoch": 0.0347,
+      "grad_norm": 0.9459049701690674,
+      "learning_rate": 0.003,
+      "loss": 4.1579,
+      "step": 3470
+    },
+    {
+      "epoch": 0.03471,
+      "grad_norm": 0.8470619320869446,
+      "learning_rate": 0.003,
+      "loss": 4.1443,
+      "step": 3471
+    },
+    {
+      "epoch": 0.03472,
+      "grad_norm": 0.7621662020683289,
+      "learning_rate": 0.003,
+      "loss": 4.1408,
+      "step": 3472
+    },
+    {
+      "epoch": 0.03473,
+      "grad_norm": 0.7190437316894531,
+      "learning_rate": 0.003,
+      "loss": 4.1319,
+      "step": 3473
+    },
+    {
+      "epoch": 0.03474,
+      "grad_norm": 0.6625324487686157,
+      "learning_rate": 0.003,
+      "loss": 4.1605,
+      "step": 3474
+    },
+    {
+      "epoch": 0.03475,
+      "grad_norm": 0.5887710452079773,
+      "learning_rate": 0.003,
+      "loss": 4.1257,
+      "step": 3475
+    },
+    {
+      "epoch": 0.03476,
+      "grad_norm": 0.6497656106948853,
+      "learning_rate": 0.003,
+      "loss": 4.1307,
+      "step": 3476
+    },
+    {
+      "epoch": 0.03477,
+      "grad_norm": 0.6939454078674316,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 3477
+    },
+    {
+      "epoch": 0.03478,
+      "grad_norm": 0.7538445591926575,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 3478
+    },
+    {
+      "epoch": 0.03479,
+      "grad_norm": 0.8461519479751587,
+      "learning_rate": 0.003,
+      "loss": 4.1151,
+      "step": 3479
+    },
+    {
+      "epoch": 0.0348,
+      "grad_norm": 0.9156594276428223,
+      "learning_rate": 0.003,
+      "loss": 4.1158,
+      "step": 3480
+    },
+    {
+      "epoch": 0.03481,
+      "grad_norm": 0.830324649810791,
+      "learning_rate": 0.003,
+      "loss": 4.1316,
+      "step": 3481
+    },
+    {
+      "epoch": 0.03482,
+      "grad_norm": 0.6343877911567688,
+      "learning_rate": 0.003,
+      "loss": 4.1326,
+      "step": 3482
+    },
+    {
+      "epoch": 0.03483,
+      "grad_norm": 0.5798615217208862,
+      "learning_rate": 0.003,
+      "loss": 4.1065,
+      "step": 3483
+    },
+    {
+      "epoch": 0.03484,
+      "grad_norm": 0.6012097001075745,
+      "learning_rate": 0.003,
+      "loss": 4.1507,
+      "step": 3484
+    },
+    {
+      "epoch": 0.03485,
+      "grad_norm": 0.534057080745697,
+      "learning_rate": 0.003,
+      "loss": 4.14,
+      "step": 3485
+    },
+    {
+      "epoch": 0.03486,
+      "grad_norm": 0.5091946721076965,
+      "learning_rate": 0.003,
+      "loss": 4.1087,
+      "step": 3486
+    },
+    {
+      "epoch": 0.03487,
+      "grad_norm": 0.45456182956695557,
+      "learning_rate": 0.003,
+      "loss": 4.1009,
+      "step": 3487
+    },
+    {
+      "epoch": 0.03488,
+      "grad_norm": 0.4439913034439087,
+      "learning_rate": 0.003,
+      "loss": 4.0973,
+      "step": 3488
+    },
+    {
+      "epoch": 0.03489,
+      "grad_norm": 0.48145756125450134,
+      "learning_rate": 0.003,
+      "loss": 4.1128,
+      "step": 3489
+    },
+    {
+      "epoch": 0.0349,
+      "grad_norm": 0.543171763420105,
+      "learning_rate": 0.003,
+      "loss": 4.1103,
+      "step": 3490
+    },
+    {
+      "epoch": 0.03491,
+      "grad_norm": 0.4846112132072449,
+      "learning_rate": 0.003,
+      "loss": 4.1225,
+      "step": 3491
+    },
+    {
+      "epoch": 0.03492,
+      "grad_norm": 0.4532843828201294,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 3492
+    },
+    {
+      "epoch": 0.03493,
+      "grad_norm": 0.452328622341156,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 3493
+    },
+    {
+      "epoch": 0.03494,
+      "grad_norm": 0.43499302864074707,
+      "learning_rate": 0.003,
+      "loss": 4.0996,
+      "step": 3494
+    },
+    {
+      "epoch": 0.03495,
+      "grad_norm": 0.3597606122493744,
+      "learning_rate": 0.003,
+      "loss": 4.1056,
+      "step": 3495
+    },
+    {
+      "epoch": 0.03496,
+      "grad_norm": 0.36562833189964294,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 3496
+    },
+    {
+      "epoch": 0.03497,
+      "grad_norm": 0.3867475986480713,
+      "learning_rate": 0.003,
+      "loss": 4.1148,
+      "step": 3497
+    },
+    {
+      "epoch": 0.03498,
+      "grad_norm": 0.42188137769699097,
+      "learning_rate": 0.003,
+      "loss": 4.0881,
+      "step": 3498
+    },
+    {
+      "epoch": 0.03499,
+      "grad_norm": 0.5299180746078491,
+      "learning_rate": 0.003,
+      "loss": 4.0947,
+      "step": 3499
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.6962635517120361,
+      "learning_rate": 0.003,
+      "loss": 4.1169,
+      "step": 3500
+    },
+    {
+      "epoch": 0.03501,
+      "grad_norm": 0.9552702307701111,
+      "learning_rate": 0.003,
+      "loss": 4.1206,
+      "step": 3501
+    },
+    {
+      "epoch": 0.03502,
+      "grad_norm": 1.1519496440887451,
+      "learning_rate": 0.003,
+      "loss": 4.1306,
+      "step": 3502
+    },
+    {
+      "epoch": 0.03503,
+      "grad_norm": 0.6880185604095459,
+      "learning_rate": 0.003,
+      "loss": 4.1312,
+      "step": 3503
+    },
+    {
+      "epoch": 0.03504,
+      "grad_norm": 0.5892576575279236,
+      "learning_rate": 0.003,
+      "loss": 4.1025,
+      "step": 3504
+    },
+    {
+      "epoch": 0.03505,
+      "grad_norm": 0.6985999345779419,
+      "learning_rate": 0.003,
+      "loss": 4.1215,
+      "step": 3505
+    },
+    {
+      "epoch": 0.03506,
+      "grad_norm": 0.7576223611831665,
+      "learning_rate": 0.003,
+      "loss": 4.1306,
+      "step": 3506
+    },
+    {
+      "epoch": 0.03507,
+      "grad_norm": 0.6434160470962524,
+      "learning_rate": 0.003,
+      "loss": 4.1089,
+      "step": 3507
+    },
+    {
+      "epoch": 0.03508,
+      "grad_norm": 0.6596179604530334,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 3508
+    },
+    {
+      "epoch": 0.03509,
+      "grad_norm": 0.6995351314544678,
+      "learning_rate": 0.003,
+      "loss": 4.1043,
+      "step": 3509
+    },
+    {
+      "epoch": 0.0351,
+      "grad_norm": 0.6974444389343262,
+      "learning_rate": 0.003,
+      "loss": 4.0959,
+      "step": 3510
+    },
+    {
+      "epoch": 0.03511,
+      "grad_norm": 0.6773884296417236,
+      "learning_rate": 0.003,
+      "loss": 4.1118,
+      "step": 3511
+    },
+    {
+      "epoch": 0.03512,
+      "grad_norm": 0.6800361275672913,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 3512
+    },
+    {
+      "epoch": 0.03513,
+      "grad_norm": 0.6331777572631836,
+      "learning_rate": 0.003,
+      "loss": 4.1017,
+      "step": 3513
+    },
+    {
+      "epoch": 0.03514,
+      "grad_norm": 0.6398311853408813,
+      "learning_rate": 0.003,
+      "loss": 4.1107,
+      "step": 3514
+    },
+    {
+      "epoch": 0.03515,
+      "grad_norm": 0.652564525604248,
+      "learning_rate": 0.003,
+      "loss": 4.123,
+      "step": 3515
+    },
+    {
+      "epoch": 0.03516,
+      "grad_norm": 0.6278548836708069,
+      "learning_rate": 0.003,
+      "loss": 4.1476,
+      "step": 3516
+    },
+    {
+      "epoch": 0.03517,
+      "grad_norm": 0.5660470128059387,
+      "learning_rate": 0.003,
+      "loss": 4.0959,
+      "step": 3517
+    },
+    {
+      "epoch": 0.03518,
+      "grad_norm": 0.6680671572685242,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 3518
+    },
+    {
+      "epoch": 0.03519,
+      "grad_norm": 0.8297867178916931,
+      "learning_rate": 0.003,
+      "loss": 4.1339,
+      "step": 3519
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 1.0089613199234009,
+      "learning_rate": 0.003,
+      "loss": 4.1251,
+      "step": 3520
+    },
+    {
+      "epoch": 0.03521,
+      "grad_norm": 0.8898510336875916,
+      "learning_rate": 0.003,
+      "loss": 4.1207,
+      "step": 3521
+    },
+    {
+      "epoch": 0.03522,
+      "grad_norm": 0.7937150001525879,
+      "learning_rate": 0.003,
+      "loss": 4.1173,
+      "step": 3522
+    },
+    {
+      "epoch": 0.03523,
+      "grad_norm": 0.9151129126548767,
+      "learning_rate": 0.003,
+      "loss": 4.0883,
+      "step": 3523
+    },
+    {
+      "epoch": 0.03524,
+      "grad_norm": 1.016127109527588,
+      "learning_rate": 0.003,
+      "loss": 4.1424,
+      "step": 3524
+    },
+    {
+      "epoch": 0.03525,
+      "grad_norm": 0.8876500129699707,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 3525
+    },
+    {
+      "epoch": 0.03526,
+      "grad_norm": 0.9732938408851624,
+      "learning_rate": 0.003,
+      "loss": 4.1249,
+      "step": 3526
+    },
+    {
+      "epoch": 0.03527,
+      "grad_norm": 0.9365341067314148,
+      "learning_rate": 0.003,
+      "loss": 4.1567,
+      "step": 3527
+    },
+    {
+      "epoch": 0.03528,
+      "grad_norm": 0.7851080298423767,
+      "learning_rate": 0.003,
+      "loss": 4.1377,
+      "step": 3528
+    },
+    {
+      "epoch": 0.03529,
+      "grad_norm": 0.7726654410362244,
+      "learning_rate": 0.003,
+      "loss": 4.1279,
+      "step": 3529
+    },
+    {
+      "epoch": 0.0353,
+      "grad_norm": 0.8136371374130249,
+      "learning_rate": 0.003,
+      "loss": 4.1286,
+      "step": 3530
+    },
+    {
+      "epoch": 0.03531,
+      "grad_norm": 0.9276587963104248,
+      "learning_rate": 0.003,
+      "loss": 4.146,
+      "step": 3531
+    },
+    {
+      "epoch": 0.03532,
+      "grad_norm": 1.0662339925765991,
+      "learning_rate": 0.003,
+      "loss": 4.1553,
+      "step": 3532
+    },
+    {
+      "epoch": 0.03533,
+      "grad_norm": 0.9835387468338013,
+      "learning_rate": 0.003,
+      "loss": 4.154,
+      "step": 3533
+    },
+    {
+      "epoch": 0.03534,
+      "grad_norm": 0.8469735383987427,
+      "learning_rate": 0.003,
+      "loss": 4.1438,
+      "step": 3534
+    },
+    {
+      "epoch": 0.03535,
+      "grad_norm": 0.8011348247528076,
+      "learning_rate": 0.003,
+      "loss": 4.1115,
+      "step": 3535
+    },
+    {
+      "epoch": 0.03536,
+      "grad_norm": 0.7366276383399963,
+      "learning_rate": 0.003,
+      "loss": 4.113,
+      "step": 3536
+    },
+    {
+      "epoch": 0.03537,
+      "grad_norm": 0.7694060802459717,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 3537
+    },
+    {
+      "epoch": 0.03538,
+      "grad_norm": 0.7417018413543701,
+      "learning_rate": 0.003,
+      "loss": 4.1239,
+      "step": 3538
+    },
+    {
+      "epoch": 0.03539,
+      "grad_norm": 0.6806791424751282,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 3539
+    },
+    {
+      "epoch": 0.0354,
+      "grad_norm": 0.7284207940101624,
+      "learning_rate": 0.003,
+      "loss": 4.1493,
+      "step": 3540
+    },
+    {
+      "epoch": 0.03541,
+      "grad_norm": 0.8043148517608643,
+      "learning_rate": 0.003,
+      "loss": 4.1355,
+      "step": 3541
+    },
+    {
+      "epoch": 0.03542,
+      "grad_norm": 0.9069803357124329,
+      "learning_rate": 0.003,
+      "loss": 4.121,
+      "step": 3542
+    },
+    {
+      "epoch": 0.03543,
+      "grad_norm": 0.8361301422119141,
+      "learning_rate": 0.003,
+      "loss": 4.1305,
+      "step": 3543
+    },
+    {
+      "epoch": 0.03544,
+      "grad_norm": 0.6755576133728027,
+      "learning_rate": 0.003,
+      "loss": 4.1316,
+      "step": 3544
+    },
+    {
+      "epoch": 0.03545,
+      "grad_norm": 0.6478848457336426,
+      "learning_rate": 0.003,
+      "loss": 4.0998,
+      "step": 3545
+    },
+    {
+      "epoch": 0.03546,
+      "grad_norm": 0.6325247287750244,
+      "learning_rate": 0.003,
+      "loss": 4.1169,
+      "step": 3546
+    },
+    {
+      "epoch": 0.03547,
+      "grad_norm": 0.551747739315033,
+      "learning_rate": 0.003,
+      "loss": 4.1233,
+      "step": 3547
+    },
+    {
+      "epoch": 0.03548,
+      "grad_norm": 0.550105094909668,
+      "learning_rate": 0.003,
+      "loss": 4.1261,
+      "step": 3548
+    },
+    {
+      "epoch": 0.03549,
+      "grad_norm": 0.5281518697738647,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 3549
+    },
+    {
+      "epoch": 0.0355,
+      "grad_norm": 0.5446397066116333,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 3550
+    },
+    {
+      "epoch": 0.03551,
+      "grad_norm": 0.5065484046936035,
+      "learning_rate": 0.003,
+      "loss": 4.1297,
+      "step": 3551
+    },
+    {
+      "epoch": 0.03552,
+      "grad_norm": 0.5231883525848389,
+      "learning_rate": 0.003,
+      "loss": 4.0898,
+      "step": 3552
+    },
+    {
+      "epoch": 0.03553,
+      "grad_norm": 0.5496423840522766,
+      "learning_rate": 0.003,
+      "loss": 4.1036,
+      "step": 3553
+    },
+    {
+      "epoch": 0.03554,
+      "grad_norm": 0.5717417597770691,
+      "learning_rate": 0.003,
+      "loss": 4.1073,
+      "step": 3554
+    },
+    {
+      "epoch": 0.03555,
+      "grad_norm": 0.5578296184539795,
+      "learning_rate": 0.003,
+      "loss": 4.1184,
+      "step": 3555
+    },
+    {
+      "epoch": 0.03556,
+      "grad_norm": 0.5329107642173767,
+      "learning_rate": 0.003,
+      "loss": 4.0988,
+      "step": 3556
+    },
+    {
+      "epoch": 0.03557,
+      "grad_norm": 0.6033030152320862,
+      "learning_rate": 0.003,
+      "loss": 4.1004,
+      "step": 3557
+    },
+    {
+      "epoch": 0.03558,
+      "grad_norm": 0.7457759380340576,
+      "learning_rate": 0.003,
+      "loss": 4.135,
+      "step": 3558
+    },
+    {
+      "epoch": 0.03559,
+      "grad_norm": 0.9078201055526733,
+      "learning_rate": 0.003,
+      "loss": 4.1229,
+      "step": 3559
+    },
+    {
+      "epoch": 0.0356,
+      "grad_norm": 0.848138689994812,
+      "learning_rate": 0.003,
+      "loss": 4.1262,
+      "step": 3560
+    },
+    {
+      "epoch": 0.03561,
+      "grad_norm": 0.6350159645080566,
+      "learning_rate": 0.003,
+      "loss": 4.1425,
+      "step": 3561
+    },
+    {
+      "epoch": 0.03562,
+      "grad_norm": 0.6772452592849731,
+      "learning_rate": 0.003,
+      "loss": 4.1085,
+      "step": 3562
+    },
+    {
+      "epoch": 0.03563,
+      "grad_norm": 0.7804591655731201,
+      "learning_rate": 0.003,
+      "loss": 4.1453,
+      "step": 3563
+    },
+    {
+      "epoch": 0.03564,
+      "grad_norm": 0.725709080696106,
+      "learning_rate": 0.003,
+      "loss": 4.1193,
+      "step": 3564
+    },
+    {
+      "epoch": 0.03565,
+      "grad_norm": 0.6160607933998108,
+      "learning_rate": 0.003,
+      "loss": 4.1088,
+      "step": 3565
+    },
+    {
+      "epoch": 0.03566,
+      "grad_norm": 0.6539621353149414,
+      "learning_rate": 0.003,
+      "loss": 4.1136,
+      "step": 3566
+    },
+    {
+      "epoch": 0.03567,
+      "grad_norm": 0.7242776155471802,
+      "learning_rate": 0.003,
+      "loss": 4.1107,
+      "step": 3567
+    },
+    {
+      "epoch": 0.03568,
+      "grad_norm": 0.623746395111084,
+      "learning_rate": 0.003,
+      "loss": 4.1043,
+      "step": 3568
+    },
+    {
+      "epoch": 0.03569,
+      "grad_norm": 0.570114016532898,
+      "learning_rate": 0.003,
+      "loss": 4.0963,
+      "step": 3569
+    },
+    {
+      "epoch": 0.0357,
+      "grad_norm": 0.5575445294380188,
+      "learning_rate": 0.003,
+      "loss": 4.098,
+      "step": 3570
+    },
+    {
+      "epoch": 0.03571,
+      "grad_norm": 0.5092292428016663,
+      "learning_rate": 0.003,
+      "loss": 4.1047,
+      "step": 3571
+    },
+    {
+      "epoch": 0.03572,
+      "grad_norm": 0.5417683720588684,
+      "learning_rate": 0.003,
+      "loss": 4.1243,
+      "step": 3572
+    },
+    {
+      "epoch": 0.03573,
+      "grad_norm": 0.5097172856330872,
+      "learning_rate": 0.003,
+      "loss": 4.0926,
+      "step": 3573
+    },
+    {
+      "epoch": 0.03574,
+      "grad_norm": 0.4699859917163849,
+      "learning_rate": 0.003,
+      "loss": 4.1165,
+      "step": 3574
+    },
+    {
+      "epoch": 0.03575,
+      "grad_norm": 0.47701337933540344,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 3575
+    },
+    {
+      "epoch": 0.03576,
+      "grad_norm": 0.48855581879615784,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 3576
+    },
+    {
+      "epoch": 0.03577,
+      "grad_norm": 0.5532284379005432,
+      "learning_rate": 0.003,
+      "loss": 4.1121,
+      "step": 3577
+    },
+    {
+      "epoch": 0.03578,
+      "grad_norm": 0.6470955610275269,
+      "learning_rate": 0.003,
+      "loss": 4.1044,
+      "step": 3578
+    },
+    {
+      "epoch": 0.03579,
+      "grad_norm": 0.7264450788497925,
+      "learning_rate": 0.003,
+      "loss": 4.1219,
+      "step": 3579
+    },
+    {
+      "epoch": 0.0358,
+      "grad_norm": 0.8104549646377563,
+      "learning_rate": 0.003,
+      "loss": 4.083,
+      "step": 3580
+    },
+    {
+      "epoch": 0.03581,
+      "grad_norm": 0.7132370471954346,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 3581
+    },
+    {
+      "epoch": 0.03582,
+      "grad_norm": 0.5359878540039062,
+      "learning_rate": 0.003,
+      "loss": 4.0911,
+      "step": 3582
+    },
+    {
+      "epoch": 0.03583,
+      "grad_norm": 0.45527881383895874,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 3583
+    },
+    {
+      "epoch": 0.03584,
+      "grad_norm": 0.5485943555831909,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 3584
+    },
+    {
+      "epoch": 0.03585,
+      "grad_norm": 0.5606132745742798,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 3585
+    },
+    {
+      "epoch": 0.03586,
+      "grad_norm": 0.5852906703948975,
+      "learning_rate": 0.003,
+      "loss": 4.1008,
+      "step": 3586
+    },
+    {
+      "epoch": 0.03587,
+      "grad_norm": 0.6464812159538269,
+      "learning_rate": 0.003,
+      "loss": 4.1013,
+      "step": 3587
+    },
+    {
+      "epoch": 0.03588,
+      "grad_norm": 0.6532333493232727,
+      "learning_rate": 0.003,
+      "loss": 4.1008,
+      "step": 3588
+    },
+    {
+      "epoch": 0.03589,
+      "grad_norm": 0.729999303817749,
+      "learning_rate": 0.003,
+      "loss": 4.1197,
+      "step": 3589
+    },
+    {
+      "epoch": 0.0359,
+      "grad_norm": 0.716708779335022,
+      "learning_rate": 0.003,
+      "loss": 4.0883,
+      "step": 3590
+    },
+    {
+      "epoch": 0.03591,
+      "grad_norm": 0.6319497227668762,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 3591
+    },
+    {
+      "epoch": 0.03592,
+      "grad_norm": 0.7355323433876038,
+      "learning_rate": 0.003,
+      "loss": 4.1029,
+      "step": 3592
+    },
+    {
+      "epoch": 0.03593,
+      "grad_norm": 0.74644935131073,
+      "learning_rate": 0.003,
+      "loss": 4.1041,
+      "step": 3593
+    },
+    {
+      "epoch": 0.03594,
+      "grad_norm": 0.7125880122184753,
+      "learning_rate": 0.003,
+      "loss": 4.1094,
+      "step": 3594
+    },
+    {
+      "epoch": 0.03595,
+      "grad_norm": 0.6679682731628418,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 3595
+    },
+    {
+      "epoch": 0.03596,
+      "grad_norm": 0.6546993851661682,
+      "learning_rate": 0.003,
+      "loss": 4.1358,
+      "step": 3596
+    },
+    {
+      "epoch": 0.03597,
+      "grad_norm": 0.6869426369667053,
+      "learning_rate": 0.003,
+      "loss": 4.1088,
+      "step": 3597
+    },
+    {
+      "epoch": 0.03598,
+      "grad_norm": 0.6870181560516357,
+      "learning_rate": 0.003,
+      "loss": 4.1155,
+      "step": 3598
+    },
+    {
+      "epoch": 0.03599,
+      "grad_norm": 0.6337956190109253,
+      "learning_rate": 0.003,
+      "loss": 4.1088,
+      "step": 3599
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.7691097855567932,
+      "learning_rate": 0.003,
+      "loss": 4.1371,
+      "step": 3600
+    },
+    {
+      "epoch": 0.03601,
+      "grad_norm": 0.938689649105072,
+      "learning_rate": 0.003,
+      "loss": 4.1168,
+      "step": 3601
+    },
+    {
+      "epoch": 0.03602,
+      "grad_norm": 1.0124480724334717,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 3602
+    },
+    {
+      "epoch": 0.03603,
+      "grad_norm": 1.0977243185043335,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 3603
+    },
+    {
+      "epoch": 0.03604,
+      "grad_norm": 0.7794393301010132,
+      "learning_rate": 0.003,
+      "loss": 4.131,
+      "step": 3604
+    },
+    {
+      "epoch": 0.03605,
+      "grad_norm": 0.5185660719871521,
+      "learning_rate": 0.003,
+      "loss": 4.114,
+      "step": 3605
+    },
+    {
+      "epoch": 0.03606,
+      "grad_norm": 0.5366716384887695,
+      "learning_rate": 0.003,
+      "loss": 4.0997,
+      "step": 3606
+    },
+    {
+      "epoch": 0.03607,
+      "grad_norm": 0.5930801630020142,
+      "learning_rate": 0.003,
+      "loss": 4.0952,
+      "step": 3607
+    },
+    {
+      "epoch": 0.03608,
+      "grad_norm": 0.5576140284538269,
+      "learning_rate": 0.003,
+      "loss": 4.0965,
+      "step": 3608
+    },
+    {
+      "epoch": 0.03609,
+      "grad_norm": 0.5525704622268677,
+      "learning_rate": 0.003,
+      "loss": 4.1239,
+      "step": 3609
+    },
+    {
+      "epoch": 0.0361,
+      "grad_norm": 0.5817636847496033,
+      "learning_rate": 0.003,
+      "loss": 4.126,
+      "step": 3610
+    },
+    {
+      "epoch": 0.03611,
+      "grad_norm": 0.5388094782829285,
+      "learning_rate": 0.003,
+      "loss": 4.0762,
+      "step": 3611
+    },
+    {
+      "epoch": 0.03612,
+      "grad_norm": 0.5407794117927551,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 3612
+    },
+    {
+      "epoch": 0.03613,
+      "grad_norm": 0.7358308434486389,
+      "learning_rate": 0.003,
+      "loss": 4.1212,
+      "step": 3613
+    },
+    {
+      "epoch": 0.03614,
+      "grad_norm": 0.7770156264305115,
+      "learning_rate": 0.003,
+      "loss": 4.0986,
+      "step": 3614
+    },
+    {
+      "epoch": 0.03615,
+      "grad_norm": 0.7676093578338623,
+      "learning_rate": 0.003,
+      "loss": 4.1123,
+      "step": 3615
+    },
+    {
+      "epoch": 0.03616,
+      "grad_norm": 0.8209259510040283,
+      "learning_rate": 0.003,
+      "loss": 4.1253,
+      "step": 3616
+    },
+    {
+      "epoch": 0.03617,
+      "grad_norm": 0.7589568495750427,
+      "learning_rate": 0.003,
+      "loss": 4.1202,
+      "step": 3617
+    },
+    {
+      "epoch": 0.03618,
+      "grad_norm": 0.7976088523864746,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 3618
+    },
+    {
+      "epoch": 0.03619,
+      "grad_norm": 0.7740630507469177,
+      "learning_rate": 0.003,
+      "loss": 4.1146,
+      "step": 3619
+    },
+    {
+      "epoch": 0.0362,
+      "grad_norm": 0.8788385391235352,
+      "learning_rate": 0.003,
+      "loss": 4.1243,
+      "step": 3620
+    },
+    {
+      "epoch": 0.03621,
+      "grad_norm": 1.0235563516616821,
+      "learning_rate": 0.003,
+      "loss": 4.1324,
+      "step": 3621
+    },
+    {
+      "epoch": 0.03622,
+      "grad_norm": 1.101138949394226,
+      "learning_rate": 0.003,
+      "loss": 4.1112,
+      "step": 3622
+    },
+    {
+      "epoch": 0.03623,
+      "grad_norm": 0.782410204410553,
+      "learning_rate": 0.003,
+      "loss": 4.0938,
+      "step": 3623
+    },
+    {
+      "epoch": 0.03624,
+      "grad_norm": 0.5812339186668396,
+      "learning_rate": 0.003,
+      "loss": 4.0954,
+      "step": 3624
+    },
+    {
+      "epoch": 0.03625,
+      "grad_norm": 0.679203987121582,
+      "learning_rate": 0.003,
+      "loss": 4.1317,
+      "step": 3625
+    },
+    {
+      "epoch": 0.03626,
+      "grad_norm": 0.7614912986755371,
+      "learning_rate": 0.003,
+      "loss": 4.1097,
+      "step": 3626
+    },
+    {
+      "epoch": 0.03627,
+      "grad_norm": 0.7893409729003906,
+      "learning_rate": 0.003,
+      "loss": 4.1109,
+      "step": 3627
+    },
+    {
+      "epoch": 0.03628,
+      "grad_norm": 0.6757506132125854,
+      "learning_rate": 0.003,
+      "loss": 4.1182,
+      "step": 3628
+    },
+    {
+      "epoch": 0.03629,
+      "grad_norm": 0.5744339227676392,
+      "learning_rate": 0.003,
+      "loss": 4.1332,
+      "step": 3629
+    },
+    {
+      "epoch": 0.0363,
+      "grad_norm": 0.5694675445556641,
+      "learning_rate": 0.003,
+      "loss": 4.1144,
+      "step": 3630
+    },
+    {
+      "epoch": 0.03631,
+      "grad_norm": 0.5684549808502197,
+      "learning_rate": 0.003,
+      "loss": 4.107,
+      "step": 3631
+    },
+    {
+      "epoch": 0.03632,
+      "grad_norm": 0.7418304085731506,
+      "learning_rate": 0.003,
+      "loss": 4.1039,
+      "step": 3632
+    },
+    {
+      "epoch": 0.03633,
+      "grad_norm": 0.8812902569770813,
+      "learning_rate": 0.003,
+      "loss": 4.1369,
+      "step": 3633
+    },
+    {
+      "epoch": 0.03634,
+      "grad_norm": 0.960989236831665,
+      "learning_rate": 0.003,
+      "loss": 4.1198,
+      "step": 3634
+    },
+    {
+      "epoch": 0.03635,
+      "grad_norm": 0.8940312266349792,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 3635
+    },
+    {
+      "epoch": 0.03636,
+      "grad_norm": 0.7907798886299133,
+      "learning_rate": 0.003,
+      "loss": 4.1228,
+      "step": 3636
+    },
+    {
+      "epoch": 0.03637,
+      "grad_norm": 0.7844222187995911,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 3637
+    },
+    {
+      "epoch": 0.03638,
+      "grad_norm": 0.8917554616928101,
+      "learning_rate": 0.003,
+      "loss": 4.1269,
+      "step": 3638
+    },
+    {
+      "epoch": 0.03639,
+      "grad_norm": 0.8074480891227722,
+      "learning_rate": 0.003,
+      "loss": 4.1473,
+      "step": 3639
+    },
+    {
+      "epoch": 0.0364,
+      "grad_norm": 0.9071139097213745,
+      "learning_rate": 0.003,
+      "loss": 4.149,
+      "step": 3640
+    },
+    {
+      "epoch": 0.03641,
+      "grad_norm": 0.749542236328125,
+      "learning_rate": 0.003,
+      "loss": 4.1275,
+      "step": 3641
+    },
+    {
+      "epoch": 0.03642,
+      "grad_norm": 0.7375038862228394,
+      "learning_rate": 0.003,
+      "loss": 4.1468,
+      "step": 3642
+    },
+    {
+      "epoch": 0.03643,
+      "grad_norm": 0.7622751593589783,
+      "learning_rate": 0.003,
+      "loss": 4.1104,
+      "step": 3643
+    },
+    {
+      "epoch": 0.03644,
+      "grad_norm": 0.7410848140716553,
+      "learning_rate": 0.003,
+      "loss": 4.1341,
+      "step": 3644
+    },
+    {
+      "epoch": 0.03645,
+      "grad_norm": 0.7992194890975952,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 3645
+    },
+    {
+      "epoch": 0.03646,
+      "grad_norm": 0.7577066421508789,
+      "learning_rate": 0.003,
+      "loss": 4.124,
+      "step": 3646
+    },
+    {
+      "epoch": 0.03647,
+      "grad_norm": 0.7054228782653809,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 3647
+    },
+    {
+      "epoch": 0.03648,
+      "grad_norm": 0.664077877998352,
+      "learning_rate": 0.003,
+      "loss": 4.1069,
+      "step": 3648
+    },
+    {
+      "epoch": 0.03649,
+      "grad_norm": 0.5441340208053589,
+      "learning_rate": 0.003,
+      "loss": 4.126,
+      "step": 3649
+    },
+    {
+      "epoch": 0.0365,
+      "grad_norm": 0.5746814012527466,
+      "learning_rate": 0.003,
+      "loss": 4.1046,
+      "step": 3650
+    },
+    {
+      "epoch": 0.03651,
+      "grad_norm": 0.6534304618835449,
+      "learning_rate": 0.003,
+      "loss": 4.1274,
+      "step": 3651
+    },
+    {
+      "epoch": 0.03652,
+      "grad_norm": 0.6664396524429321,
+      "learning_rate": 0.003,
+      "loss": 4.1084,
+      "step": 3652
+    },
+    {
+      "epoch": 0.03653,
+      "grad_norm": 0.6238181591033936,
+      "learning_rate": 0.003,
+      "loss": 4.0937,
+      "step": 3653
+    },
+    {
+      "epoch": 0.03654,
+      "grad_norm": 0.5265892148017883,
+      "learning_rate": 0.003,
+      "loss": 4.0921,
+      "step": 3654
+    },
+    {
+      "epoch": 0.03655,
+      "grad_norm": 0.4858669936656952,
+      "learning_rate": 0.003,
+      "loss": 4.1136,
+      "step": 3655
+    },
+    {
+      "epoch": 0.03656,
+      "grad_norm": 0.41965577006340027,
+      "learning_rate": 0.003,
+      "loss": 4.0995,
+      "step": 3656
+    },
+    {
+      "epoch": 0.03657,
+      "grad_norm": 0.4169410765171051,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 3657
+    },
+    {
+      "epoch": 0.03658,
+      "grad_norm": 0.4434378445148468,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 3658
+    },
+    {
+      "epoch": 0.03659,
+      "grad_norm": 0.4580453634262085,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 3659
+    },
+    {
+      "epoch": 0.0366,
+      "grad_norm": 0.4966508448123932,
+      "learning_rate": 0.003,
+      "loss": 4.1149,
+      "step": 3660
+    },
+    {
+      "epoch": 0.03661,
+      "grad_norm": 0.6382727026939392,
+      "learning_rate": 0.003,
+      "loss": 4.0967,
+      "step": 3661
+    },
+    {
+      "epoch": 0.03662,
+      "grad_norm": 0.8349222540855408,
+      "learning_rate": 0.003,
+      "loss": 4.0986,
+      "step": 3662
+    },
+    {
+      "epoch": 0.03663,
+      "grad_norm": 0.931993842124939,
+      "learning_rate": 0.003,
+      "loss": 4.1239,
+      "step": 3663
+    },
+    {
+      "epoch": 0.03664,
+      "grad_norm": 0.837236762046814,
+      "learning_rate": 0.003,
+      "loss": 4.1023,
+      "step": 3664
+    },
+    {
+      "epoch": 0.03665,
+      "grad_norm": 0.6735215783119202,
+      "learning_rate": 0.003,
+      "loss": 4.12,
+      "step": 3665
+    },
+    {
+      "epoch": 0.03666,
+      "grad_norm": 0.7142363786697388,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 3666
+    },
+    {
+      "epoch": 0.03667,
+      "grad_norm": 0.7672339081764221,
+      "learning_rate": 0.003,
+      "loss": 4.0907,
+      "step": 3667
+    },
+    {
+      "epoch": 0.03668,
+      "grad_norm": 0.660099446773529,
+      "learning_rate": 0.003,
+      "loss": 4.1026,
+      "step": 3668
+    },
+    {
+      "epoch": 0.03669,
+      "grad_norm": 0.664168119430542,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 3669
+    },
+    {
+      "epoch": 0.0367,
+      "grad_norm": 0.5606299042701721,
+      "learning_rate": 0.003,
+      "loss": 4.1162,
+      "step": 3670
+    },
+    {
+      "epoch": 0.03671,
+      "grad_norm": 0.5958951711654663,
+      "learning_rate": 0.003,
+      "loss": 4.1041,
+      "step": 3671
+    },
+    {
+      "epoch": 0.03672,
+      "grad_norm": 0.7330699563026428,
+      "learning_rate": 0.003,
+      "loss": 4.1123,
+      "step": 3672
+    },
+    {
+      "epoch": 0.03673,
+      "grad_norm": 0.7183411717414856,
+      "learning_rate": 0.003,
+      "loss": 4.1238,
+      "step": 3673
+    },
+    {
+      "epoch": 0.03674,
+      "grad_norm": 0.6538311243057251,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 3674
+    },
+    {
+      "epoch": 0.03675,
+      "grad_norm": 0.6396141052246094,
+      "learning_rate": 0.003,
+      "loss": 4.1215,
+      "step": 3675
+    },
+    {
+      "epoch": 0.03676,
+      "grad_norm": 0.5857633948326111,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 3676
+    },
+    {
+      "epoch": 0.03677,
+      "grad_norm": 0.5811484456062317,
+      "learning_rate": 0.003,
+      "loss": 4.0927,
+      "step": 3677
+    },
+    {
+      "epoch": 0.03678,
+      "grad_norm": 0.5666897892951965,
+      "learning_rate": 0.003,
+      "loss": 4.121,
+      "step": 3678
+    },
+    {
+      "epoch": 0.03679,
+      "grad_norm": 0.6068264245986938,
+      "learning_rate": 0.003,
+      "loss": 4.1091,
+      "step": 3679
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.6787545084953308,
+      "learning_rate": 0.003,
+      "loss": 4.1035,
+      "step": 3680
+    },
+    {
+      "epoch": 0.03681,
+      "grad_norm": 0.8176649212837219,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 3681
+    },
+    {
+      "epoch": 0.03682,
+      "grad_norm": 1.0537775754928589,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 3682
+    },
+    {
+      "epoch": 0.03683,
+      "grad_norm": 0.9726674556732178,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 3683
+    },
+    {
+      "epoch": 0.03684,
+      "grad_norm": 0.7920905351638794,
+      "learning_rate": 0.003,
+      "loss": 4.1132,
+      "step": 3684
+    },
+    {
+      "epoch": 0.03685,
+      "grad_norm": 0.6423739194869995,
+      "learning_rate": 0.003,
+      "loss": 4.094,
+      "step": 3685
+    },
+    {
+      "epoch": 0.03686,
+      "grad_norm": 0.6905859112739563,
+      "learning_rate": 0.003,
+      "loss": 4.1182,
+      "step": 3686
+    },
+    {
+      "epoch": 0.03687,
+      "grad_norm": 0.7596766948699951,
+      "learning_rate": 0.003,
+      "loss": 4.1374,
+      "step": 3687
+    },
+    {
+      "epoch": 0.03688,
+      "grad_norm": 0.7518109679222107,
+      "learning_rate": 0.003,
+      "loss": 4.1276,
+      "step": 3688
+    },
+    {
+      "epoch": 0.03689,
+      "grad_norm": 0.764574408531189,
+      "learning_rate": 0.003,
+      "loss": 4.0944,
+      "step": 3689
+    },
+    {
+      "epoch": 0.0369,
+      "grad_norm": 0.721488893032074,
+      "learning_rate": 0.003,
+      "loss": 4.1267,
+      "step": 3690
+    },
+    {
+      "epoch": 0.03691,
+      "grad_norm": 0.7047094702720642,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 3691
+    },
+    {
+      "epoch": 0.03692,
+      "grad_norm": 0.8295153975486755,
+      "learning_rate": 0.003,
+      "loss": 4.1489,
+      "step": 3692
+    },
+    {
+      "epoch": 0.03693,
+      "grad_norm": 0.899239182472229,
+      "learning_rate": 0.003,
+      "loss": 4.1229,
+      "step": 3693
+    },
+    {
+      "epoch": 0.03694,
+      "grad_norm": 0.9156467914581299,
+      "learning_rate": 0.003,
+      "loss": 4.1211,
+      "step": 3694
+    },
+    {
+      "epoch": 0.03695,
+      "grad_norm": 0.8504350781440735,
+      "learning_rate": 0.003,
+      "loss": 4.1158,
+      "step": 3695
+    },
+    {
+      "epoch": 0.03696,
+      "grad_norm": 0.7041043639183044,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 3696
+    },
+    {
+      "epoch": 0.03697,
+      "grad_norm": 0.643920361995697,
+      "learning_rate": 0.003,
+      "loss": 4.0985,
+      "step": 3697
+    },
+    {
+      "epoch": 0.03698,
+      "grad_norm": 0.7144352197647095,
+      "learning_rate": 0.003,
+      "loss": 4.1359,
+      "step": 3698
+    },
+    {
+      "epoch": 0.03699,
+      "grad_norm": 0.7363175749778748,
+      "learning_rate": 0.003,
+      "loss": 4.1234,
+      "step": 3699
+    },
+    {
+      "epoch": 0.037,
+      "grad_norm": 0.649755597114563,
+      "learning_rate": 0.003,
+      "loss": 4.1119,
+      "step": 3700
+    },
+    {
+      "epoch": 0.03701,
+      "grad_norm": 0.5577970743179321,
+      "learning_rate": 0.003,
+      "loss": 4.1128,
+      "step": 3701
+    },
+    {
+      "epoch": 0.03702,
+      "grad_norm": 0.5865651369094849,
+      "learning_rate": 0.003,
+      "loss": 4.1004,
+      "step": 3702
+    },
+    {
+      "epoch": 0.03703,
+      "grad_norm": 0.6238481402397156,
+      "learning_rate": 0.003,
+      "loss": 4.1129,
+      "step": 3703
+    },
+    {
+      "epoch": 0.03704,
+      "grad_norm": 0.630403995513916,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 3704
+    },
+    {
+      "epoch": 0.03705,
+      "grad_norm": 0.6488320231437683,
+      "learning_rate": 0.003,
+      "loss": 4.1311,
+      "step": 3705
+    },
+    {
+      "epoch": 0.03706,
+      "grad_norm": 0.6384865045547485,
+      "learning_rate": 0.003,
+      "loss": 4.1146,
+      "step": 3706
+    },
+    {
+      "epoch": 0.03707,
+      "grad_norm": 0.6757420897483826,
+      "learning_rate": 0.003,
+      "loss": 4.0963,
+      "step": 3707
+    },
+    {
+      "epoch": 0.03708,
+      "grad_norm": 0.7628651857376099,
+      "learning_rate": 0.003,
+      "loss": 4.1187,
+      "step": 3708
+    },
+    {
+      "epoch": 0.03709,
+      "grad_norm": 0.6610810160636902,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 3709
+    },
+    {
+      "epoch": 0.0371,
+      "grad_norm": 0.6568567156791687,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 3710
+    },
+    {
+      "epoch": 0.03711,
+      "grad_norm": 0.6123287677764893,
+      "learning_rate": 0.003,
+      "loss": 4.0969,
+      "step": 3711
+    },
+    {
+      "epoch": 0.03712,
+      "grad_norm": 0.5023053288459778,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 3712
+    },
+    {
+      "epoch": 0.03713,
+      "grad_norm": 0.561567485332489,
+      "learning_rate": 0.003,
+      "loss": 4.0848,
+      "step": 3713
+    },
+    {
+      "epoch": 0.03714,
+      "grad_norm": 0.6204001903533936,
+      "learning_rate": 0.003,
+      "loss": 4.0978,
+      "step": 3714
+    },
+    {
+      "epoch": 0.03715,
+      "grad_norm": 0.7001908421516418,
+      "learning_rate": 0.003,
+      "loss": 4.1084,
+      "step": 3715
+    },
+    {
+      "epoch": 0.03716,
+      "grad_norm": 0.7154534459114075,
+      "learning_rate": 0.003,
+      "loss": 4.1031,
+      "step": 3716
+    },
+    {
+      "epoch": 0.03717,
+      "grad_norm": 0.8945140242576599,
+      "learning_rate": 0.003,
+      "loss": 4.1087,
+      "step": 3717
+    },
+    {
+      "epoch": 0.03718,
+      "grad_norm": 0.9586088061332703,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 3718
+    },
+    {
+      "epoch": 0.03719,
+      "grad_norm": 0.8600691556930542,
+      "learning_rate": 0.003,
+      "loss": 4.1178,
+      "step": 3719
+    },
+    {
+      "epoch": 0.0372,
+      "grad_norm": 0.7804197669029236,
+      "learning_rate": 0.003,
+      "loss": 4.1204,
+      "step": 3720
+    },
+    {
+      "epoch": 0.03721,
+      "grad_norm": 0.9556355476379395,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 3721
+    },
+    {
+      "epoch": 0.03722,
+      "grad_norm": 0.9721794724464417,
+      "learning_rate": 0.003,
+      "loss": 4.1185,
+      "step": 3722
+    },
+    {
+      "epoch": 0.03723,
+      "grad_norm": 0.9020527005195618,
+      "learning_rate": 0.003,
+      "loss": 4.1534,
+      "step": 3723
+    },
+    {
+      "epoch": 0.03724,
+      "grad_norm": 0.7264366745948792,
+      "learning_rate": 0.003,
+      "loss": 4.1229,
+      "step": 3724
+    },
+    {
+      "epoch": 0.03725,
+      "grad_norm": 0.6015045046806335,
+      "learning_rate": 0.003,
+      "loss": 4.1288,
+      "step": 3725
+    },
+    {
+      "epoch": 0.03726,
+      "grad_norm": 0.5826119780540466,
+      "learning_rate": 0.003,
+      "loss": 4.1017,
+      "step": 3726
+    },
+    {
+      "epoch": 0.03727,
+      "grad_norm": 0.5803874731063843,
+      "learning_rate": 0.003,
+      "loss": 4.1349,
+      "step": 3727
+    },
+    {
+      "epoch": 0.03728,
+      "grad_norm": 0.5483809113502502,
+      "learning_rate": 0.003,
+      "loss": 4.0828,
+      "step": 3728
+    },
+    {
+      "epoch": 0.03729,
+      "grad_norm": 0.49769601225852966,
+      "learning_rate": 0.003,
+      "loss": 4.117,
+      "step": 3729
+    },
+    {
+      "epoch": 0.0373,
+      "grad_norm": 0.5014387965202332,
+      "learning_rate": 0.003,
+      "loss": 4.0982,
+      "step": 3730
+    },
+    {
+      "epoch": 0.03731,
+      "grad_norm": 0.5072956681251526,
+      "learning_rate": 0.003,
+      "loss": 4.1216,
+      "step": 3731
+    },
+    {
+      "epoch": 0.03732,
+      "grad_norm": 0.5933743119239807,
+      "learning_rate": 0.003,
+      "loss": 4.1209,
+      "step": 3732
+    },
+    {
+      "epoch": 0.03733,
+      "grad_norm": 0.644372820854187,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 3733
+    },
+    {
+      "epoch": 0.03734,
+      "grad_norm": 0.6615225076675415,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 3734
+    },
+    {
+      "epoch": 0.03735,
+      "grad_norm": 0.7122288942337036,
+      "learning_rate": 0.003,
+      "loss": 4.0961,
+      "step": 3735
+    },
+    {
+      "epoch": 0.03736,
+      "grad_norm": 0.7745519876480103,
+      "learning_rate": 0.003,
+      "loss": 4.1273,
+      "step": 3736
+    },
+    {
+      "epoch": 0.03737,
+      "grad_norm": 0.7451056241989136,
+      "learning_rate": 0.003,
+      "loss": 4.1138,
+      "step": 3737
+    },
+    {
+      "epoch": 0.03738,
+      "grad_norm": 0.6421622037887573,
+      "learning_rate": 0.003,
+      "loss": 4.127,
+      "step": 3738
+    },
+    {
+      "epoch": 0.03739,
+      "grad_norm": 0.6206536889076233,
+      "learning_rate": 0.003,
+      "loss": 4.0911,
+      "step": 3739
+    },
+    {
+      "epoch": 0.0374,
+      "grad_norm": 0.6064566373825073,
+      "learning_rate": 0.003,
+      "loss": 4.1106,
+      "step": 3740
+    },
+    {
+      "epoch": 0.03741,
+      "grad_norm": 0.726904034614563,
+      "learning_rate": 0.003,
+      "loss": 4.0978,
+      "step": 3741
+    },
+    {
+      "epoch": 0.03742,
+      "grad_norm": 0.8068590760231018,
+      "learning_rate": 0.003,
+      "loss": 4.0937,
+      "step": 3742
+    },
+    {
+      "epoch": 0.03743,
+      "grad_norm": 0.6390416622161865,
+      "learning_rate": 0.003,
+      "loss": 4.0884,
+      "step": 3743
+    },
+    {
+      "epoch": 0.03744,
+      "grad_norm": 0.6150771975517273,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 3744
+    },
+    {
+      "epoch": 0.03745,
+      "grad_norm": 0.664985716342926,
+      "learning_rate": 0.003,
+      "loss": 4.1024,
+      "step": 3745
+    },
+    {
+      "epoch": 0.03746,
+      "grad_norm": 0.7317749857902527,
+      "learning_rate": 0.003,
+      "loss": 4.1139,
+      "step": 3746
+    },
+    {
+      "epoch": 0.03747,
+      "grad_norm": 0.7559656500816345,
+      "learning_rate": 0.003,
+      "loss": 4.119,
+      "step": 3747
+    },
+    {
+      "epoch": 0.03748,
+      "grad_norm": 0.6956848502159119,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 3748
+    },
+    {
+      "epoch": 0.03749,
+      "grad_norm": 0.8748852014541626,
+      "learning_rate": 0.003,
+      "loss": 4.1089,
+      "step": 3749
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.8952407240867615,
+      "learning_rate": 0.003,
+      "loss": 4.1339,
+      "step": 3750
+    },
+    {
+      "epoch": 0.03751,
+      "grad_norm": 0.9176827669143677,
+      "learning_rate": 0.003,
+      "loss": 4.0869,
+      "step": 3751
+    },
+    {
+      "epoch": 0.03752,
+      "grad_norm": 1.0543770790100098,
+      "learning_rate": 0.003,
+      "loss": 4.1191,
+      "step": 3752
+    },
+    {
+      "epoch": 0.03753,
+      "grad_norm": 0.9552967548370361,
+      "learning_rate": 0.003,
+      "loss": 4.1269,
+      "step": 3753
+    },
+    {
+      "epoch": 0.03754,
+      "grad_norm": 0.7647460699081421,
+      "learning_rate": 0.003,
+      "loss": 4.1277,
+      "step": 3754
+    },
+    {
+      "epoch": 0.03755,
+      "grad_norm": 0.8259355425834656,
+      "learning_rate": 0.003,
+      "loss": 4.1264,
+      "step": 3755
+    },
+    {
+      "epoch": 0.03756,
+      "grad_norm": 0.8404808044433594,
+      "learning_rate": 0.003,
+      "loss": 4.0998,
+      "step": 3756
+    },
+    {
+      "epoch": 0.03757,
+      "grad_norm": 0.7295672297477722,
+      "learning_rate": 0.003,
+      "loss": 4.1265,
+      "step": 3757
+    },
+    {
+      "epoch": 0.03758,
+      "grad_norm": 0.6218863129615784,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 3758
+    },
+    {
+      "epoch": 0.03759,
+      "grad_norm": 0.5583391785621643,
+      "learning_rate": 0.003,
+      "loss": 4.1224,
+      "step": 3759
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.6009096503257751,
+      "learning_rate": 0.003,
+      "loss": 4.1056,
+      "step": 3760
+    },
+    {
+      "epoch": 0.03761,
+      "grad_norm": 0.7737796902656555,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 3761
+    },
+    {
+      "epoch": 0.03762,
+      "grad_norm": 0.8101951479911804,
+      "learning_rate": 0.003,
+      "loss": 4.1395,
+      "step": 3762
+    },
+    {
+      "epoch": 0.03763,
+      "grad_norm": 0.6714347004890442,
+      "learning_rate": 0.003,
+      "loss": 4.1198,
+      "step": 3763
+    },
+    {
+      "epoch": 0.03764,
+      "grad_norm": 0.4961341321468353,
+      "learning_rate": 0.003,
+      "loss": 4.125,
+      "step": 3764
+    },
+    {
+      "epoch": 0.03765,
+      "grad_norm": 0.5392913818359375,
+      "learning_rate": 0.003,
+      "loss": 4.1471,
+      "step": 3765
+    },
+    {
+      "epoch": 0.03766,
+      "grad_norm": 0.6497456431388855,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 3766
+    },
+    {
+      "epoch": 0.03767,
+      "grad_norm": 0.6586916446685791,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 3767
+    },
+    {
+      "epoch": 0.03768,
+      "grad_norm": 0.6380057334899902,
+      "learning_rate": 0.003,
+      "loss": 4.1067,
+      "step": 3768
+    },
+    {
+      "epoch": 0.03769,
+      "grad_norm": 0.6120327711105347,
+      "learning_rate": 0.003,
+      "loss": 4.0939,
+      "step": 3769
+    },
+    {
+      "epoch": 0.0377,
+      "grad_norm": 0.5535104274749756,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 3770
+    },
+    {
+      "epoch": 0.03771,
+      "grad_norm": 0.5369603633880615,
+      "learning_rate": 0.003,
+      "loss": 4.0896,
+      "step": 3771
+    },
+    {
+      "epoch": 0.03772,
+      "grad_norm": 0.5378307700157166,
+      "learning_rate": 0.003,
+      "loss": 4.1075,
+      "step": 3772
+    },
+    {
+      "epoch": 0.03773,
+      "grad_norm": 0.5350049734115601,
+      "learning_rate": 0.003,
+      "loss": 4.1151,
+      "step": 3773
+    },
+    {
+      "epoch": 0.03774,
+      "grad_norm": 0.48990651965141296,
+      "learning_rate": 0.003,
+      "loss": 4.0971,
+      "step": 3774
+    },
+    {
+      "epoch": 0.03775,
+      "grad_norm": 0.4350149929523468,
+      "learning_rate": 0.003,
+      "loss": 4.112,
+      "step": 3775
+    },
+    {
+      "epoch": 0.03776,
+      "grad_norm": 0.455160915851593,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 3776
+    },
+    {
+      "epoch": 0.03777,
+      "grad_norm": 0.4597632586956024,
+      "learning_rate": 0.003,
+      "loss": 4.1041,
+      "step": 3777
+    },
+    {
+      "epoch": 0.03778,
+      "grad_norm": 0.5477138757705688,
+      "learning_rate": 0.003,
+      "loss": 4.0821,
+      "step": 3778
+    },
+    {
+      "epoch": 0.03779,
+      "grad_norm": 0.7227920293807983,
+      "learning_rate": 0.003,
+      "loss": 4.1017,
+      "step": 3779
+    },
+    {
+      "epoch": 0.0378,
+      "grad_norm": 0.9108830690383911,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 3780
+    },
+    {
+      "epoch": 0.03781,
+      "grad_norm": 1.0170267820358276,
+      "learning_rate": 0.003,
+      "loss": 4.1423,
+      "step": 3781
+    },
+    {
+      "epoch": 0.03782,
+      "grad_norm": 0.9500049948692322,
+      "learning_rate": 0.003,
+      "loss": 4.1159,
+      "step": 3782
+    },
+    {
+      "epoch": 0.03783,
+      "grad_norm": 0.7215931415557861,
+      "learning_rate": 0.003,
+      "loss": 4.1121,
+      "step": 3783
+    },
+    {
+      "epoch": 0.03784,
+      "grad_norm": 0.6211499571800232,
+      "learning_rate": 0.003,
+      "loss": 4.1103,
+      "step": 3784
+    },
+    {
+      "epoch": 0.03785,
+      "grad_norm": 0.6421756148338318,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 3785
+    },
+    {
+      "epoch": 0.03786,
+      "grad_norm": 0.6241987943649292,
+      "learning_rate": 0.003,
+      "loss": 4.1119,
+      "step": 3786
+    },
+    {
+      "epoch": 0.03787,
+      "grad_norm": 0.6087479591369629,
+      "learning_rate": 0.003,
+      "loss": 4.0958,
+      "step": 3787
+    },
+    {
+      "epoch": 0.03788,
+      "grad_norm": 0.5939789414405823,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 3788
+    },
+    {
+      "epoch": 0.03789,
+      "grad_norm": 0.5391459465026855,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 3789
+    },
+    {
+      "epoch": 0.0379,
+      "grad_norm": 0.5861315131187439,
+      "learning_rate": 0.003,
+      "loss": 4.0863,
+      "step": 3790
+    },
+    {
+      "epoch": 0.03791,
+      "grad_norm": 0.6246805191040039,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 3791
+    },
+    {
+      "epoch": 0.03792,
+      "grad_norm": 0.7353853583335876,
+      "learning_rate": 0.003,
+      "loss": 4.1322,
+      "step": 3792
+    },
+    {
+      "epoch": 0.03793,
+      "grad_norm": 0.6765242218971252,
+      "learning_rate": 0.003,
+      "loss": 4.1235,
+      "step": 3793
+    },
+    {
+      "epoch": 0.03794,
+      "grad_norm": 0.7689555883407593,
+      "learning_rate": 0.003,
+      "loss": 4.1271,
+      "step": 3794
+    },
+    {
+      "epoch": 0.03795,
+      "grad_norm": 0.9137020707130432,
+      "learning_rate": 0.003,
+      "loss": 4.121,
+      "step": 3795
+    },
+    {
+      "epoch": 0.03796,
+      "grad_norm": 1.0011204481124878,
+      "learning_rate": 0.003,
+      "loss": 4.1083,
+      "step": 3796
+    },
+    {
+      "epoch": 0.03797,
+      "grad_norm": 1.0785057544708252,
+      "learning_rate": 0.003,
+      "loss": 4.1042,
+      "step": 3797
+    },
+    {
+      "epoch": 0.03798,
+      "grad_norm": 1.0564720630645752,
+      "learning_rate": 0.003,
+      "loss": 4.1262,
+      "step": 3798
+    },
+    {
+      "epoch": 0.03799,
+      "grad_norm": 0.9513949155807495,
+      "learning_rate": 0.003,
+      "loss": 4.1296,
+      "step": 3799
+    },
+    {
+      "epoch": 0.038,
+      "grad_norm": 0.7991365790367126,
+      "learning_rate": 0.003,
+      "loss": 4.136,
+      "step": 3800
+    },
+    {
+      "epoch": 0.03801,
+      "grad_norm": 0.942852795124054,
+      "learning_rate": 0.003,
+      "loss": 4.1481,
+      "step": 3801
+    },
+    {
+      "epoch": 0.03802,
+      "grad_norm": 0.9442994594573975,
+      "learning_rate": 0.003,
+      "loss": 4.1224,
+      "step": 3802
+    },
+    {
+      "epoch": 0.03803,
+      "grad_norm": 0.917535662651062,
+      "learning_rate": 0.003,
+      "loss": 4.1541,
+      "step": 3803
+    },
+    {
+      "epoch": 0.03804,
+      "grad_norm": 1.0844844579696655,
+      "learning_rate": 0.003,
+      "loss": 4.1317,
+      "step": 3804
+    },
+    {
+      "epoch": 0.03805,
+      "grad_norm": 0.8430477380752563,
+      "learning_rate": 0.003,
+      "loss": 4.1379,
+      "step": 3805
+    },
+    {
+      "epoch": 0.03806,
+      "grad_norm": 0.8205291032791138,
+      "learning_rate": 0.003,
+      "loss": 4.1397,
+      "step": 3806
+    },
+    {
+      "epoch": 0.03807,
+      "grad_norm": 0.8324175477027893,
+      "learning_rate": 0.003,
+      "loss": 4.1299,
+      "step": 3807
+    },
+    {
+      "epoch": 0.03808,
+      "grad_norm": 0.8151392936706543,
+      "learning_rate": 0.003,
+      "loss": 4.146,
+      "step": 3808
+    },
+    {
+      "epoch": 0.03809,
+      "grad_norm": 0.6972253918647766,
+      "learning_rate": 0.003,
+      "loss": 4.1248,
+      "step": 3809
+    },
+    {
+      "epoch": 0.0381,
+      "grad_norm": 0.7419064044952393,
+      "learning_rate": 0.003,
+      "loss": 4.109,
+      "step": 3810
+    },
+    {
+      "epoch": 0.03811,
+      "grad_norm": 0.72950679063797,
+      "learning_rate": 0.003,
+      "loss": 4.1397,
+      "step": 3811
+    },
+    {
+      "epoch": 0.03812,
+      "grad_norm": 0.7456114888191223,
+      "learning_rate": 0.003,
+      "loss": 4.1298,
+      "step": 3812
+    },
+    {
+      "epoch": 0.03813,
+      "grad_norm": 0.868884801864624,
+      "learning_rate": 0.003,
+      "loss": 4.1041,
+      "step": 3813
+    },
+    {
+      "epoch": 0.03814,
+      "grad_norm": 1.0820525884628296,
+      "learning_rate": 0.003,
+      "loss": 4.1402,
+      "step": 3814
+    },
+    {
+      "epoch": 0.03815,
+      "grad_norm": 1.0806629657745361,
+      "learning_rate": 0.003,
+      "loss": 4.1304,
+      "step": 3815
+    },
+    {
+      "epoch": 0.03816,
+      "grad_norm": 0.820555567741394,
+      "learning_rate": 0.003,
+      "loss": 4.126,
+      "step": 3816
+    },
+    {
+      "epoch": 0.03817,
+      "grad_norm": 0.7203388810157776,
+      "learning_rate": 0.003,
+      "loss": 4.1016,
+      "step": 3817
+    },
+    {
+      "epoch": 0.03818,
+      "grad_norm": 0.6214293241500854,
+      "learning_rate": 0.003,
+      "loss": 4.1479,
+      "step": 3818
+    },
+    {
+      "epoch": 0.03819,
+      "grad_norm": 0.5392916798591614,
+      "learning_rate": 0.003,
+      "loss": 4.0977,
+      "step": 3819
+    },
+    {
+      "epoch": 0.0382,
+      "grad_norm": 0.5882942080497742,
+      "learning_rate": 0.003,
+      "loss": 4.1451,
+      "step": 3820
+    },
+    {
+      "epoch": 0.03821,
+      "grad_norm": 0.6876145005226135,
+      "learning_rate": 0.003,
+      "loss": 4.0921,
+      "step": 3821
+    },
+    {
+      "epoch": 0.03822,
+      "grad_norm": 0.6543509364128113,
+      "learning_rate": 0.003,
+      "loss": 4.1133,
+      "step": 3822
+    },
+    {
+      "epoch": 0.03823,
+      "grad_norm": 0.6853277087211609,
+      "learning_rate": 0.003,
+      "loss": 4.1091,
+      "step": 3823
+    },
+    {
+      "epoch": 0.03824,
+      "grad_norm": 0.6686453819274902,
+      "learning_rate": 0.003,
+      "loss": 4.1061,
+      "step": 3824
+    },
+    {
+      "epoch": 0.03825,
+      "grad_norm": 0.5822768211364746,
+      "learning_rate": 0.003,
+      "loss": 4.1176,
+      "step": 3825
+    },
+    {
+      "epoch": 0.03826,
+      "grad_norm": 0.5187621712684631,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 3826
+    },
+    {
+      "epoch": 0.03827,
+      "grad_norm": 0.48330262303352356,
+      "learning_rate": 0.003,
+      "loss": 4.125,
+      "step": 3827
+    },
+    {
+      "epoch": 0.03828,
+      "grad_norm": 0.525285542011261,
+      "learning_rate": 0.003,
+      "loss": 4.0998,
+      "step": 3828
+    },
+    {
+      "epoch": 0.03829,
+      "grad_norm": 0.5435876846313477,
+      "learning_rate": 0.003,
+      "loss": 4.1066,
+      "step": 3829
+    },
+    {
+      "epoch": 0.0383,
+      "grad_norm": 0.5318546295166016,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 3830
+    },
+    {
+      "epoch": 0.03831,
+      "grad_norm": 0.5064510703086853,
+      "learning_rate": 0.003,
+      "loss": 4.0935,
+      "step": 3831
+    },
+    {
+      "epoch": 0.03832,
+      "grad_norm": 0.5135751962661743,
+      "learning_rate": 0.003,
+      "loss": 4.0901,
+      "step": 3832
+    },
+    {
+      "epoch": 0.03833,
+      "grad_norm": 0.5150834321975708,
+      "learning_rate": 0.003,
+      "loss": 4.091,
+      "step": 3833
+    },
+    {
+      "epoch": 0.03834,
+      "grad_norm": 0.5075490474700928,
+      "learning_rate": 0.003,
+      "loss": 4.1195,
+      "step": 3834
+    },
+    {
+      "epoch": 0.03835,
+      "grad_norm": 0.47964203357696533,
+      "learning_rate": 0.003,
+      "loss": 4.1071,
+      "step": 3835
+    },
+    {
+      "epoch": 0.03836,
+      "grad_norm": 0.5092160105705261,
+      "learning_rate": 0.003,
+      "loss": 4.1087,
+      "step": 3836
+    },
+    {
+      "epoch": 0.03837,
+      "grad_norm": 0.5252288579940796,
+      "learning_rate": 0.003,
+      "loss": 4.0969,
+      "step": 3837
+    },
+    {
+      "epoch": 0.03838,
+      "grad_norm": 0.4719623029232025,
+      "learning_rate": 0.003,
+      "loss": 4.0815,
+      "step": 3838
+    },
+    {
+      "epoch": 0.03839,
+      "grad_norm": 0.525349497795105,
+      "learning_rate": 0.003,
+      "loss": 4.09,
+      "step": 3839
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.5973276495933533,
+      "learning_rate": 0.003,
+      "loss": 4.0965,
+      "step": 3840
+    },
+    {
+      "epoch": 0.03841,
+      "grad_norm": 0.7389470338821411,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 3841
+    },
+    {
+      "epoch": 0.03842,
+      "grad_norm": 0.8890230059623718,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 3842
+    },
+    {
+      "epoch": 0.03843,
+      "grad_norm": 0.9227127432823181,
+      "learning_rate": 0.003,
+      "loss": 4.0975,
+      "step": 3843
+    },
+    {
+      "epoch": 0.03844,
+      "grad_norm": 0.8832306861877441,
+      "learning_rate": 0.003,
+      "loss": 4.1101,
+      "step": 3844
+    },
+    {
+      "epoch": 0.03845,
+      "grad_norm": 0.711075484752655,
+      "learning_rate": 0.003,
+      "loss": 4.104,
+      "step": 3845
+    },
+    {
+      "epoch": 0.03846,
+      "grad_norm": 0.5776762962341309,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 3846
+    },
+    {
+      "epoch": 0.03847,
+      "grad_norm": 0.6540825963020325,
+      "learning_rate": 0.003,
+      "loss": 4.1065,
+      "step": 3847
+    },
+    {
+      "epoch": 0.03848,
+      "grad_norm": 0.6692461967468262,
+      "learning_rate": 0.003,
+      "loss": 4.0898,
+      "step": 3848
+    },
+    {
+      "epoch": 0.03849,
+      "grad_norm": 0.6830011010169983,
+      "learning_rate": 0.003,
+      "loss": 4.1044,
+      "step": 3849
+    },
+    {
+      "epoch": 0.0385,
+      "grad_norm": 0.6109870076179504,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 3850
+    },
+    {
+      "epoch": 0.03851,
+      "grad_norm": 0.6136434078216553,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 3851
+    },
+    {
+      "epoch": 0.03852,
+      "grad_norm": 0.6604334712028503,
+      "learning_rate": 0.003,
+      "loss": 4.1124,
+      "step": 3852
+    },
+    {
+      "epoch": 0.03853,
+      "grad_norm": 0.59455806016922,
+      "learning_rate": 0.003,
+      "loss": 4.1074,
+      "step": 3853
+    },
+    {
+      "epoch": 0.03854,
+      "grad_norm": 0.575217604637146,
+      "learning_rate": 0.003,
+      "loss": 4.0982,
+      "step": 3854
+    },
+    {
+      "epoch": 0.03855,
+      "grad_norm": 0.6192313432693481,
+      "learning_rate": 0.003,
+      "loss": 4.1173,
+      "step": 3855
+    },
+    {
+      "epoch": 0.03856,
+      "grad_norm": 0.6025145649909973,
+      "learning_rate": 0.003,
+      "loss": 4.0898,
+      "step": 3856
+    },
+    {
+      "epoch": 0.03857,
+      "grad_norm": 0.6573590636253357,
+      "learning_rate": 0.003,
+      "loss": 4.0973,
+      "step": 3857
+    },
+    {
+      "epoch": 0.03858,
+      "grad_norm": 0.6135656833648682,
+      "learning_rate": 0.003,
+      "loss": 4.0911,
+      "step": 3858
+    },
+    {
+      "epoch": 0.03859,
+      "grad_norm": 0.6176097989082336,
+      "learning_rate": 0.003,
+      "loss": 4.1146,
+      "step": 3859
+    },
+    {
+      "epoch": 0.0386,
+      "grad_norm": 0.7532007098197937,
+      "learning_rate": 0.003,
+      "loss": 4.0797,
+      "step": 3860
+    },
+    {
+      "epoch": 0.03861,
+      "grad_norm": 0.8719074130058289,
+      "learning_rate": 0.003,
+      "loss": 4.1223,
+      "step": 3861
+    },
+    {
+      "epoch": 0.03862,
+      "grad_norm": 1.079362154006958,
+      "learning_rate": 0.003,
+      "loss": 4.1073,
+      "step": 3862
+    },
+    {
+      "epoch": 0.03863,
+      "grad_norm": 0.9231005311012268,
+      "learning_rate": 0.003,
+      "loss": 4.1031,
+      "step": 3863
+    },
+    {
+      "epoch": 0.03864,
+      "grad_norm": 0.776627242565155,
+      "learning_rate": 0.003,
+      "loss": 4.105,
+      "step": 3864
+    },
+    {
+      "epoch": 0.03865,
+      "grad_norm": 0.6355084180831909,
+      "learning_rate": 0.003,
+      "loss": 4.1132,
+      "step": 3865
+    },
+    {
+      "epoch": 0.03866,
+      "grad_norm": 0.6550437211990356,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 3866
+    },
+    {
+      "epoch": 0.03867,
+      "grad_norm": 0.8415738940238953,
+      "learning_rate": 0.003,
+      "loss": 4.0778,
+      "step": 3867
+    },
+    {
+      "epoch": 0.03868,
+      "grad_norm": 0.8901708126068115,
+      "learning_rate": 0.003,
+      "loss": 4.1199,
+      "step": 3868
+    },
+    {
+      "epoch": 0.03869,
+      "grad_norm": 0.697814404964447,
+      "learning_rate": 0.003,
+      "loss": 4.1022,
+      "step": 3869
+    },
+    {
+      "epoch": 0.0387,
+      "grad_norm": 0.6080979108810425,
+      "learning_rate": 0.003,
+      "loss": 4.1098,
+      "step": 3870
+    },
+    {
+      "epoch": 0.03871,
+      "grad_norm": 0.6336243748664856,
+      "learning_rate": 0.003,
+      "loss": 4.0855,
+      "step": 3871
+    },
+    {
+      "epoch": 0.03872,
+      "grad_norm": 0.6221011877059937,
+      "learning_rate": 0.003,
+      "loss": 4.0776,
+      "step": 3872
+    },
+    {
+      "epoch": 0.03873,
+      "grad_norm": 0.6594142317771912,
+      "learning_rate": 0.003,
+      "loss": 4.1194,
+      "step": 3873
+    },
+    {
+      "epoch": 0.03874,
+      "grad_norm": 0.5940245389938354,
+      "learning_rate": 0.003,
+      "loss": 4.0914,
+      "step": 3874
+    },
+    {
+      "epoch": 0.03875,
+      "grad_norm": 0.47639554738998413,
+      "learning_rate": 0.003,
+      "loss": 4.0897,
+      "step": 3875
+    },
+    {
+      "epoch": 0.03876,
+      "grad_norm": 0.5326244831085205,
+      "learning_rate": 0.003,
+      "loss": 4.123,
+      "step": 3876
+    },
+    {
+      "epoch": 0.03877,
+      "grad_norm": 0.5539395213127136,
+      "learning_rate": 0.003,
+      "loss": 4.1029,
+      "step": 3877
+    },
+    {
+      "epoch": 0.03878,
+      "grad_norm": 0.604924201965332,
+      "learning_rate": 0.003,
+      "loss": 4.1151,
+      "step": 3878
+    },
+    {
+      "epoch": 0.03879,
+      "grad_norm": 0.6767284870147705,
+      "learning_rate": 0.003,
+      "loss": 4.1189,
+      "step": 3879
+    },
+    {
+      "epoch": 0.0388,
+      "grad_norm": 0.8324589133262634,
+      "learning_rate": 0.003,
+      "loss": 4.0894,
+      "step": 3880
+    },
+    {
+      "epoch": 0.03881,
+      "grad_norm": 1.0423182249069214,
+      "learning_rate": 0.003,
+      "loss": 4.1216,
+      "step": 3881
+    },
+    {
+      "epoch": 0.03882,
+      "grad_norm": 1.036028504371643,
+      "learning_rate": 0.003,
+      "loss": 4.1333,
+      "step": 3882
+    },
+    {
+      "epoch": 0.03883,
+      "grad_norm": 0.8236280679702759,
+      "learning_rate": 0.003,
+      "loss": 4.0962,
+      "step": 3883
+    },
+    {
+      "epoch": 0.03884,
+      "grad_norm": 0.6207178831100464,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 3884
+    },
+    {
+      "epoch": 0.03885,
+      "grad_norm": 0.6633720397949219,
+      "learning_rate": 0.003,
+      "loss": 4.122,
+      "step": 3885
+    },
+    {
+      "epoch": 0.03886,
+      "grad_norm": 0.6237534880638123,
+      "learning_rate": 0.003,
+      "loss": 4.0972,
+      "step": 3886
+    },
+    {
+      "epoch": 0.03887,
+      "grad_norm": 0.49867168068885803,
+      "learning_rate": 0.003,
+      "loss": 4.1034,
+      "step": 3887
+    },
+    {
+      "epoch": 0.03888,
+      "grad_norm": 0.4786747395992279,
+      "learning_rate": 0.003,
+      "loss": 4.0787,
+      "step": 3888
+    },
+    {
+      "epoch": 0.03889,
+      "grad_norm": 0.5030584931373596,
+      "learning_rate": 0.003,
+      "loss": 4.0971,
+      "step": 3889
+    },
+    {
+      "epoch": 0.0389,
+      "grad_norm": 0.6045381426811218,
+      "learning_rate": 0.003,
+      "loss": 4.0885,
+      "step": 3890
+    },
+    {
+      "epoch": 0.03891,
+      "grad_norm": 0.6243774890899658,
+      "learning_rate": 0.003,
+      "loss": 4.0961,
+      "step": 3891
+    },
+    {
+      "epoch": 0.03892,
+      "grad_norm": 0.7076771855354309,
+      "learning_rate": 0.003,
+      "loss": 4.1158,
+      "step": 3892
+    },
+    {
+      "epoch": 0.03893,
+      "grad_norm": 0.6316627264022827,
+      "learning_rate": 0.003,
+      "loss": 4.0896,
+      "step": 3893
+    },
+    {
+      "epoch": 0.03894,
+      "grad_norm": 0.6711505055427551,
+      "learning_rate": 0.003,
+      "loss": 4.1012,
+      "step": 3894
+    },
+    {
+      "epoch": 0.03895,
+      "grad_norm": 0.7016406655311584,
+      "learning_rate": 0.003,
+      "loss": 4.0776,
+      "step": 3895
+    },
+    {
+      "epoch": 0.03896,
+      "grad_norm": 0.6848783493041992,
+      "learning_rate": 0.003,
+      "loss": 4.1034,
+      "step": 3896
+    },
+    {
+      "epoch": 0.03897,
+      "grad_norm": 0.701120913028717,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 3897
+    },
+    {
+      "epoch": 0.03898,
+      "grad_norm": 0.678917407989502,
+      "learning_rate": 0.003,
+      "loss": 4.1108,
+      "step": 3898
+    },
+    {
+      "epoch": 0.03899,
+      "grad_norm": 0.7172825336456299,
+      "learning_rate": 0.003,
+      "loss": 4.1019,
+      "step": 3899
+    },
+    {
+      "epoch": 0.039,
+      "grad_norm": 0.6747111082077026,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 3900
+    },
+    {
+      "epoch": 0.03901,
+      "grad_norm": 0.7526534795761108,
+      "learning_rate": 0.003,
+      "loss": 4.1173,
+      "step": 3901
+    },
+    {
+      "epoch": 0.03902,
+      "grad_norm": 0.7362321615219116,
+      "learning_rate": 0.003,
+      "loss": 4.0883,
+      "step": 3902
+    },
+    {
+      "epoch": 0.03903,
+      "grad_norm": 0.8664571642875671,
+      "learning_rate": 0.003,
+      "loss": 4.1302,
+      "step": 3903
+    },
+    {
+      "epoch": 0.03904,
+      "grad_norm": 1.0233482122421265,
+      "learning_rate": 0.003,
+      "loss": 4.1165,
+      "step": 3904
+    },
+    {
+      "epoch": 0.03905,
+      "grad_norm": 0.9382357001304626,
+      "learning_rate": 0.003,
+      "loss": 4.1356,
+      "step": 3905
+    },
+    {
+      "epoch": 0.03906,
+      "grad_norm": 0.687033474445343,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 3906
+    },
+    {
+      "epoch": 0.03907,
+      "grad_norm": 0.6406064033508301,
+      "learning_rate": 0.003,
+      "loss": 4.1115,
+      "step": 3907
+    },
+    {
+      "epoch": 0.03908,
+      "grad_norm": 0.6577279567718506,
+      "learning_rate": 0.003,
+      "loss": 4.0944,
+      "step": 3908
+    },
+    {
+      "epoch": 0.03909,
+      "grad_norm": 0.6069698333740234,
+      "learning_rate": 0.003,
+      "loss": 4.1004,
+      "step": 3909
+    },
+    {
+      "epoch": 0.0391,
+      "grad_norm": 0.625200629234314,
+      "learning_rate": 0.003,
+      "loss": 4.1285,
+      "step": 3910
+    },
+    {
+      "epoch": 0.03911,
+      "grad_norm": 0.6532104015350342,
+      "learning_rate": 0.003,
+      "loss": 4.1117,
+      "step": 3911
+    },
+    {
+      "epoch": 0.03912,
+      "grad_norm": 0.5893357992172241,
+      "learning_rate": 0.003,
+      "loss": 4.1101,
+      "step": 3912
+    },
+    {
+      "epoch": 0.03913,
+      "grad_norm": 0.6330502033233643,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 3913
+    },
+    {
+      "epoch": 0.03914,
+      "grad_norm": 0.7185488343238831,
+      "learning_rate": 0.003,
+      "loss": 4.1191,
+      "step": 3914
+    },
+    {
+      "epoch": 0.03915,
+      "grad_norm": 0.8594509363174438,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 3915
+    },
+    {
+      "epoch": 0.03916,
+      "grad_norm": 0.9027094841003418,
+      "learning_rate": 0.003,
+      "loss": 4.1242,
+      "step": 3916
+    },
+    {
+      "epoch": 0.03917,
+      "grad_norm": 0.9196935892105103,
+      "learning_rate": 0.003,
+      "loss": 4.1213,
+      "step": 3917
+    },
+    {
+      "epoch": 0.03918,
+      "grad_norm": 1.003941535949707,
+      "learning_rate": 0.003,
+      "loss": 4.1127,
+      "step": 3918
+    },
+    {
+      "epoch": 0.03919,
+      "grad_norm": 1.0010123252868652,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 3919
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.9059627056121826,
+      "learning_rate": 0.003,
+      "loss": 4.1073,
+      "step": 3920
+    },
+    {
+      "epoch": 0.03921,
+      "grad_norm": 0.95637047290802,
+      "learning_rate": 0.003,
+      "loss": 4.1083,
+      "step": 3921
+    },
+    {
+      "epoch": 0.03922,
+      "grad_norm": 0.8252183198928833,
+      "learning_rate": 0.003,
+      "loss": 4.1469,
+      "step": 3922
+    },
+    {
+      "epoch": 0.03923,
+      "grad_norm": 0.8936133980751038,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 3923
+    },
+    {
+      "epoch": 0.03924,
+      "grad_norm": 0.7836267352104187,
+      "learning_rate": 0.003,
+      "loss": 4.1067,
+      "step": 3924
+    },
+    {
+      "epoch": 0.03925,
+      "grad_norm": 0.7859375476837158,
+      "learning_rate": 0.003,
+      "loss": 4.1147,
+      "step": 3925
+    },
+    {
+      "epoch": 0.03926,
+      "grad_norm": 0.7374272346496582,
+      "learning_rate": 0.003,
+      "loss": 4.1025,
+      "step": 3926
+    },
+    {
+      "epoch": 0.03927,
+      "grad_norm": 0.8471856117248535,
+      "learning_rate": 0.003,
+      "loss": 4.1198,
+      "step": 3927
+    },
+    {
+      "epoch": 0.03928,
+      "grad_norm": 0.85481858253479,
+      "learning_rate": 0.003,
+      "loss": 4.1374,
+      "step": 3928
+    },
+    {
+      "epoch": 0.03929,
+      "grad_norm": 0.907612144947052,
+      "learning_rate": 0.003,
+      "loss": 4.1445,
+      "step": 3929
+    },
+    {
+      "epoch": 0.0393,
+      "grad_norm": 0.7699798345565796,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 3930
+    },
+    {
+      "epoch": 0.03931,
+      "grad_norm": 0.6045697331428528,
+      "learning_rate": 0.003,
+      "loss": 4.1255,
+      "step": 3931
+    },
+    {
+      "epoch": 0.03932,
+      "grad_norm": 0.6712193489074707,
+      "learning_rate": 0.003,
+      "loss": 4.1268,
+      "step": 3932
+    },
+    {
+      "epoch": 0.03933,
+      "grad_norm": 0.8346224427223206,
+      "learning_rate": 0.003,
+      "loss": 4.1207,
+      "step": 3933
+    },
+    {
+      "epoch": 0.03934,
+      "grad_norm": 0.8903274536132812,
+      "learning_rate": 0.003,
+      "loss": 4.12,
+      "step": 3934
+    },
+    {
+      "epoch": 0.03935,
+      "grad_norm": 0.7318519353866577,
+      "learning_rate": 0.003,
+      "loss": 4.1156,
+      "step": 3935
+    },
+    {
+      "epoch": 0.03936,
+      "grad_norm": 0.6603662967681885,
+      "learning_rate": 0.003,
+      "loss": 4.0714,
+      "step": 3936
+    },
+    {
+      "epoch": 0.03937,
+      "grad_norm": 0.6446847915649414,
+      "learning_rate": 0.003,
+      "loss": 4.1072,
+      "step": 3937
+    },
+    {
+      "epoch": 0.03938,
+      "grad_norm": 0.6806734204292297,
+      "learning_rate": 0.003,
+      "loss": 4.1099,
+      "step": 3938
+    },
+    {
+      "epoch": 0.03939,
+      "grad_norm": 0.5945215225219727,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 3939
+    },
+    {
+      "epoch": 0.0394,
+      "grad_norm": 0.5750917196273804,
+      "learning_rate": 0.003,
+      "loss": 4.116,
+      "step": 3940
+    },
+    {
+      "epoch": 0.03941,
+      "grad_norm": 0.5394409894943237,
+      "learning_rate": 0.003,
+      "loss": 4.1041,
+      "step": 3941
+    },
+    {
+      "epoch": 0.03942,
+      "grad_norm": 0.49402347207069397,
+      "learning_rate": 0.003,
+      "loss": 4.1168,
+      "step": 3942
+    },
+    {
+      "epoch": 0.03943,
+      "grad_norm": 0.39833274483680725,
+      "learning_rate": 0.003,
+      "loss": 4.1017,
+      "step": 3943
+    },
+    {
+      "epoch": 0.03944,
+      "grad_norm": 0.4659424126148224,
+      "learning_rate": 0.003,
+      "loss": 4.1164,
+      "step": 3944
+    },
+    {
+      "epoch": 0.03945,
+      "grad_norm": 0.4315028488636017,
+      "learning_rate": 0.003,
+      "loss": 4.0865,
+      "step": 3945
+    },
+    {
+      "epoch": 0.03946,
+      "grad_norm": 0.40673503279685974,
+      "learning_rate": 0.003,
+      "loss": 4.1113,
+      "step": 3946
+    },
+    {
+      "epoch": 0.03947,
+      "grad_norm": 0.38524332642555237,
+      "learning_rate": 0.003,
+      "loss": 4.1127,
+      "step": 3947
+    },
+    {
+      "epoch": 0.03948,
+      "grad_norm": 0.33258089423179626,
+      "learning_rate": 0.003,
+      "loss": 4.1268,
+      "step": 3948
+    },
+    {
+      "epoch": 0.03949,
+      "grad_norm": 0.38337442278862,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 3949
+    },
+    {
+      "epoch": 0.0395,
+      "grad_norm": 0.46301016211509705,
+      "learning_rate": 0.003,
+      "loss": 4.0994,
+      "step": 3950
+    },
+    {
+      "epoch": 0.03951,
+      "grad_norm": 0.7521165013313293,
+      "learning_rate": 0.003,
+      "loss": 4.1331,
+      "step": 3951
+    },
+    {
+      "epoch": 0.03952,
+      "grad_norm": 1.1936590671539307,
+      "learning_rate": 0.003,
+      "loss": 4.1283,
+      "step": 3952
+    },
+    {
+      "epoch": 0.03953,
+      "grad_norm": 0.9014281034469604,
+      "learning_rate": 0.003,
+      "loss": 4.1033,
+      "step": 3953
+    },
+    {
+      "epoch": 0.03954,
+      "grad_norm": 0.5299249887466431,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 3954
+    },
+    {
+      "epoch": 0.03955,
+      "grad_norm": 0.621684193611145,
+      "learning_rate": 0.003,
+      "loss": 4.0979,
+      "step": 3955
+    },
+    {
+      "epoch": 0.03956,
+      "grad_norm": 0.7184932827949524,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 3956
+    },
+    {
+      "epoch": 0.03957,
+      "grad_norm": 0.6237415075302124,
+      "learning_rate": 0.003,
+      "loss": 4.1048,
+      "step": 3957
+    },
+    {
+      "epoch": 0.03958,
+      "grad_norm": 0.5475152134895325,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 3958
+    },
+    {
+      "epoch": 0.03959,
+      "grad_norm": 0.5959770083427429,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 3959
+    },
+    {
+      "epoch": 0.0396,
+      "grad_norm": 0.6074774861335754,
+      "learning_rate": 0.003,
+      "loss": 4.1001,
+      "step": 3960
+    },
+    {
+      "epoch": 0.03961,
+      "grad_norm": 0.6236320734024048,
+      "learning_rate": 0.003,
+      "loss": 4.0973,
+      "step": 3961
+    },
+    {
+      "epoch": 0.03962,
+      "grad_norm": 0.6013439297676086,
+      "learning_rate": 0.003,
+      "loss": 4.1081,
+      "step": 3962
+    },
+    {
+      "epoch": 0.03963,
+      "grad_norm": 0.6263412237167358,
+      "learning_rate": 0.003,
+      "loss": 4.0929,
+      "step": 3963
+    },
+    {
+      "epoch": 0.03964,
+      "grad_norm": 0.656822919845581,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 3964
+    },
+    {
+      "epoch": 0.03965,
+      "grad_norm": 0.6711769700050354,
+      "learning_rate": 0.003,
+      "loss": 4.096,
+      "step": 3965
+    },
+    {
+      "epoch": 0.03966,
+      "grad_norm": 0.7558622360229492,
+      "learning_rate": 0.003,
+      "loss": 4.1394,
+      "step": 3966
+    },
+    {
+      "epoch": 0.03967,
+      "grad_norm": 0.8678721785545349,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 3967
+    },
+    {
+      "epoch": 0.03968,
+      "grad_norm": 0.9608453512191772,
+      "learning_rate": 0.003,
+      "loss": 4.1351,
+      "step": 3968
+    },
+    {
+      "epoch": 0.03969,
+      "grad_norm": 0.92333984375,
+      "learning_rate": 0.003,
+      "loss": 4.1176,
+      "step": 3969
+    },
+    {
+      "epoch": 0.0397,
+      "grad_norm": 1.0910831689834595,
+      "learning_rate": 0.003,
+      "loss": 4.1298,
+      "step": 3970
+    },
+    {
+      "epoch": 0.03971,
+      "grad_norm": 0.8948251008987427,
+      "learning_rate": 0.003,
+      "loss": 4.1057,
+      "step": 3971
+    },
+    {
+      "epoch": 0.03972,
+      "grad_norm": 0.8917779922485352,
+      "learning_rate": 0.003,
+      "loss": 4.1289,
+      "step": 3972
+    },
+    {
+      "epoch": 0.03973,
+      "grad_norm": 0.9352078437805176,
+      "learning_rate": 0.003,
+      "loss": 4.1347,
+      "step": 3973
+    },
+    {
+      "epoch": 0.03974,
+      "grad_norm": 0.8667543530464172,
+      "learning_rate": 0.003,
+      "loss": 4.1303,
+      "step": 3974
+    },
+    {
+      "epoch": 0.03975,
+      "grad_norm": 0.7579936385154724,
+      "learning_rate": 0.003,
+      "loss": 4.1131,
+      "step": 3975
+    },
+    {
+      "epoch": 0.03976,
+      "grad_norm": 0.7740127444267273,
+      "learning_rate": 0.003,
+      "loss": 4.1498,
+      "step": 3976
+    },
+    {
+      "epoch": 0.03977,
+      "grad_norm": 0.7652050256729126,
+      "learning_rate": 0.003,
+      "loss": 4.1352,
+      "step": 3977
+    },
+    {
+      "epoch": 0.03978,
+      "grad_norm": 0.7869767546653748,
+      "learning_rate": 0.003,
+      "loss": 4.1027,
+      "step": 3978
+    },
+    {
+      "epoch": 0.03979,
+      "grad_norm": 0.8766190409660339,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 3979
+    },
+    {
+      "epoch": 0.0398,
+      "grad_norm": 0.9912131428718567,
+      "learning_rate": 0.003,
+      "loss": 4.1235,
+      "step": 3980
+    },
+    {
+      "epoch": 0.03981,
+      "grad_norm": 1.0468101501464844,
+      "learning_rate": 0.003,
+      "loss": 4.1214,
+      "step": 3981
+    },
+    {
+      "epoch": 0.03982,
+      "grad_norm": 0.8397706747055054,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 3982
+    },
+    {
+      "epoch": 0.03983,
+      "grad_norm": 0.6968387365341187,
+      "learning_rate": 0.003,
+      "loss": 4.1115,
+      "step": 3983
+    },
+    {
+      "epoch": 0.03984,
+      "grad_norm": 0.6742430925369263,
+      "learning_rate": 0.003,
+      "loss": 4.1251,
+      "step": 3984
+    },
+    {
+      "epoch": 0.03985,
+      "grad_norm": 0.7048434019088745,
+      "learning_rate": 0.003,
+      "loss": 4.1183,
+      "step": 3985
+    },
+    {
+      "epoch": 0.03986,
+      "grad_norm": 0.7287977337837219,
+      "learning_rate": 0.003,
+      "loss": 4.1185,
+      "step": 3986
+    },
+    {
+      "epoch": 0.03987,
+      "grad_norm": 0.6307860016822815,
+      "learning_rate": 0.003,
+      "loss": 4.1175,
+      "step": 3987
+    },
+    {
+      "epoch": 0.03988,
+      "grad_norm": 0.6006014943122864,
+      "learning_rate": 0.003,
+      "loss": 4.1214,
+      "step": 3988
+    },
+    {
+      "epoch": 0.03989,
+      "grad_norm": 0.614997386932373,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 3989
+    },
+    {
+      "epoch": 0.0399,
+      "grad_norm": 0.6875625252723694,
+      "learning_rate": 0.003,
+      "loss": 4.082,
+      "step": 3990
+    },
+    {
+      "epoch": 0.03991,
+      "grad_norm": 0.7122326493263245,
+      "learning_rate": 0.003,
+      "loss": 4.1265,
+      "step": 3991
+    },
+    {
+      "epoch": 0.03992,
+      "grad_norm": 0.8080908060073853,
+      "learning_rate": 0.003,
+      "loss": 4.1106,
+      "step": 3992
+    },
+    {
+      "epoch": 0.03993,
+      "grad_norm": 0.7636401653289795,
+      "learning_rate": 0.003,
+      "loss": 4.1051,
+      "step": 3993
+    },
+    {
+      "epoch": 0.03994,
+      "grad_norm": 0.7076770663261414,
+      "learning_rate": 0.003,
+      "loss": 4.1036,
+      "step": 3994
+    },
+    {
+      "epoch": 0.03995,
+      "grad_norm": 0.5683047771453857,
+      "learning_rate": 0.003,
+      "loss": 4.0834,
+      "step": 3995
+    },
+    {
+      "epoch": 0.03996,
+      "grad_norm": 0.4724688231945038,
+      "learning_rate": 0.003,
+      "loss": 4.1167,
+      "step": 3996
+    },
+    {
+      "epoch": 0.03997,
+      "grad_norm": 0.5588562488555908,
+      "learning_rate": 0.003,
+      "loss": 4.116,
+      "step": 3997
+    },
+    {
+      "epoch": 0.03998,
+      "grad_norm": 0.5613643527030945,
+      "learning_rate": 0.003,
+      "loss": 4.117,
+      "step": 3998
+    },
+    {
+      "epoch": 0.03999,
+      "grad_norm": 0.5133464932441711,
+      "learning_rate": 0.003,
+      "loss": 4.1186,
+      "step": 3999
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.46894940733909607,
+      "learning_rate": 0.003,
+      "loss": 4.1184,
+      "step": 4000
+    },
+    {
+      "epoch": 0.04001,
+      "grad_norm": 0.4599146544933319,
+      "learning_rate": 0.003,
+      "loss": 4.1147,
+      "step": 4001
+    },
+    {
+      "epoch": 0.04002,
+      "grad_norm": 0.5162279605865479,
+      "learning_rate": 0.003,
+      "loss": 4.0894,
+      "step": 4002
+    },
+    {
+      "epoch": 0.04003,
+      "grad_norm": 0.5921787023544312,
+      "learning_rate": 0.003,
+      "loss": 4.1119,
+      "step": 4003
+    },
+    {
+      "epoch": 0.04004,
+      "grad_norm": 0.6687480807304382,
+      "learning_rate": 0.003,
+      "loss": 4.1184,
+      "step": 4004
+    },
+    {
+      "epoch": 0.04005,
+      "grad_norm": 0.6211435794830322,
+      "learning_rate": 0.003,
+      "loss": 4.1235,
+      "step": 4005
+    },
+    {
+      "epoch": 0.04006,
+      "grad_norm": 0.6189464330673218,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 4006
+    },
+    {
+      "epoch": 0.04007,
+      "grad_norm": 0.6097008585929871,
+      "learning_rate": 0.003,
+      "loss": 4.1087,
+      "step": 4007
+    },
+    {
+      "epoch": 0.04008,
+      "grad_norm": 0.6154138445854187,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 4008
+    },
+    {
+      "epoch": 0.04009,
+      "grad_norm": 0.7050077319145203,
+      "learning_rate": 0.003,
+      "loss": 4.0846,
+      "step": 4009
+    },
+    {
+      "epoch": 0.0401,
+      "grad_norm": 0.7118995189666748,
+      "learning_rate": 0.003,
+      "loss": 4.1047,
+      "step": 4010
+    },
+    {
+      "epoch": 0.04011,
+      "grad_norm": 0.7219801545143127,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 4011
+    },
+    {
+      "epoch": 0.04012,
+      "grad_norm": 0.7222893834114075,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 4012
+    },
+    {
+      "epoch": 0.04013,
+      "grad_norm": 0.7958313822746277,
+      "learning_rate": 0.003,
+      "loss": 4.107,
+      "step": 4013
+    },
+    {
+      "epoch": 0.04014,
+      "grad_norm": 0.6716787815093994,
+      "learning_rate": 0.003,
+      "loss": 4.0862,
+      "step": 4014
+    },
+    {
+      "epoch": 0.04015,
+      "grad_norm": 0.6670402884483337,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 4015
+    },
+    {
+      "epoch": 0.04016,
+      "grad_norm": 0.8088099956512451,
+      "learning_rate": 0.003,
+      "loss": 4.0907,
+      "step": 4016
+    },
+    {
+      "epoch": 0.04017,
+      "grad_norm": 0.9719444513320923,
+      "learning_rate": 0.003,
+      "loss": 4.1373,
+      "step": 4017
+    },
+    {
+      "epoch": 0.04018,
+      "grad_norm": 0.9042829871177673,
+      "learning_rate": 0.003,
+      "loss": 4.109,
+      "step": 4018
+    },
+    {
+      "epoch": 0.04019,
+      "grad_norm": 0.701447069644928,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 4019
+    },
+    {
+      "epoch": 0.0402,
+      "grad_norm": 0.6191059350967407,
+      "learning_rate": 0.003,
+      "loss": 4.1141,
+      "step": 4020
+    },
+    {
+      "epoch": 0.04021,
+      "grad_norm": 0.6724329590797424,
+      "learning_rate": 0.003,
+      "loss": 4.0897,
+      "step": 4021
+    },
+    {
+      "epoch": 0.04022,
+      "grad_norm": 0.6767911911010742,
+      "learning_rate": 0.003,
+      "loss": 4.0873,
+      "step": 4022
+    },
+    {
+      "epoch": 0.04023,
+      "grad_norm": 0.6602100133895874,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4023
+    },
+    {
+      "epoch": 0.04024,
+      "grad_norm": 0.6715804934501648,
+      "learning_rate": 0.003,
+      "loss": 4.0818,
+      "step": 4024
+    },
+    {
+      "epoch": 0.04025,
+      "grad_norm": 0.6610874533653259,
+      "learning_rate": 0.003,
+      "loss": 4.087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.04026,
+      "grad_norm": 0.7230667471885681,
+      "learning_rate": 0.003,
+      "loss": 4.1123,
+      "step": 4026
+    },
+    {
+      "epoch": 0.04027,
+      "grad_norm": 0.6509302258491516,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 4027
+    },
+    {
+      "epoch": 0.04028,
+      "grad_norm": 0.7460821866989136,
+      "learning_rate": 0.003,
+      "loss": 4.0966,
+      "step": 4028
+    },
+    {
+      "epoch": 0.04029,
+      "grad_norm": 0.760715901851654,
+      "learning_rate": 0.003,
+      "loss": 4.1252,
+      "step": 4029
+    },
+    {
+      "epoch": 0.0403,
+      "grad_norm": 0.7702258825302124,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 4030
+    },
+    {
+      "epoch": 0.04031,
+      "grad_norm": 0.7244075536727905,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 4031
+    },
+    {
+      "epoch": 0.04032,
+      "grad_norm": 0.7782586812973022,
+      "learning_rate": 0.003,
+      "loss": 4.1218,
+      "step": 4032
+    },
+    {
+      "epoch": 0.04033,
+      "grad_norm": 0.7341326475143433,
+      "learning_rate": 0.003,
+      "loss": 4.1199,
+      "step": 4033
+    },
+    {
+      "epoch": 0.04034,
+      "grad_norm": 0.7792084813117981,
+      "learning_rate": 0.003,
+      "loss": 4.0892,
+      "step": 4034
+    },
+    {
+      "epoch": 0.04035,
+      "grad_norm": 0.8536400198936462,
+      "learning_rate": 0.003,
+      "loss": 4.1163,
+      "step": 4035
+    },
+    {
+      "epoch": 0.04036,
+      "grad_norm": 0.7702471017837524,
+      "learning_rate": 0.003,
+      "loss": 4.1248,
+      "step": 4036
+    },
+    {
+      "epoch": 0.04037,
+      "grad_norm": 0.7094979882240295,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 4037
+    },
+    {
+      "epoch": 0.04038,
+      "grad_norm": 0.6657090187072754,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4038
+    },
+    {
+      "epoch": 0.04039,
+      "grad_norm": 0.8088093400001526,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 4039
+    },
+    {
+      "epoch": 0.0404,
+      "grad_norm": 0.879915177822113,
+      "learning_rate": 0.003,
+      "loss": 4.0966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.04041,
+      "grad_norm": 0.9084352850914001,
+      "learning_rate": 0.003,
+      "loss": 4.098,
+      "step": 4041
+    },
+    {
+      "epoch": 0.04042,
+      "grad_norm": 0.7722291946411133,
+      "learning_rate": 0.003,
+      "loss": 4.133,
+      "step": 4042
+    },
+    {
+      "epoch": 0.04043,
+      "grad_norm": 0.71905916929245,
+      "learning_rate": 0.003,
+      "loss": 4.1207,
+      "step": 4043
+    },
+    {
+      "epoch": 0.04044,
+      "grad_norm": 0.7158097624778748,
+      "learning_rate": 0.003,
+      "loss": 4.113,
+      "step": 4044
+    },
+    {
+      "epoch": 0.04045,
+      "grad_norm": 0.6895061731338501,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 4045
+    },
+    {
+      "epoch": 0.04046,
+      "grad_norm": 0.681125283241272,
+      "learning_rate": 0.003,
+      "loss": 4.1085,
+      "step": 4046
+    },
+    {
+      "epoch": 0.04047,
+      "grad_norm": 0.7285493016242981,
+      "learning_rate": 0.003,
+      "loss": 4.1004,
+      "step": 4047
+    },
+    {
+      "epoch": 0.04048,
+      "grad_norm": 0.7340379357337952,
+      "learning_rate": 0.003,
+      "loss": 4.0923,
+      "step": 4048
+    },
+    {
+      "epoch": 0.04049,
+      "grad_norm": 0.7420255541801453,
+      "learning_rate": 0.003,
+      "loss": 4.1345,
+      "step": 4049
+    },
+    {
+      "epoch": 0.0405,
+      "grad_norm": 0.7484999299049377,
+      "learning_rate": 0.003,
+      "loss": 4.1319,
+      "step": 4050
+    },
+    {
+      "epoch": 0.04051,
+      "grad_norm": 0.6874270439147949,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 4051
+    },
+    {
+      "epoch": 0.04052,
+      "grad_norm": 0.6694976091384888,
+      "learning_rate": 0.003,
+      "loss": 4.1155,
+      "step": 4052
+    },
+    {
+      "epoch": 0.04053,
+      "grad_norm": 0.6139923930168152,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 4053
+    },
+    {
+      "epoch": 0.04054,
+      "grad_norm": 0.5986779928207397,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 4054
+    },
+    {
+      "epoch": 0.04055,
+      "grad_norm": 0.5122130513191223,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 4055
+    },
+    {
+      "epoch": 0.04056,
+      "grad_norm": 0.5276870727539062,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 4056
+    },
+    {
+      "epoch": 0.04057,
+      "grad_norm": 0.5313277244567871,
+      "learning_rate": 0.003,
+      "loss": 4.0979,
+      "step": 4057
+    },
+    {
+      "epoch": 0.04058,
+      "grad_norm": 0.7404656410217285,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 4058
+    },
+    {
+      "epoch": 0.04059,
+      "grad_norm": 0.905890166759491,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 4059
+    },
+    {
+      "epoch": 0.0406,
+      "grad_norm": 0.9699053168296814,
+      "learning_rate": 0.003,
+      "loss": 4.1289,
+      "step": 4060
+    },
+    {
+      "epoch": 0.04061,
+      "grad_norm": 0.7840712070465088,
+      "learning_rate": 0.003,
+      "loss": 4.107,
+      "step": 4061
+    },
+    {
+      "epoch": 0.04062,
+      "grad_norm": 0.6911324262619019,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 4062
+    },
+    {
+      "epoch": 0.04063,
+      "grad_norm": 0.664695143699646,
+      "learning_rate": 0.003,
+      "loss": 4.0923,
+      "step": 4063
+    },
+    {
+      "epoch": 0.04064,
+      "grad_norm": 0.7076813578605652,
+      "learning_rate": 0.003,
+      "loss": 4.108,
+      "step": 4064
+    },
+    {
+      "epoch": 0.04065,
+      "grad_norm": 0.6082136034965515,
+      "learning_rate": 0.003,
+      "loss": 4.1387,
+      "step": 4065
+    },
+    {
+      "epoch": 0.04066,
+      "grad_norm": 0.6230340003967285,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 4066
+    },
+    {
+      "epoch": 0.04067,
+      "grad_norm": 0.5546422004699707,
+      "learning_rate": 0.003,
+      "loss": 4.1013,
+      "step": 4067
+    },
+    {
+      "epoch": 0.04068,
+      "grad_norm": 0.5739226937294006,
+      "learning_rate": 0.003,
+      "loss": 4.1097,
+      "step": 4068
+    },
+    {
+      "epoch": 0.04069,
+      "grad_norm": 0.5410630702972412,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 4069
+    },
+    {
+      "epoch": 0.0407,
+      "grad_norm": 0.5467224717140198,
+      "learning_rate": 0.003,
+      "loss": 4.1009,
+      "step": 4070
+    },
+    {
+      "epoch": 0.04071,
+      "grad_norm": 0.6489109992980957,
+      "learning_rate": 0.003,
+      "loss": 4.0694,
+      "step": 4071
+    },
+    {
+      "epoch": 0.04072,
+      "grad_norm": 0.6744621992111206,
+      "learning_rate": 0.003,
+      "loss": 4.0941,
+      "step": 4072
+    },
+    {
+      "epoch": 0.04073,
+      "grad_norm": 0.60545814037323,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 4073
+    },
+    {
+      "epoch": 0.04074,
+      "grad_norm": 0.6391845941543579,
+      "learning_rate": 0.003,
+      "loss": 4.1066,
+      "step": 4074
+    },
+    {
+      "epoch": 0.04075,
+      "grad_norm": 0.7024280428886414,
+      "learning_rate": 0.003,
+      "loss": 4.104,
+      "step": 4075
+    },
+    {
+      "epoch": 0.04076,
+      "grad_norm": 0.7279633283615112,
+      "learning_rate": 0.003,
+      "loss": 4.1025,
+      "step": 4076
+    },
+    {
+      "epoch": 0.04077,
+      "grad_norm": 0.9424205422401428,
+      "learning_rate": 0.003,
+      "loss": 4.1223,
+      "step": 4077
+    },
+    {
+      "epoch": 0.04078,
+      "grad_norm": 1.1304396390914917,
+      "learning_rate": 0.003,
+      "loss": 4.1063,
+      "step": 4078
+    },
+    {
+      "epoch": 0.04079,
+      "grad_norm": 0.8722311854362488,
+      "learning_rate": 0.003,
+      "loss": 4.1045,
+      "step": 4079
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.760799765586853,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 4080
+    },
+    {
+      "epoch": 0.04081,
+      "grad_norm": 0.7816235423088074,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 4081
+    },
+    {
+      "epoch": 0.04082,
+      "grad_norm": 0.7581563591957092,
+      "learning_rate": 0.003,
+      "loss": 4.113,
+      "step": 4082
+    },
+    {
+      "epoch": 0.04083,
+      "grad_norm": 0.7936528921127319,
+      "learning_rate": 0.003,
+      "loss": 4.0904,
+      "step": 4083
+    },
+    {
+      "epoch": 0.04084,
+      "grad_norm": 0.772882342338562,
+      "learning_rate": 0.003,
+      "loss": 4.1315,
+      "step": 4084
+    },
+    {
+      "epoch": 0.04085,
+      "grad_norm": 0.7227948904037476,
+      "learning_rate": 0.003,
+      "loss": 4.1071,
+      "step": 4085
+    },
+    {
+      "epoch": 0.04086,
+      "grad_norm": 0.6548799276351929,
+      "learning_rate": 0.003,
+      "loss": 4.1279,
+      "step": 4086
+    },
+    {
+      "epoch": 0.04087,
+      "grad_norm": 0.5630327463150024,
+      "learning_rate": 0.003,
+      "loss": 4.1065,
+      "step": 4087
+    },
+    {
+      "epoch": 0.04088,
+      "grad_norm": 0.527557373046875,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 4088
+    },
+    {
+      "epoch": 0.04089,
+      "grad_norm": 0.46402889490127563,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 4089
+    },
+    {
+      "epoch": 0.0409,
+      "grad_norm": 0.44137027859687805,
+      "learning_rate": 0.003,
+      "loss": 4.0969,
+      "step": 4090
+    },
+    {
+      "epoch": 0.04091,
+      "grad_norm": 0.4541914165019989,
+      "learning_rate": 0.003,
+      "loss": 4.1083,
+      "step": 4091
+    },
+    {
+      "epoch": 0.04092,
+      "grad_norm": 0.4600992500782013,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 4092
+    },
+    {
+      "epoch": 0.04093,
+      "grad_norm": 0.4791017472743988,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 4093
+    },
+    {
+      "epoch": 0.04094,
+      "grad_norm": 0.527622640132904,
+      "learning_rate": 0.003,
+      "loss": 4.083,
+      "step": 4094
+    },
+    {
+      "epoch": 0.04095,
+      "grad_norm": 0.5713629126548767,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 4095
+    },
+    {
+      "epoch": 0.04096,
+      "grad_norm": 0.6901856064796448,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 4096
+    },
+    {
+      "epoch": 0.04097,
+      "grad_norm": 0.8031033873558044,
+      "learning_rate": 0.003,
+      "loss": 4.0895,
+      "step": 4097
+    },
+    {
+      "epoch": 0.04098,
+      "grad_norm": 0.9514955878257751,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 4098
+    },
+    {
+      "epoch": 0.04099,
+      "grad_norm": 1.1810649633407593,
+      "learning_rate": 0.003,
+      "loss": 4.1333,
+      "step": 4099
+    },
+    {
+      "epoch": 0.041,
+      "grad_norm": 0.7748354077339172,
+      "learning_rate": 0.003,
+      "loss": 4.0811,
+      "step": 4100
+    },
+    {
+      "epoch": 0.04101,
+      "grad_norm": 0.6190698146820068,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 4101
+    },
+    {
+      "epoch": 0.04102,
+      "grad_norm": 0.7298320531845093,
+      "learning_rate": 0.003,
+      "loss": 4.0954,
+      "step": 4102
+    },
+    {
+      "epoch": 0.04103,
+      "grad_norm": 0.8667885661125183,
+      "learning_rate": 0.003,
+      "loss": 4.1242,
+      "step": 4103
+    },
+    {
+      "epoch": 0.04104,
+      "grad_norm": 0.8904606699943542,
+      "learning_rate": 0.003,
+      "loss": 4.109,
+      "step": 4104
+    },
+    {
+      "epoch": 0.04105,
+      "grad_norm": 0.8015432953834534,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 4105
+    },
+    {
+      "epoch": 0.04106,
+      "grad_norm": 0.7645514011383057,
+      "learning_rate": 0.003,
+      "loss": 4.1128,
+      "step": 4106
+    },
+    {
+      "epoch": 0.04107,
+      "grad_norm": 0.7201929092407227,
+      "learning_rate": 0.003,
+      "loss": 4.1034,
+      "step": 4107
+    },
+    {
+      "epoch": 0.04108,
+      "grad_norm": 0.696420431137085,
+      "learning_rate": 0.003,
+      "loss": 4.0919,
+      "step": 4108
+    },
+    {
+      "epoch": 0.04109,
+      "grad_norm": 0.7951487302780151,
+      "learning_rate": 0.003,
+      "loss": 4.086,
+      "step": 4109
+    },
+    {
+      "epoch": 0.0411,
+      "grad_norm": 0.7593970894813538,
+      "learning_rate": 0.003,
+      "loss": 4.0985,
+      "step": 4110
+    },
+    {
+      "epoch": 0.04111,
+      "grad_norm": 0.6601749658584595,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 4111
+    },
+    {
+      "epoch": 0.04112,
+      "grad_norm": 0.5873904228210449,
+      "learning_rate": 0.003,
+      "loss": 4.0962,
+      "step": 4112
+    },
+    {
+      "epoch": 0.04113,
+      "grad_norm": 0.6075222492218018,
+      "learning_rate": 0.003,
+      "loss": 4.1353,
+      "step": 4113
+    },
+    {
+      "epoch": 0.04114,
+      "grad_norm": 0.5487315654754639,
+      "learning_rate": 0.003,
+      "loss": 4.0762,
+      "step": 4114
+    },
+    {
+      "epoch": 0.04115,
+      "grad_norm": 0.5993032455444336,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 4115
+    },
+    {
+      "epoch": 0.04116,
+      "grad_norm": 0.5999573469161987,
+      "learning_rate": 0.003,
+      "loss": 4.1028,
+      "step": 4116
+    },
+    {
+      "epoch": 0.04117,
+      "grad_norm": 0.7142841815948486,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 4117
+    },
+    {
+      "epoch": 0.04118,
+      "grad_norm": 0.8221079707145691,
+      "learning_rate": 0.003,
+      "loss": 4.1275,
+      "step": 4118
+    },
+    {
+      "epoch": 0.04119,
+      "grad_norm": 0.8343155384063721,
+      "learning_rate": 0.003,
+      "loss": 4.1067,
+      "step": 4119
+    },
+    {
+      "epoch": 0.0412,
+      "grad_norm": 0.7892839312553406,
+      "learning_rate": 0.003,
+      "loss": 4.0994,
+      "step": 4120
+    },
+    {
+      "epoch": 0.04121,
+      "grad_norm": 0.7228533625602722,
+      "learning_rate": 0.003,
+      "loss": 4.1201,
+      "step": 4121
+    },
+    {
+      "epoch": 0.04122,
+      "grad_norm": 0.6834357976913452,
+      "learning_rate": 0.003,
+      "loss": 4.0926,
+      "step": 4122
+    },
+    {
+      "epoch": 0.04123,
+      "grad_norm": 0.6968315243721008,
+      "learning_rate": 0.003,
+      "loss": 4.1044,
+      "step": 4123
+    },
+    {
+      "epoch": 0.04124,
+      "grad_norm": 0.5444803833961487,
+      "learning_rate": 0.003,
+      "loss": 4.0792,
+      "step": 4124
+    },
+    {
+      "epoch": 0.04125,
+      "grad_norm": 0.6157690286636353,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 4125
+    },
+    {
+      "epoch": 0.04126,
+      "grad_norm": 0.5248627066612244,
+      "learning_rate": 0.003,
+      "loss": 4.0927,
+      "step": 4126
+    },
+    {
+      "epoch": 0.04127,
+      "grad_norm": 0.606207549571991,
+      "learning_rate": 0.003,
+      "loss": 4.0837,
+      "step": 4127
+    },
+    {
+      "epoch": 0.04128,
+      "grad_norm": 0.7370710968971252,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 4128
+    },
+    {
+      "epoch": 0.04129,
+      "grad_norm": 0.8737143278121948,
+      "learning_rate": 0.003,
+      "loss": 4.1193,
+      "step": 4129
+    },
+    {
+      "epoch": 0.0413,
+      "grad_norm": 0.8792223334312439,
+      "learning_rate": 0.003,
+      "loss": 4.0871,
+      "step": 4130
+    },
+    {
+      "epoch": 0.04131,
+      "grad_norm": 0.6732345819473267,
+      "learning_rate": 0.003,
+      "loss": 4.107,
+      "step": 4131
+    },
+    {
+      "epoch": 0.04132,
+      "grad_norm": 0.6241254210472107,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 4132
+    },
+    {
+      "epoch": 0.04133,
+      "grad_norm": 0.6735711693763733,
+      "learning_rate": 0.003,
+      "loss": 4.1148,
+      "step": 4133
+    },
+    {
+      "epoch": 0.04134,
+      "grad_norm": 0.6963285803794861,
+      "learning_rate": 0.003,
+      "loss": 4.1039,
+      "step": 4134
+    },
+    {
+      "epoch": 0.04135,
+      "grad_norm": 0.8845453262329102,
+      "learning_rate": 0.003,
+      "loss": 4.0929,
+      "step": 4135
+    },
+    {
+      "epoch": 0.04136,
+      "grad_norm": 0.9322511553764343,
+      "learning_rate": 0.003,
+      "loss": 4.0952,
+      "step": 4136
+    },
+    {
+      "epoch": 0.04137,
+      "grad_norm": 1.0508934259414673,
+      "learning_rate": 0.003,
+      "loss": 4.1499,
+      "step": 4137
+    },
+    {
+      "epoch": 0.04138,
+      "grad_norm": 1.0728346109390259,
+      "learning_rate": 0.003,
+      "loss": 4.1354,
+      "step": 4138
+    },
+    {
+      "epoch": 0.04139,
+      "grad_norm": 0.9741144180297852,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 4139
+    },
+    {
+      "epoch": 0.0414,
+      "grad_norm": 0.8693930506706238,
+      "learning_rate": 0.003,
+      "loss": 4.1331,
+      "step": 4140
+    },
+    {
+      "epoch": 0.04141,
+      "grad_norm": 1.100584626197815,
+      "learning_rate": 0.003,
+      "loss": 4.121,
+      "step": 4141
+    },
+    {
+      "epoch": 0.04142,
+      "grad_norm": 0.9084172248840332,
+      "learning_rate": 0.003,
+      "loss": 4.1395,
+      "step": 4142
+    },
+    {
+      "epoch": 0.04143,
+      "grad_norm": 0.9111177921295166,
+      "learning_rate": 0.003,
+      "loss": 4.1253,
+      "step": 4143
+    },
+    {
+      "epoch": 0.04144,
+      "grad_norm": 1.0545722246170044,
+      "learning_rate": 0.003,
+      "loss": 4.1361,
+      "step": 4144
+    },
+    {
+      "epoch": 0.04145,
+      "grad_norm": 1.0651071071624756,
+      "learning_rate": 0.003,
+      "loss": 4.1248,
+      "step": 4145
+    },
+    {
+      "epoch": 0.04146,
+      "grad_norm": 0.8738028407096863,
+      "learning_rate": 0.003,
+      "loss": 4.1388,
+      "step": 4146
+    },
+    {
+      "epoch": 0.04147,
+      "grad_norm": 0.8165324926376343,
+      "learning_rate": 0.003,
+      "loss": 4.1437,
+      "step": 4147
+    },
+    {
+      "epoch": 0.04148,
+      "grad_norm": 0.7123237252235413,
+      "learning_rate": 0.003,
+      "loss": 4.1,
+      "step": 4148
+    },
+    {
+      "epoch": 0.04149,
+      "grad_norm": 0.6672841310501099,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 4149
+    },
+    {
+      "epoch": 0.0415,
+      "grad_norm": 0.6620326042175293,
+      "learning_rate": 0.003,
+      "loss": 4.1102,
+      "step": 4150
+    },
+    {
+      "epoch": 0.04151,
+      "grad_norm": 0.6698993444442749,
+      "learning_rate": 0.003,
+      "loss": 4.1249,
+      "step": 4151
+    },
+    {
+      "epoch": 0.04152,
+      "grad_norm": 0.6350617408752441,
+      "learning_rate": 0.003,
+      "loss": 4.123,
+      "step": 4152
+    },
+    {
+      "epoch": 0.04153,
+      "grad_norm": 0.47288987040519714,
+      "learning_rate": 0.003,
+      "loss": 4.0828,
+      "step": 4153
+    },
+    {
+      "epoch": 0.04154,
+      "grad_norm": 0.4522964656352997,
+      "learning_rate": 0.003,
+      "loss": 4.1151,
+      "step": 4154
+    },
+    {
+      "epoch": 0.04155,
+      "grad_norm": 0.426760196685791,
+      "learning_rate": 0.003,
+      "loss": 4.0877,
+      "step": 4155
+    },
+    {
+      "epoch": 0.04156,
+      "grad_norm": 0.43916746973991394,
+      "learning_rate": 0.003,
+      "loss": 4.1082,
+      "step": 4156
+    },
+    {
+      "epoch": 0.04157,
+      "grad_norm": 0.47124361991882324,
+      "learning_rate": 0.003,
+      "loss": 4.121,
+      "step": 4157
+    },
+    {
+      "epoch": 0.04158,
+      "grad_norm": 0.5592663288116455,
+      "learning_rate": 0.003,
+      "loss": 4.1227,
+      "step": 4158
+    },
+    {
+      "epoch": 0.04159,
+      "grad_norm": 0.747325599193573,
+      "learning_rate": 0.003,
+      "loss": 4.0958,
+      "step": 4159
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 1.02601957321167,
+      "learning_rate": 0.003,
+      "loss": 4.1178,
+      "step": 4160
+    },
+    {
+      "epoch": 0.04161,
+      "grad_norm": 1.0634429454803467,
+      "learning_rate": 0.003,
+      "loss": 4.1286,
+      "step": 4161
+    },
+    {
+      "epoch": 0.04162,
+      "grad_norm": 0.5860819220542908,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 4162
+    },
+    {
+      "epoch": 0.04163,
+      "grad_norm": 0.6616426706314087,
+      "learning_rate": 0.003,
+      "loss": 4.105,
+      "step": 4163
+    },
+    {
+      "epoch": 0.04164,
+      "grad_norm": 0.6689631938934326,
+      "learning_rate": 0.003,
+      "loss": 4.1215,
+      "step": 4164
+    },
+    {
+      "epoch": 0.04165,
+      "grad_norm": 0.5675843954086304,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 4165
+    },
+    {
+      "epoch": 0.04166,
+      "grad_norm": 0.4940520226955414,
+      "learning_rate": 0.003,
+      "loss": 4.0834,
+      "step": 4166
+    },
+    {
+      "epoch": 0.04167,
+      "grad_norm": 0.5212199091911316,
+      "learning_rate": 0.003,
+      "loss": 4.1052,
+      "step": 4167
+    },
+    {
+      "epoch": 0.04168,
+      "grad_norm": 0.52357017993927,
+      "learning_rate": 0.003,
+      "loss": 4.1065,
+      "step": 4168
+    },
+    {
+      "epoch": 0.04169,
+      "grad_norm": 0.43375542759895325,
+      "learning_rate": 0.003,
+      "loss": 4.1089,
+      "step": 4169
+    },
+    {
+      "epoch": 0.0417,
+      "grad_norm": 0.44883838295936584,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 4170
+    },
+    {
+      "epoch": 0.04171,
+      "grad_norm": 0.4587332606315613,
+      "learning_rate": 0.003,
+      "loss": 4.0923,
+      "step": 4171
+    },
+    {
+      "epoch": 0.04172,
+      "grad_norm": 0.5260483026504517,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 4172
+    },
+    {
+      "epoch": 0.04173,
+      "grad_norm": 0.7267121076583862,
+      "learning_rate": 0.003,
+      "loss": 4.1096,
+      "step": 4173
+    },
+    {
+      "epoch": 0.04174,
+      "grad_norm": 0.8692538738250732,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 4174
+    },
+    {
+      "epoch": 0.04175,
+      "grad_norm": 0.8145063519477844,
+      "learning_rate": 0.003,
+      "loss": 4.1339,
+      "step": 4175
+    },
+    {
+      "epoch": 0.04176,
+      "grad_norm": 0.7225742340087891,
+      "learning_rate": 0.003,
+      "loss": 4.0897,
+      "step": 4176
+    },
+    {
+      "epoch": 0.04177,
+      "grad_norm": 0.6365543007850647,
+      "learning_rate": 0.003,
+      "loss": 4.0705,
+      "step": 4177
+    },
+    {
+      "epoch": 0.04178,
+      "grad_norm": 0.6694263815879822,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 4178
+    },
+    {
+      "epoch": 0.04179,
+      "grad_norm": 0.7857269644737244,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 4179
+    },
+    {
+      "epoch": 0.0418,
+      "grad_norm": 0.8134697675704956,
+      "learning_rate": 0.003,
+      "loss": 4.0975,
+      "step": 4180
+    },
+    {
+      "epoch": 0.04181,
+      "grad_norm": 0.687165379524231,
+      "learning_rate": 0.003,
+      "loss": 4.0979,
+      "step": 4181
+    },
+    {
+      "epoch": 0.04182,
+      "grad_norm": 0.6623665690422058,
+      "learning_rate": 0.003,
+      "loss": 4.0791,
+      "step": 4182
+    },
+    {
+      "epoch": 0.04183,
+      "grad_norm": 0.659334659576416,
+      "learning_rate": 0.003,
+      "loss": 4.0857,
+      "step": 4183
+    },
+    {
+      "epoch": 0.04184,
+      "grad_norm": 0.6392843723297119,
+      "learning_rate": 0.003,
+      "loss": 4.1403,
+      "step": 4184
+    },
+    {
+      "epoch": 0.04185,
+      "grad_norm": 0.5640465617179871,
+      "learning_rate": 0.003,
+      "loss": 4.1165,
+      "step": 4185
+    },
+    {
+      "epoch": 0.04186,
+      "grad_norm": 0.5710539221763611,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 4186
+    },
+    {
+      "epoch": 0.04187,
+      "grad_norm": 0.4746825695037842,
+      "learning_rate": 0.003,
+      "loss": 4.109,
+      "step": 4187
+    },
+    {
+      "epoch": 0.04188,
+      "grad_norm": 0.4887617826461792,
+      "learning_rate": 0.003,
+      "loss": 4.0725,
+      "step": 4188
+    },
+    {
+      "epoch": 0.04189,
+      "grad_norm": 0.40521180629730225,
+      "learning_rate": 0.003,
+      "loss": 4.0863,
+      "step": 4189
+    },
+    {
+      "epoch": 0.0419,
+      "grad_norm": 0.47049522399902344,
+      "learning_rate": 0.003,
+      "loss": 4.0839,
+      "step": 4190
+    },
+    {
+      "epoch": 0.04191,
+      "grad_norm": 0.5449577569961548,
+      "learning_rate": 0.003,
+      "loss": 4.0892,
+      "step": 4191
+    },
+    {
+      "epoch": 0.04192,
+      "grad_norm": 0.6885932087898254,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 4192
+    },
+    {
+      "epoch": 0.04193,
+      "grad_norm": 0.9400091171264648,
+      "learning_rate": 0.003,
+      "loss": 4.0641,
+      "step": 4193
+    },
+    {
+      "epoch": 0.04194,
+      "grad_norm": 1.1772154569625854,
+      "learning_rate": 0.003,
+      "loss": 4.0711,
+      "step": 4194
+    },
+    {
+      "epoch": 0.04195,
+      "grad_norm": 0.7265138030052185,
+      "learning_rate": 0.003,
+      "loss": 4.1131,
+      "step": 4195
+    },
+    {
+      "epoch": 0.04196,
+      "grad_norm": 0.702653169631958,
+      "learning_rate": 0.003,
+      "loss": 4.0712,
+      "step": 4196
+    },
+    {
+      "epoch": 0.04197,
+      "grad_norm": 0.8408281803131104,
+      "learning_rate": 0.003,
+      "loss": 4.0896,
+      "step": 4197
+    },
+    {
+      "epoch": 0.04198,
+      "grad_norm": 0.7780887484550476,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 4198
+    },
+    {
+      "epoch": 0.04199,
+      "grad_norm": 0.7158482074737549,
+      "learning_rate": 0.003,
+      "loss": 4.1183,
+      "step": 4199
+    },
+    {
+      "epoch": 0.042,
+      "grad_norm": 0.6765742301940918,
+      "learning_rate": 0.003,
+      "loss": 4.0976,
+      "step": 4200
+    },
+    {
+      "epoch": 0.04201,
+      "grad_norm": 0.6093567609786987,
+      "learning_rate": 0.003,
+      "loss": 4.0736,
+      "step": 4201
+    },
+    {
+      "epoch": 0.04202,
+      "grad_norm": 0.6598721146583557,
+      "learning_rate": 0.003,
+      "loss": 4.0915,
+      "step": 4202
+    },
+    {
+      "epoch": 0.04203,
+      "grad_norm": 0.5697882175445557,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 4203
+    },
+    {
+      "epoch": 0.04204,
+      "grad_norm": 0.6565430760383606,
+      "learning_rate": 0.003,
+      "loss": 4.0775,
+      "step": 4204
+    },
+    {
+      "epoch": 0.04205,
+      "grad_norm": 0.8136803507804871,
+      "learning_rate": 0.003,
+      "loss": 4.1051,
+      "step": 4205
+    },
+    {
+      "epoch": 0.04206,
+      "grad_norm": 0.828262984752655,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 4206
+    },
+    {
+      "epoch": 0.04207,
+      "grad_norm": 0.7792331576347351,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 4207
+    },
+    {
+      "epoch": 0.04208,
+      "grad_norm": 0.9303550124168396,
+      "learning_rate": 0.003,
+      "loss": 4.1121,
+      "step": 4208
+    },
+    {
+      "epoch": 0.04209,
+      "grad_norm": 0.8936018347740173,
+      "learning_rate": 0.003,
+      "loss": 4.1162,
+      "step": 4209
+    },
+    {
+      "epoch": 0.0421,
+      "grad_norm": 1.0039626359939575,
+      "learning_rate": 0.003,
+      "loss": 4.081,
+      "step": 4210
+    },
+    {
+      "epoch": 0.04211,
+      "grad_norm": 0.9981667995452881,
+      "learning_rate": 0.003,
+      "loss": 4.1234,
+      "step": 4211
+    },
+    {
+      "epoch": 0.04212,
+      "grad_norm": 0.8137405514717102,
+      "learning_rate": 0.003,
+      "loss": 4.1121,
+      "step": 4212
+    },
+    {
+      "epoch": 0.04213,
+      "grad_norm": 0.8376567959785461,
+      "learning_rate": 0.003,
+      "loss": 4.0954,
+      "step": 4213
+    },
+    {
+      "epoch": 0.04214,
+      "grad_norm": 0.7845513820648193,
+      "learning_rate": 0.003,
+      "loss": 4.0967,
+      "step": 4214
+    },
+    {
+      "epoch": 0.04215,
+      "grad_norm": 0.7330971360206604,
+      "learning_rate": 0.003,
+      "loss": 4.1202,
+      "step": 4215
+    },
+    {
+      "epoch": 0.04216,
+      "grad_norm": 0.7391840219497681,
+      "learning_rate": 0.003,
+      "loss": 4.0842,
+      "step": 4216
+    },
+    {
+      "epoch": 0.04217,
+      "grad_norm": 0.8007631897926331,
+      "learning_rate": 0.003,
+      "loss": 4.1075,
+      "step": 4217
+    },
+    {
+      "epoch": 0.04218,
+      "grad_norm": 0.8993299007415771,
+      "learning_rate": 0.003,
+      "loss": 4.1425,
+      "step": 4218
+    },
+    {
+      "epoch": 0.04219,
+      "grad_norm": 0.9009442925453186,
+      "learning_rate": 0.003,
+      "loss": 4.0612,
+      "step": 4219
+    },
+    {
+      "epoch": 0.0422,
+      "grad_norm": 1.0134148597717285,
+      "learning_rate": 0.003,
+      "loss": 4.1351,
+      "step": 4220
+    },
+    {
+      "epoch": 0.04221,
+      "grad_norm": 0.9685496687889099,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 4221
+    },
+    {
+      "epoch": 0.04222,
+      "grad_norm": 0.8909528255462646,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 4222
+    },
+    {
+      "epoch": 0.04223,
+      "grad_norm": 0.806991457939148,
+      "learning_rate": 0.003,
+      "loss": 4.1347,
+      "step": 4223
+    },
+    {
+      "epoch": 0.04224,
+      "grad_norm": 0.7712184190750122,
+      "learning_rate": 0.003,
+      "loss": 4.1378,
+      "step": 4224
+    },
+    {
+      "epoch": 0.04225,
+      "grad_norm": 0.6012980341911316,
+      "learning_rate": 0.003,
+      "loss": 4.119,
+      "step": 4225
+    },
+    {
+      "epoch": 0.04226,
+      "grad_norm": 0.6349509358406067,
+      "learning_rate": 0.003,
+      "loss": 4.1038,
+      "step": 4226
+    },
+    {
+      "epoch": 0.04227,
+      "grad_norm": 0.6691918969154358,
+      "learning_rate": 0.003,
+      "loss": 4.1017,
+      "step": 4227
+    },
+    {
+      "epoch": 0.04228,
+      "grad_norm": 0.622721254825592,
+      "learning_rate": 0.003,
+      "loss": 4.0855,
+      "step": 4228
+    },
+    {
+      "epoch": 0.04229,
+      "grad_norm": 0.57576984167099,
+      "learning_rate": 0.003,
+      "loss": 4.1088,
+      "step": 4229
+    },
+    {
+      "epoch": 0.0423,
+      "grad_norm": 0.6181168556213379,
+      "learning_rate": 0.003,
+      "loss": 4.0976,
+      "step": 4230
+    },
+    {
+      "epoch": 0.04231,
+      "grad_norm": 0.6395006775856018,
+      "learning_rate": 0.003,
+      "loss": 4.0956,
+      "step": 4231
+    },
+    {
+      "epoch": 0.04232,
+      "grad_norm": 0.5587146878242493,
+      "learning_rate": 0.003,
+      "loss": 4.1051,
+      "step": 4232
+    },
+    {
+      "epoch": 0.04233,
+      "grad_norm": 0.4529360234737396,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 4233
+    },
+    {
+      "epoch": 0.04234,
+      "grad_norm": 0.4365110397338867,
+      "learning_rate": 0.003,
+      "loss": 4.0861,
+      "step": 4234
+    },
+    {
+      "epoch": 0.04235,
+      "grad_norm": 0.3945704996585846,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 4235
+    },
+    {
+      "epoch": 0.04236,
+      "grad_norm": 0.4571150541305542,
+      "learning_rate": 0.003,
+      "loss": 4.0782,
+      "step": 4236
+    },
+    {
+      "epoch": 0.04237,
+      "grad_norm": 0.4355223476886749,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 4237
+    },
+    {
+      "epoch": 0.04238,
+      "grad_norm": 0.47384676337242126,
+      "learning_rate": 0.003,
+      "loss": 4.1093,
+      "step": 4238
+    },
+    {
+      "epoch": 0.04239,
+      "grad_norm": 0.5304744243621826,
+      "learning_rate": 0.003,
+      "loss": 4.0914,
+      "step": 4239
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.5990294814109802,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 4240
+    },
+    {
+      "epoch": 0.04241,
+      "grad_norm": 0.6438462734222412,
+      "learning_rate": 0.003,
+      "loss": 4.1062,
+      "step": 4241
+    },
+    {
+      "epoch": 0.04242,
+      "grad_norm": 0.6008359789848328,
+      "learning_rate": 0.003,
+      "loss": 4.0942,
+      "step": 4242
+    },
+    {
+      "epoch": 0.04243,
+      "grad_norm": 0.653829038143158,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 4243
+    },
+    {
+      "epoch": 0.04244,
+      "grad_norm": 0.6691821813583374,
+      "learning_rate": 0.003,
+      "loss": 4.0863,
+      "step": 4244
+    },
+    {
+      "epoch": 0.04245,
+      "grad_norm": 0.62837815284729,
+      "learning_rate": 0.003,
+      "loss": 4.0801,
+      "step": 4245
+    },
+    {
+      "epoch": 0.04246,
+      "grad_norm": 0.6627157330513,
+      "learning_rate": 0.003,
+      "loss": 4.1066,
+      "step": 4246
+    },
+    {
+      "epoch": 0.04247,
+      "grad_norm": 0.7871987819671631,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 4247
+    },
+    {
+      "epoch": 0.04248,
+      "grad_norm": 0.9460709095001221,
+      "learning_rate": 0.003,
+      "loss": 4.0872,
+      "step": 4248
+    },
+    {
+      "epoch": 0.04249,
+      "grad_norm": 0.9994218945503235,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 4249
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.8460412621498108,
+      "learning_rate": 0.003,
+      "loss": 4.1031,
+      "step": 4250
+    },
+    {
+      "epoch": 0.04251,
+      "grad_norm": 0.6389533281326294,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 4251
+    },
+    {
+      "epoch": 0.04252,
+      "grad_norm": 0.7343540191650391,
+      "learning_rate": 0.003,
+      "loss": 4.1074,
+      "step": 4252
+    },
+    {
+      "epoch": 0.04253,
+      "grad_norm": 0.8084548115730286,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 4253
+    },
+    {
+      "epoch": 0.04254,
+      "grad_norm": 0.804446280002594,
+      "learning_rate": 0.003,
+      "loss": 4.0996,
+      "step": 4254
+    },
+    {
+      "epoch": 0.04255,
+      "grad_norm": 0.6976709365844727,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 4255
+    },
+    {
+      "epoch": 0.04256,
+      "grad_norm": 0.6876462697982788,
+      "learning_rate": 0.003,
+      "loss": 4.0828,
+      "step": 4256
+    },
+    {
+      "epoch": 0.04257,
+      "grad_norm": 0.6768009662628174,
+      "learning_rate": 0.003,
+      "loss": 4.1066,
+      "step": 4257
+    },
+    {
+      "epoch": 0.04258,
+      "grad_norm": 0.735969603061676,
+      "learning_rate": 0.003,
+      "loss": 4.1081,
+      "step": 4258
+    },
+    {
+      "epoch": 0.04259,
+      "grad_norm": 0.83452969789505,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 4259
+    },
+    {
+      "epoch": 0.0426,
+      "grad_norm": 0.7557306289672852,
+      "learning_rate": 0.003,
+      "loss": 4.0942,
+      "step": 4260
+    },
+    {
+      "epoch": 0.04261,
+      "grad_norm": 0.7074769735336304,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 4261
+    },
+    {
+      "epoch": 0.04262,
+      "grad_norm": 0.6406824588775635,
+      "learning_rate": 0.003,
+      "loss": 4.1203,
+      "step": 4262
+    },
+    {
+      "epoch": 0.04263,
+      "grad_norm": 0.6755427122116089,
+      "learning_rate": 0.003,
+      "loss": 4.104,
+      "step": 4263
+    },
+    {
+      "epoch": 0.04264,
+      "grad_norm": 0.7064899206161499,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 4264
+    },
+    {
+      "epoch": 0.04265,
+      "grad_norm": 0.673495888710022,
+      "learning_rate": 0.003,
+      "loss": 4.1086,
+      "step": 4265
+    },
+    {
+      "epoch": 0.04266,
+      "grad_norm": 0.6447572708129883,
+      "learning_rate": 0.003,
+      "loss": 4.0893,
+      "step": 4266
+    },
+    {
+      "epoch": 0.04267,
+      "grad_norm": 0.5904159545898438,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 4267
+    },
+    {
+      "epoch": 0.04268,
+      "grad_norm": 0.6268547177314758,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 4268
+    },
+    {
+      "epoch": 0.04269,
+      "grad_norm": 0.5750144124031067,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 4269
+    },
+    {
+      "epoch": 0.0427,
+      "grad_norm": 0.44790661334991455,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 4270
+    },
+    {
+      "epoch": 0.04271,
+      "grad_norm": 0.4415220022201538,
+      "learning_rate": 0.003,
+      "loss": 4.094,
+      "step": 4271
+    },
+    {
+      "epoch": 0.04272,
+      "grad_norm": 0.39944225549697876,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 4272
+    },
+    {
+      "epoch": 0.04273,
+      "grad_norm": 0.3969830870628357,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 4273
+    },
+    {
+      "epoch": 0.04274,
+      "grad_norm": 0.41689732670783997,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 4274
+    },
+    {
+      "epoch": 0.04275,
+      "grad_norm": 0.4759061932563782,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 4275
+    },
+    {
+      "epoch": 0.04276,
+      "grad_norm": 0.6165413856506348,
+      "learning_rate": 0.003,
+      "loss": 4.1064,
+      "step": 4276
+    },
+    {
+      "epoch": 0.04277,
+      "grad_norm": 0.9280685782432556,
+      "learning_rate": 0.003,
+      "loss": 4.1024,
+      "step": 4277
+    },
+    {
+      "epoch": 0.04278,
+      "grad_norm": 1.2253838777542114,
+      "learning_rate": 0.003,
+      "loss": 4.1203,
+      "step": 4278
+    },
+    {
+      "epoch": 0.04279,
+      "grad_norm": 0.5876243710517883,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 4279
+    },
+    {
+      "epoch": 0.0428,
+      "grad_norm": 0.5460189580917358,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 4280
+    },
+    {
+      "epoch": 0.04281,
+      "grad_norm": 0.81770920753479,
+      "learning_rate": 0.003,
+      "loss": 4.1182,
+      "step": 4281
+    },
+    {
+      "epoch": 0.04282,
+      "grad_norm": 0.7643664479255676,
+      "learning_rate": 0.003,
+      "loss": 4.0913,
+      "step": 4282
+    },
+    {
+      "epoch": 0.04283,
+      "grad_norm": 0.7183843851089478,
+      "learning_rate": 0.003,
+      "loss": 4.1257,
+      "step": 4283
+    },
+    {
+      "epoch": 0.04284,
+      "grad_norm": 0.6911137104034424,
+      "learning_rate": 0.003,
+      "loss": 4.0797,
+      "step": 4284
+    },
+    {
+      "epoch": 0.04285,
+      "grad_norm": 0.6681384444236755,
+      "learning_rate": 0.003,
+      "loss": 4.0955,
+      "step": 4285
+    },
+    {
+      "epoch": 0.04286,
+      "grad_norm": 0.7612639665603638,
+      "learning_rate": 0.003,
+      "loss": 4.117,
+      "step": 4286
+    },
+    {
+      "epoch": 0.04287,
+      "grad_norm": 0.8821431398391724,
+      "learning_rate": 0.003,
+      "loss": 4.1038,
+      "step": 4287
+    },
+    {
+      "epoch": 0.04288,
+      "grad_norm": 0.8891422748565674,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 4288
+    },
+    {
+      "epoch": 0.04289,
+      "grad_norm": 0.8912457823753357,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 4289
+    },
+    {
+      "epoch": 0.0429,
+      "grad_norm": 0.9452528953552246,
+      "learning_rate": 0.003,
+      "loss": 4.0881,
+      "step": 4290
+    },
+    {
+      "epoch": 0.04291,
+      "grad_norm": 0.9560781121253967,
+      "learning_rate": 0.003,
+      "loss": 4.0997,
+      "step": 4291
+    },
+    {
+      "epoch": 0.04292,
+      "grad_norm": 0.9647819995880127,
+      "learning_rate": 0.003,
+      "loss": 4.1085,
+      "step": 4292
+    },
+    {
+      "epoch": 0.04293,
+      "grad_norm": 1.0337492227554321,
+      "learning_rate": 0.003,
+      "loss": 4.1468,
+      "step": 4293
+    },
+    {
+      "epoch": 0.04294,
+      "grad_norm": 0.9274333119392395,
+      "learning_rate": 0.003,
+      "loss": 4.1336,
+      "step": 4294
+    },
+    {
+      "epoch": 0.04295,
+      "grad_norm": 0.9098928570747375,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 4295
+    },
+    {
+      "epoch": 0.04296,
+      "grad_norm": 0.9552505612373352,
+      "learning_rate": 0.003,
+      "loss": 4.1028,
+      "step": 4296
+    },
+    {
+      "epoch": 0.04297,
+      "grad_norm": 0.9055096507072449,
+      "learning_rate": 0.003,
+      "loss": 4.0878,
+      "step": 4297
+    },
+    {
+      "epoch": 0.04298,
+      "grad_norm": 0.9436898231506348,
+      "learning_rate": 0.003,
+      "loss": 4.1346,
+      "step": 4298
+    },
+    {
+      "epoch": 0.04299,
+      "grad_norm": 0.9265126585960388,
+      "learning_rate": 0.003,
+      "loss": 4.1257,
+      "step": 4299
+    },
+    {
+      "epoch": 0.043,
+      "grad_norm": 0.9316915273666382,
+      "learning_rate": 0.003,
+      "loss": 4.1105,
+      "step": 4300
+    },
+    {
+      "epoch": 0.04301,
+      "grad_norm": 0.9089905023574829,
+      "learning_rate": 0.003,
+      "loss": 4.1575,
+      "step": 4301
+    },
+    {
+      "epoch": 0.04302,
+      "grad_norm": 0.9315720200538635,
+      "learning_rate": 0.003,
+      "loss": 4.1455,
+      "step": 4302
+    },
+    {
+      "epoch": 0.04303,
+      "grad_norm": 0.9286166429519653,
+      "learning_rate": 0.003,
+      "loss": 4.1383,
+      "step": 4303
+    },
+    {
+      "epoch": 0.04304,
+      "grad_norm": 0.91733717918396,
+      "learning_rate": 0.003,
+      "loss": 4.1311,
+      "step": 4304
+    },
+    {
+      "epoch": 0.04305,
+      "grad_norm": 1.065048098564148,
+      "learning_rate": 0.003,
+      "loss": 4.1359,
+      "step": 4305
+    },
+    {
+      "epoch": 0.04306,
+      "grad_norm": 0.9198899269104004,
+      "learning_rate": 0.003,
+      "loss": 4.1058,
+      "step": 4306
+    },
+    {
+      "epoch": 0.04307,
+      "grad_norm": 0.729816198348999,
+      "learning_rate": 0.003,
+      "loss": 4.1072,
+      "step": 4307
+    },
+    {
+      "epoch": 0.04308,
+      "grad_norm": 0.6812875866889954,
+      "learning_rate": 0.003,
+      "loss": 4.1182,
+      "step": 4308
+    },
+    {
+      "epoch": 0.04309,
+      "grad_norm": 0.6616823077201843,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 4309
+    },
+    {
+      "epoch": 0.0431,
+      "grad_norm": 0.7262138724327087,
+      "learning_rate": 0.003,
+      "loss": 4.1286,
+      "step": 4310
+    },
+    {
+      "epoch": 0.04311,
+      "grad_norm": 0.6826511025428772,
+      "learning_rate": 0.003,
+      "loss": 4.12,
+      "step": 4311
+    },
+    {
+      "epoch": 0.04312,
+      "grad_norm": 0.5878548622131348,
+      "learning_rate": 0.003,
+      "loss": 4.097,
+      "step": 4312
+    },
+    {
+      "epoch": 0.04313,
+      "grad_norm": 0.52091383934021,
+      "learning_rate": 0.003,
+      "loss": 4.1045,
+      "step": 4313
+    },
+    {
+      "epoch": 0.04314,
+      "grad_norm": 0.5465619564056396,
+      "learning_rate": 0.003,
+      "loss": 4.0927,
+      "step": 4314
+    },
+    {
+      "epoch": 0.04315,
+      "grad_norm": 0.5617761015892029,
+      "learning_rate": 0.003,
+      "loss": 4.1144,
+      "step": 4315
+    },
+    {
+      "epoch": 0.04316,
+      "grad_norm": 0.5584519505500793,
+      "learning_rate": 0.003,
+      "loss": 4.1098,
+      "step": 4316
+    },
+    {
+      "epoch": 0.04317,
+      "grad_norm": 0.591358482837677,
+      "learning_rate": 0.003,
+      "loss": 4.1218,
+      "step": 4317
+    },
+    {
+      "epoch": 0.04318,
+      "grad_norm": 0.6004156470298767,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 4318
+    },
+    {
+      "epoch": 0.04319,
+      "grad_norm": 0.6281335353851318,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 4319
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.5984190702438354,
+      "learning_rate": 0.003,
+      "loss": 4.0833,
+      "step": 4320
+    },
+    {
+      "epoch": 0.04321,
+      "grad_norm": 0.4503112733364105,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 4321
+    },
+    {
+      "epoch": 0.04322,
+      "grad_norm": 0.44634100794792175,
+      "learning_rate": 0.003,
+      "loss": 4.1154,
+      "step": 4322
+    },
+    {
+      "epoch": 0.04323,
+      "grad_norm": 0.4618377089500427,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 4323
+    },
+    {
+      "epoch": 0.04324,
+      "grad_norm": 0.4412762224674225,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 4324
+    },
+    {
+      "epoch": 0.04325,
+      "grad_norm": 0.4403208792209625,
+      "learning_rate": 0.003,
+      "loss": 4.1039,
+      "step": 4325
+    },
+    {
+      "epoch": 0.04326,
+      "grad_norm": 0.4678369164466858,
+      "learning_rate": 0.003,
+      "loss": 4.0834,
+      "step": 4326
+    },
+    {
+      "epoch": 0.04327,
+      "grad_norm": 0.5307203531265259,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 4327
+    },
+    {
+      "epoch": 0.04328,
+      "grad_norm": 0.6740878820419312,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 4328
+    },
+    {
+      "epoch": 0.04329,
+      "grad_norm": 0.827231764793396,
+      "learning_rate": 0.003,
+      "loss": 4.1075,
+      "step": 4329
+    },
+    {
+      "epoch": 0.0433,
+      "grad_norm": 0.9776306748390198,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 4330
+    },
+    {
+      "epoch": 0.04331,
+      "grad_norm": 0.7999715805053711,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 4331
+    },
+    {
+      "epoch": 0.04332,
+      "grad_norm": 0.6406177282333374,
+      "learning_rate": 0.003,
+      "loss": 4.1209,
+      "step": 4332
+    },
+    {
+      "epoch": 0.04333,
+      "grad_norm": 0.7425047159194946,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 4333
+    },
+    {
+      "epoch": 0.04334,
+      "grad_norm": 0.7017237544059753,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 4334
+    },
+    {
+      "epoch": 0.04335,
+      "grad_norm": 0.6225997805595398,
+      "learning_rate": 0.003,
+      "loss": 4.0669,
+      "step": 4335
+    },
+    {
+      "epoch": 0.04336,
+      "grad_norm": 0.5787286162376404,
+      "learning_rate": 0.003,
+      "loss": 4.1011,
+      "step": 4336
+    },
+    {
+      "epoch": 0.04337,
+      "grad_norm": 0.4820355474948883,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 4337
+    },
+    {
+      "epoch": 0.04338,
+      "grad_norm": 0.454394668340683,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 4338
+    },
+    {
+      "epoch": 0.04339,
+      "grad_norm": 0.48518452048301697,
+      "learning_rate": 0.003,
+      "loss": 4.0904,
+      "step": 4339
+    },
+    {
+      "epoch": 0.0434,
+      "grad_norm": 0.6182829737663269,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 4340
+    },
+    {
+      "epoch": 0.04341,
+      "grad_norm": 0.6660934090614319,
+      "learning_rate": 0.003,
+      "loss": 4.0862,
+      "step": 4341
+    },
+    {
+      "epoch": 0.04342,
+      "grad_norm": 0.7125284671783447,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 4342
+    },
+    {
+      "epoch": 0.04343,
+      "grad_norm": 0.6455511450767517,
+      "learning_rate": 0.003,
+      "loss": 4.114,
+      "step": 4343
+    },
+    {
+      "epoch": 0.04344,
+      "grad_norm": 0.5411390066146851,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 4344
+    },
+    {
+      "epoch": 0.04345,
+      "grad_norm": 0.6171619296073914,
+      "learning_rate": 0.003,
+      "loss": 4.0961,
+      "step": 4345
+    },
+    {
+      "epoch": 0.04346,
+      "grad_norm": 0.8696421980857849,
+      "learning_rate": 0.003,
+      "loss": 4.0988,
+      "step": 4346
+    },
+    {
+      "epoch": 0.04347,
+      "grad_norm": 1.0493338108062744,
+      "learning_rate": 0.003,
+      "loss": 4.112,
+      "step": 4347
+    },
+    {
+      "epoch": 0.04348,
+      "grad_norm": 0.9583601951599121,
+      "learning_rate": 0.003,
+      "loss": 4.0934,
+      "step": 4348
+    },
+    {
+      "epoch": 0.04349,
+      "grad_norm": 0.8417112827301025,
+      "learning_rate": 0.003,
+      "loss": 4.1091,
+      "step": 4349
+    },
+    {
+      "epoch": 0.0435,
+      "grad_norm": 0.7813431620597839,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 4350
+    },
+    {
+      "epoch": 0.04351,
+      "grad_norm": 0.7515244483947754,
+      "learning_rate": 0.003,
+      "loss": 4.1075,
+      "step": 4351
+    },
+    {
+      "epoch": 0.04352,
+      "grad_norm": 0.7575326561927795,
+      "learning_rate": 0.003,
+      "loss": 4.1207,
+      "step": 4352
+    },
+    {
+      "epoch": 0.04353,
+      "grad_norm": 0.8079462647438049,
+      "learning_rate": 0.003,
+      "loss": 4.0973,
+      "step": 4353
+    },
+    {
+      "epoch": 0.04354,
+      "grad_norm": 0.8209161162376404,
+      "learning_rate": 0.003,
+      "loss": 4.0994,
+      "step": 4354
+    },
+    {
+      "epoch": 0.04355,
+      "grad_norm": 0.8110191226005554,
+      "learning_rate": 0.003,
+      "loss": 4.1047,
+      "step": 4355
+    },
+    {
+      "epoch": 0.04356,
+      "grad_norm": 0.8562638759613037,
+      "learning_rate": 0.003,
+      "loss": 4.1157,
+      "step": 4356
+    },
+    {
+      "epoch": 0.04357,
+      "grad_norm": 0.773973822593689,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 4357
+    },
+    {
+      "epoch": 0.04358,
+      "grad_norm": 0.8397206664085388,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 4358
+    },
+    {
+      "epoch": 0.04359,
+      "grad_norm": 0.7626861929893494,
+      "learning_rate": 0.003,
+      "loss": 4.1059,
+      "step": 4359
+    },
+    {
+      "epoch": 0.0436,
+      "grad_norm": 0.80926513671875,
+      "learning_rate": 0.003,
+      "loss": 4.1105,
+      "step": 4360
+    },
+    {
+      "epoch": 0.04361,
+      "grad_norm": 0.8279222846031189,
+      "learning_rate": 0.003,
+      "loss": 4.077,
+      "step": 4361
+    },
+    {
+      "epoch": 0.04362,
+      "grad_norm": 0.7730289697647095,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 4362
+    },
+    {
+      "epoch": 0.04363,
+      "grad_norm": 0.6915876865386963,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 4363
+    },
+    {
+      "epoch": 0.04364,
+      "grad_norm": 0.6894673705101013,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 4364
+    },
+    {
+      "epoch": 0.04365,
+      "grad_norm": 0.6546920537948608,
+      "learning_rate": 0.003,
+      "loss": 4.1016,
+      "step": 4365
+    },
+    {
+      "epoch": 0.04366,
+      "grad_norm": 0.6718312501907349,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 4366
+    },
+    {
+      "epoch": 0.04367,
+      "grad_norm": 0.7026388049125671,
+      "learning_rate": 0.003,
+      "loss": 4.1063,
+      "step": 4367
+    },
+    {
+      "epoch": 0.04368,
+      "grad_norm": 0.664130687713623,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 4368
+    },
+    {
+      "epoch": 0.04369,
+      "grad_norm": 0.4905566871166229,
+      "learning_rate": 0.003,
+      "loss": 4.0799,
+      "step": 4369
+    },
+    {
+      "epoch": 0.0437,
+      "grad_norm": 0.5002229809761047,
+      "learning_rate": 0.003,
+      "loss": 4.1057,
+      "step": 4370
+    },
+    {
+      "epoch": 0.04371,
+      "grad_norm": 0.5647168159484863,
+      "learning_rate": 0.003,
+      "loss": 4.0918,
+      "step": 4371
+    },
+    {
+      "epoch": 0.04372,
+      "grad_norm": 0.5772929787635803,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 4372
+    },
+    {
+      "epoch": 0.04373,
+      "grad_norm": 0.5668372511863708,
+      "learning_rate": 0.003,
+      "loss": 4.0962,
+      "step": 4373
+    },
+    {
+      "epoch": 0.04374,
+      "grad_norm": 0.5907851457595825,
+      "learning_rate": 0.003,
+      "loss": 4.1375,
+      "step": 4374
+    },
+    {
+      "epoch": 0.04375,
+      "grad_norm": 0.5372269749641418,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 4375
+    },
+    {
+      "epoch": 0.04376,
+      "grad_norm": 0.5136074423789978,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4376
+    },
+    {
+      "epoch": 0.04377,
+      "grad_norm": 0.5012854933738708,
+      "learning_rate": 0.003,
+      "loss": 4.1148,
+      "step": 4377
+    },
+    {
+      "epoch": 0.04378,
+      "grad_norm": 0.5583826899528503,
+      "learning_rate": 0.003,
+      "loss": 4.1,
+      "step": 4378
+    },
+    {
+      "epoch": 0.04379,
+      "grad_norm": 0.609851062297821,
+      "learning_rate": 0.003,
+      "loss": 4.115,
+      "step": 4379
+    },
+    {
+      "epoch": 0.0438,
+      "grad_norm": 0.6667990684509277,
+      "learning_rate": 0.003,
+      "loss": 4.0873,
+      "step": 4380
+    },
+    {
+      "epoch": 0.04381,
+      "grad_norm": 0.769900381565094,
+      "learning_rate": 0.003,
+      "loss": 4.0798,
+      "step": 4381
+    },
+    {
+      "epoch": 0.04382,
+      "grad_norm": 0.918383777141571,
+      "learning_rate": 0.003,
+      "loss": 4.0954,
+      "step": 4382
+    },
+    {
+      "epoch": 0.04383,
+      "grad_norm": 1.0884766578674316,
+      "learning_rate": 0.003,
+      "loss": 4.0835,
+      "step": 4383
+    },
+    {
+      "epoch": 0.04384,
+      "grad_norm": 0.753715991973877,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 4384
+    },
+    {
+      "epoch": 0.04385,
+      "grad_norm": 0.5258980989456177,
+      "learning_rate": 0.003,
+      "loss": 4.112,
+      "step": 4385
+    },
+    {
+      "epoch": 0.04386,
+      "grad_norm": 0.7058178186416626,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 4386
+    },
+    {
+      "epoch": 0.04387,
+      "grad_norm": 0.865214467048645,
+      "learning_rate": 0.003,
+      "loss": 4.1175,
+      "step": 4387
+    },
+    {
+      "epoch": 0.04388,
+      "grad_norm": 0.872906506061554,
+      "learning_rate": 0.003,
+      "loss": 4.0803,
+      "step": 4388
+    },
+    {
+      "epoch": 0.04389,
+      "grad_norm": 0.785380482673645,
+      "learning_rate": 0.003,
+      "loss": 4.1128,
+      "step": 4389
+    },
+    {
+      "epoch": 0.0439,
+      "grad_norm": 0.6976808309555054,
+      "learning_rate": 0.003,
+      "loss": 4.1064,
+      "step": 4390
+    },
+    {
+      "epoch": 0.04391,
+      "grad_norm": 0.7179602384567261,
+      "learning_rate": 0.003,
+      "loss": 4.0775,
+      "step": 4391
+    },
+    {
+      "epoch": 0.04392,
+      "grad_norm": 0.7050849199295044,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 4392
+    },
+    {
+      "epoch": 0.04393,
+      "grad_norm": 0.6494352221488953,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 4393
+    },
+    {
+      "epoch": 0.04394,
+      "grad_norm": 0.5354273915290833,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4394
+    },
+    {
+      "epoch": 0.04395,
+      "grad_norm": 0.5617183446884155,
+      "learning_rate": 0.003,
+      "loss": 4.0725,
+      "step": 4395
+    },
+    {
+      "epoch": 0.04396,
+      "grad_norm": 0.5766990780830383,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 4396
+    },
+    {
+      "epoch": 0.04397,
+      "grad_norm": 0.6702291369438171,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 4397
+    },
+    {
+      "epoch": 0.04398,
+      "grad_norm": 0.7613652944564819,
+      "learning_rate": 0.003,
+      "loss": 4.0895,
+      "step": 4398
+    },
+    {
+      "epoch": 0.04399,
+      "grad_norm": 0.8499076962471008,
+      "learning_rate": 0.003,
+      "loss": 4.1,
+      "step": 4399
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 1.0589154958724976,
+      "learning_rate": 0.003,
+      "loss": 4.1061,
+      "step": 4400
+    },
+    {
+      "epoch": 0.04401,
+      "grad_norm": 0.9029095768928528,
+      "learning_rate": 0.003,
+      "loss": 4.1163,
+      "step": 4401
+    },
+    {
+      "epoch": 0.04402,
+      "grad_norm": 0.7743659019470215,
+      "learning_rate": 0.003,
+      "loss": 4.118,
+      "step": 4402
+    },
+    {
+      "epoch": 0.04403,
+      "grad_norm": 0.6107021570205688,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 4403
+    },
+    {
+      "epoch": 0.04404,
+      "grad_norm": 0.578995406627655,
+      "learning_rate": 0.003,
+      "loss": 4.1058,
+      "step": 4404
+    },
+    {
+      "epoch": 0.04405,
+      "grad_norm": 0.5411956906318665,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 4405
+    },
+    {
+      "epoch": 0.04406,
+      "grad_norm": 0.5497092008590698,
+      "learning_rate": 0.003,
+      "loss": 4.0942,
+      "step": 4406
+    },
+    {
+      "epoch": 0.04407,
+      "grad_norm": 0.5126742124557495,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 4407
+    },
+    {
+      "epoch": 0.04408,
+      "grad_norm": 0.4807611107826233,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 4408
+    },
+    {
+      "epoch": 0.04409,
+      "grad_norm": 0.4981795847415924,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 4409
+    },
+    {
+      "epoch": 0.0441,
+      "grad_norm": 0.5225616097450256,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 4410
+    },
+    {
+      "epoch": 0.04411,
+      "grad_norm": 0.525507390499115,
+      "learning_rate": 0.003,
+      "loss": 4.0664,
+      "step": 4411
+    },
+    {
+      "epoch": 0.04412,
+      "grad_norm": 0.551207423210144,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 4412
+    },
+    {
+      "epoch": 0.04413,
+      "grad_norm": 0.5674916505813599,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 4413
+    },
+    {
+      "epoch": 0.04414,
+      "grad_norm": 0.5803220868110657,
+      "learning_rate": 0.003,
+      "loss": 4.1154,
+      "step": 4414
+    },
+    {
+      "epoch": 0.04415,
+      "grad_norm": 0.6080368161201477,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 4415
+    },
+    {
+      "epoch": 0.04416,
+      "grad_norm": 0.6551415920257568,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 4416
+    },
+    {
+      "epoch": 0.04417,
+      "grad_norm": 0.727580189704895,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 4417
+    },
+    {
+      "epoch": 0.04418,
+      "grad_norm": 0.869605302810669,
+      "learning_rate": 0.003,
+      "loss": 4.0966,
+      "step": 4418
+    },
+    {
+      "epoch": 0.04419,
+      "grad_norm": 0.9329266548156738,
+      "learning_rate": 0.003,
+      "loss": 4.1187,
+      "step": 4419
+    },
+    {
+      "epoch": 0.0442,
+      "grad_norm": 0.971718966960907,
+      "learning_rate": 0.003,
+      "loss": 4.0936,
+      "step": 4420
+    },
+    {
+      "epoch": 0.04421,
+      "grad_norm": 0.8899560570716858,
+      "learning_rate": 0.003,
+      "loss": 4.1288,
+      "step": 4421
+    },
+    {
+      "epoch": 0.04422,
+      "grad_norm": 0.8837299942970276,
+      "learning_rate": 0.003,
+      "loss": 4.0887,
+      "step": 4422
+    },
+    {
+      "epoch": 0.04423,
+      "grad_norm": 0.8218461871147156,
+      "learning_rate": 0.003,
+      "loss": 4.1124,
+      "step": 4423
+    },
+    {
+      "epoch": 0.04424,
+      "grad_norm": 0.834577202796936,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 4424
+    },
+    {
+      "epoch": 0.04425,
+      "grad_norm": 0.7609786987304688,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 4425
+    },
+    {
+      "epoch": 0.04426,
+      "grad_norm": 0.71391761302948,
+      "learning_rate": 0.003,
+      "loss": 4.1059,
+      "step": 4426
+    },
+    {
+      "epoch": 0.04427,
+      "grad_norm": 0.6980590224266052,
+      "learning_rate": 0.003,
+      "loss": 4.0932,
+      "step": 4427
+    },
+    {
+      "epoch": 0.04428,
+      "grad_norm": 0.6043670773506165,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 4428
+    },
+    {
+      "epoch": 0.04429,
+      "grad_norm": 0.6160145401954651,
+      "learning_rate": 0.003,
+      "loss": 4.1247,
+      "step": 4429
+    },
+    {
+      "epoch": 0.0443,
+      "grad_norm": 0.6831215620040894,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 4430
+    },
+    {
+      "epoch": 0.04431,
+      "grad_norm": 0.8066039681434631,
+      "learning_rate": 0.003,
+      "loss": 4.1129,
+      "step": 4431
+    },
+    {
+      "epoch": 0.04432,
+      "grad_norm": 0.8776717782020569,
+      "learning_rate": 0.003,
+      "loss": 4.1202,
+      "step": 4432
+    },
+    {
+      "epoch": 0.04433,
+      "grad_norm": 0.7881397008895874,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 4433
+    },
+    {
+      "epoch": 0.04434,
+      "grad_norm": 0.6989492177963257,
+      "learning_rate": 0.003,
+      "loss": 4.0936,
+      "step": 4434
+    },
+    {
+      "epoch": 0.04435,
+      "grad_norm": 0.6727581024169922,
+      "learning_rate": 0.003,
+      "loss": 4.1041,
+      "step": 4435
+    },
+    {
+      "epoch": 0.04436,
+      "grad_norm": 0.6831278204917908,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 4436
+    },
+    {
+      "epoch": 0.04437,
+      "grad_norm": 0.7069624066352844,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 4437
+    },
+    {
+      "epoch": 0.04438,
+      "grad_norm": 0.8832647204399109,
+      "learning_rate": 0.003,
+      "loss": 4.1453,
+      "step": 4438
+    },
+    {
+      "epoch": 0.04439,
+      "grad_norm": 1.118896245956421,
+      "learning_rate": 0.003,
+      "loss": 4.1217,
+      "step": 4439
+    },
+    {
+      "epoch": 0.0444,
+      "grad_norm": 0.8600191473960876,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 4440
+    },
+    {
+      "epoch": 0.04441,
+      "grad_norm": 0.7522971630096436,
+      "learning_rate": 0.003,
+      "loss": 4.1027,
+      "step": 4441
+    },
+    {
+      "epoch": 0.04442,
+      "grad_norm": 0.6988781690597534,
+      "learning_rate": 0.003,
+      "loss": 4.0958,
+      "step": 4442
+    },
+    {
+      "epoch": 0.04443,
+      "grad_norm": 0.7806941270828247,
+      "learning_rate": 0.003,
+      "loss": 4.0726,
+      "step": 4443
+    },
+    {
+      "epoch": 0.04444,
+      "grad_norm": 0.7053735256195068,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 4444
+    },
+    {
+      "epoch": 0.04445,
+      "grad_norm": 0.6177310943603516,
+      "learning_rate": 0.003,
+      "loss": 4.0985,
+      "step": 4445
+    },
+    {
+      "epoch": 0.04446,
+      "grad_norm": 0.6128456592559814,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 4446
+    },
+    {
+      "epoch": 0.04447,
+      "grad_norm": 0.6953614950180054,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 4447
+    },
+    {
+      "epoch": 0.04448,
+      "grad_norm": 0.6975845098495483,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 4448
+    },
+    {
+      "epoch": 0.04449,
+      "grad_norm": 0.5980611443519592,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 4449
+    },
+    {
+      "epoch": 0.0445,
+      "grad_norm": 0.5174282789230347,
+      "learning_rate": 0.003,
+      "loss": 4.0862,
+      "step": 4450
+    },
+    {
+      "epoch": 0.04451,
+      "grad_norm": 0.44481992721557617,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 4451
+    },
+    {
+      "epoch": 0.04452,
+      "grad_norm": 0.38826221227645874,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 4452
+    },
+    {
+      "epoch": 0.04453,
+      "grad_norm": 0.4051187038421631,
+      "learning_rate": 0.003,
+      "loss": 4.1021,
+      "step": 4453
+    },
+    {
+      "epoch": 0.04454,
+      "grad_norm": 0.3912207782268524,
+      "learning_rate": 0.003,
+      "loss": 4.082,
+      "step": 4454
+    },
+    {
+      "epoch": 0.04455,
+      "grad_norm": 0.35977083444595337,
+      "learning_rate": 0.003,
+      "loss": 4.0894,
+      "step": 4455
+    },
+    {
+      "epoch": 0.04456,
+      "grad_norm": 0.32243481278419495,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 4456
+    },
+    {
+      "epoch": 0.04457,
+      "grad_norm": 0.39922428131103516,
+      "learning_rate": 0.003,
+      "loss": 4.0733,
+      "step": 4457
+    },
+    {
+      "epoch": 0.04458,
+      "grad_norm": 0.43713390827178955,
+      "learning_rate": 0.003,
+      "loss": 4.0947,
+      "step": 4458
+    },
+    {
+      "epoch": 0.04459,
+      "grad_norm": 0.4921168088912964,
+      "learning_rate": 0.003,
+      "loss": 4.1015,
+      "step": 4459
+    },
+    {
+      "epoch": 0.0446,
+      "grad_norm": 0.5851754546165466,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 4460
+    },
+    {
+      "epoch": 0.04461,
+      "grad_norm": 0.7543833255767822,
+      "learning_rate": 0.003,
+      "loss": 4.1073,
+      "step": 4461
+    },
+    {
+      "epoch": 0.04462,
+      "grad_norm": 0.8701854348182678,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 4462
+    },
+    {
+      "epoch": 0.04463,
+      "grad_norm": 0.928347110748291,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 4463
+    },
+    {
+      "epoch": 0.04464,
+      "grad_norm": 0.923896312713623,
+      "learning_rate": 0.003,
+      "loss": 4.1012,
+      "step": 4464
+    },
+    {
+      "epoch": 0.04465,
+      "grad_norm": 0.9406884908676147,
+      "learning_rate": 0.003,
+      "loss": 4.094,
+      "step": 4465
+    },
+    {
+      "epoch": 0.04466,
+      "grad_norm": 0.9898063540458679,
+      "learning_rate": 0.003,
+      "loss": 4.1073,
+      "step": 4466
+    },
+    {
+      "epoch": 0.04467,
+      "grad_norm": 0.9611609578132629,
+      "learning_rate": 0.003,
+      "loss": 4.0995,
+      "step": 4467
+    },
+    {
+      "epoch": 0.04468,
+      "grad_norm": 0.7379047870635986,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 4468
+    },
+    {
+      "epoch": 0.04469,
+      "grad_norm": 0.7446435689926147,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 4469
+    },
+    {
+      "epoch": 0.0447,
+      "grad_norm": 0.7013334631919861,
+      "learning_rate": 0.003,
+      "loss": 4.105,
+      "step": 4470
+    },
+    {
+      "epoch": 0.04471,
+      "grad_norm": 0.7607833743095398,
+      "learning_rate": 0.003,
+      "loss": 4.1282,
+      "step": 4471
+    },
+    {
+      "epoch": 0.04472,
+      "grad_norm": 0.8320324420928955,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 4472
+    },
+    {
+      "epoch": 0.04473,
+      "grad_norm": 0.8373231291770935,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 4473
+    },
+    {
+      "epoch": 0.04474,
+      "grad_norm": 0.6635037660598755,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 4474
+    },
+    {
+      "epoch": 0.04475,
+      "grad_norm": 0.6026391386985779,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 4475
+    },
+    {
+      "epoch": 0.04476,
+      "grad_norm": 0.6115633249282837,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 4476
+    },
+    {
+      "epoch": 0.04477,
+      "grad_norm": 0.7230142951011658,
+      "learning_rate": 0.003,
+      "loss": 4.0963,
+      "step": 4477
+    },
+    {
+      "epoch": 0.04478,
+      "grad_norm": 0.7399475574493408,
+      "learning_rate": 0.003,
+      "loss": 4.0947,
+      "step": 4478
+    },
+    {
+      "epoch": 0.04479,
+      "grad_norm": 0.6423735022544861,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 4479
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.6744748950004578,
+      "learning_rate": 0.003,
+      "loss": 4.0837,
+      "step": 4480
+    },
+    {
+      "epoch": 0.04481,
+      "grad_norm": 0.8687645196914673,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 4481
+    },
+    {
+      "epoch": 0.04482,
+      "grad_norm": 0.9180449843406677,
+      "learning_rate": 0.003,
+      "loss": 4.1124,
+      "step": 4482
+    },
+    {
+      "epoch": 0.04483,
+      "grad_norm": 0.9672601819038391,
+      "learning_rate": 0.003,
+      "loss": 4.1022,
+      "step": 4483
+    },
+    {
+      "epoch": 0.04484,
+      "grad_norm": 0.9588479399681091,
+      "learning_rate": 0.003,
+      "loss": 4.1184,
+      "step": 4484
+    },
+    {
+      "epoch": 0.04485,
+      "grad_norm": 0.8815595507621765,
+      "learning_rate": 0.003,
+      "loss": 4.1263,
+      "step": 4485
+    },
+    {
+      "epoch": 0.04486,
+      "grad_norm": 0.7194766402244568,
+      "learning_rate": 0.003,
+      "loss": 4.0816,
+      "step": 4486
+    },
+    {
+      "epoch": 0.04487,
+      "grad_norm": 0.6366438269615173,
+      "learning_rate": 0.003,
+      "loss": 4.0901,
+      "step": 4487
+    },
+    {
+      "epoch": 0.04488,
+      "grad_norm": 0.7553848028182983,
+      "learning_rate": 0.003,
+      "loss": 4.1114,
+      "step": 4488
+    },
+    {
+      "epoch": 0.04489,
+      "grad_norm": 0.818514347076416,
+      "learning_rate": 0.003,
+      "loss": 4.116,
+      "step": 4489
+    },
+    {
+      "epoch": 0.0449,
+      "grad_norm": 0.8752609491348267,
+      "learning_rate": 0.003,
+      "loss": 4.1136,
+      "step": 4490
+    },
+    {
+      "epoch": 0.04491,
+      "grad_norm": 0.8151006698608398,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 4491
+    },
+    {
+      "epoch": 0.04492,
+      "grad_norm": 0.852803111076355,
+      "learning_rate": 0.003,
+      "loss": 4.1152,
+      "step": 4492
+    },
+    {
+      "epoch": 0.04493,
+      "grad_norm": 0.8379290699958801,
+      "learning_rate": 0.003,
+      "loss": 4.1206,
+      "step": 4493
+    },
+    {
+      "epoch": 0.04494,
+      "grad_norm": 0.7042667269706726,
+      "learning_rate": 0.003,
+      "loss": 4.0958,
+      "step": 4494
+    },
+    {
+      "epoch": 0.04495,
+      "grad_norm": 0.625234842300415,
+      "learning_rate": 0.003,
+      "loss": 4.0924,
+      "step": 4495
+    },
+    {
+      "epoch": 0.04496,
+      "grad_norm": 0.666206955909729,
+      "learning_rate": 0.003,
+      "loss": 4.131,
+      "step": 4496
+    },
+    {
+      "epoch": 0.04497,
+      "grad_norm": 0.6894402503967285,
+      "learning_rate": 0.003,
+      "loss": 4.1099,
+      "step": 4497
+    },
+    {
+      "epoch": 0.04498,
+      "grad_norm": 0.6315938234329224,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 4498
+    },
+    {
+      "epoch": 0.04499,
+      "grad_norm": 0.6740808486938477,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 4499
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.6926952600479126,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 4500
+    },
+    {
+      "epoch": 0.04501,
+      "grad_norm": 0.7954657673835754,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 4501
+    },
+    {
+      "epoch": 0.04502,
+      "grad_norm": 0.9196973443031311,
+      "learning_rate": 0.003,
+      "loss": 4.1264,
+      "step": 4502
+    },
+    {
+      "epoch": 0.04503,
+      "grad_norm": 0.7730826139450073,
+      "learning_rate": 0.003,
+      "loss": 4.1229,
+      "step": 4503
+    },
+    {
+      "epoch": 0.04504,
+      "grad_norm": 0.5750721096992493,
+      "learning_rate": 0.003,
+      "loss": 4.1155,
+      "step": 4504
+    },
+    {
+      "epoch": 0.04505,
+      "grad_norm": 0.5613157749176025,
+      "learning_rate": 0.003,
+      "loss": 4.1075,
+      "step": 4505
+    },
+    {
+      "epoch": 0.04506,
+      "grad_norm": 0.6329727172851562,
+      "learning_rate": 0.003,
+      "loss": 4.0883,
+      "step": 4506
+    },
+    {
+      "epoch": 0.04507,
+      "grad_norm": 0.5682700872421265,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 4507
+    },
+    {
+      "epoch": 0.04508,
+      "grad_norm": 0.584396481513977,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 4508
+    },
+    {
+      "epoch": 0.04509,
+      "grad_norm": 0.5912066698074341,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 4509
+    },
+    {
+      "epoch": 0.0451,
+      "grad_norm": 0.6018584966659546,
+      "learning_rate": 0.003,
+      "loss": 4.1098,
+      "step": 4510
+    },
+    {
+      "epoch": 0.04511,
+      "grad_norm": 0.5022575855255127,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 4511
+    },
+    {
+      "epoch": 0.04512,
+      "grad_norm": 0.48740530014038086,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 4512
+    },
+    {
+      "epoch": 0.04513,
+      "grad_norm": 0.5775710940361023,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 4513
+    },
+    {
+      "epoch": 0.04514,
+      "grad_norm": 0.5699050426483154,
+      "learning_rate": 0.003,
+      "loss": 4.0821,
+      "step": 4514
+    },
+    {
+      "epoch": 0.04515,
+      "grad_norm": 0.5856686234474182,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 4515
+    },
+    {
+      "epoch": 0.04516,
+      "grad_norm": 0.7566087245941162,
+      "learning_rate": 0.003,
+      "loss": 4.0915,
+      "step": 4516
+    },
+    {
+      "epoch": 0.04517,
+      "grad_norm": 0.8498972654342651,
+      "learning_rate": 0.003,
+      "loss": 4.1093,
+      "step": 4517
+    },
+    {
+      "epoch": 0.04518,
+      "grad_norm": 0.8767289519309998,
+      "learning_rate": 0.003,
+      "loss": 4.0846,
+      "step": 4518
+    },
+    {
+      "epoch": 0.04519,
+      "grad_norm": 0.8576000332832336,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 4519
+    },
+    {
+      "epoch": 0.0452,
+      "grad_norm": 0.6596296429634094,
+      "learning_rate": 0.003,
+      "loss": 4.0841,
+      "step": 4520
+    },
+    {
+      "epoch": 0.04521,
+      "grad_norm": 0.5750494599342346,
+      "learning_rate": 0.003,
+      "loss": 4.0684,
+      "step": 4521
+    },
+    {
+      "epoch": 0.04522,
+      "grad_norm": 0.6795118451118469,
+      "learning_rate": 0.003,
+      "loss": 4.0692,
+      "step": 4522
+    },
+    {
+      "epoch": 0.04523,
+      "grad_norm": 0.9297488331794739,
+      "learning_rate": 0.003,
+      "loss": 4.0791,
+      "step": 4523
+    },
+    {
+      "epoch": 0.04524,
+      "grad_norm": 0.9432056546211243,
+      "learning_rate": 0.003,
+      "loss": 4.0664,
+      "step": 4524
+    },
+    {
+      "epoch": 0.04525,
+      "grad_norm": 0.8285968899726868,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 4525
+    },
+    {
+      "epoch": 0.04526,
+      "grad_norm": 0.757490873336792,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 4526
+    },
+    {
+      "epoch": 0.04527,
+      "grad_norm": 0.8935887813568115,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 4527
+    },
+    {
+      "epoch": 0.04528,
+      "grad_norm": 0.8422304391860962,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 4528
+    },
+    {
+      "epoch": 0.04529,
+      "grad_norm": 0.7896732687950134,
+      "learning_rate": 0.003,
+      "loss": 4.0717,
+      "step": 4529
+    },
+    {
+      "epoch": 0.0453,
+      "grad_norm": 0.7451812624931335,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 4530
+    },
+    {
+      "epoch": 0.04531,
+      "grad_norm": 0.6093108654022217,
+      "learning_rate": 0.003,
+      "loss": 4.0647,
+      "step": 4531
+    },
+    {
+      "epoch": 0.04532,
+      "grad_norm": 0.61775141954422,
+      "learning_rate": 0.003,
+      "loss": 4.0927,
+      "step": 4532
+    },
+    {
+      "epoch": 0.04533,
+      "grad_norm": 0.6170506477355957,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 4533
+    },
+    {
+      "epoch": 0.04534,
+      "grad_norm": 0.7349357604980469,
+      "learning_rate": 0.003,
+      "loss": 4.0998,
+      "step": 4534
+    },
+    {
+      "epoch": 0.04535,
+      "grad_norm": 0.7496950626373291,
+      "learning_rate": 0.003,
+      "loss": 4.1216,
+      "step": 4535
+    },
+    {
+      "epoch": 0.04536,
+      "grad_norm": 0.7089481949806213,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 4536
+    },
+    {
+      "epoch": 0.04537,
+      "grad_norm": 0.7460286617279053,
+      "learning_rate": 0.003,
+      "loss": 4.0994,
+      "step": 4537
+    },
+    {
+      "epoch": 0.04538,
+      "grad_norm": 0.7358576059341431,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 4538
+    },
+    {
+      "epoch": 0.04539,
+      "grad_norm": 0.8624926805496216,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 4539
+    },
+    {
+      "epoch": 0.0454,
+      "grad_norm": 0.8797523379325867,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 4540
+    },
+    {
+      "epoch": 0.04541,
+      "grad_norm": 0.7332608103752136,
+      "learning_rate": 0.003,
+      "loss": 4.1213,
+      "step": 4541
+    },
+    {
+      "epoch": 0.04542,
+      "grad_norm": 0.6648062467575073,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 4542
+    },
+    {
+      "epoch": 0.04543,
+      "grad_norm": 0.6967626810073853,
+      "learning_rate": 0.003,
+      "loss": 4.0998,
+      "step": 4543
+    },
+    {
+      "epoch": 0.04544,
+      "grad_norm": 0.8820528984069824,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 4544
+    },
+    {
+      "epoch": 0.04545,
+      "grad_norm": 1.0358401536941528,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 4545
+    },
+    {
+      "epoch": 0.04546,
+      "grad_norm": 0.9551036357879639,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 4546
+    },
+    {
+      "epoch": 0.04547,
+      "grad_norm": 0.8078659176826477,
+      "learning_rate": 0.003,
+      "loss": 4.0859,
+      "step": 4547
+    },
+    {
+      "epoch": 0.04548,
+      "grad_norm": 0.7178813219070435,
+      "learning_rate": 0.003,
+      "loss": 4.0956,
+      "step": 4548
+    },
+    {
+      "epoch": 0.04549,
+      "grad_norm": 0.6956031918525696,
+      "learning_rate": 0.003,
+      "loss": 4.086,
+      "step": 4549
+    },
+    {
+      "epoch": 0.0455,
+      "grad_norm": 0.7719703912734985,
+      "learning_rate": 0.003,
+      "loss": 4.1175,
+      "step": 4550
+    },
+    {
+      "epoch": 0.04551,
+      "grad_norm": 0.8239540457725525,
+      "learning_rate": 0.003,
+      "loss": 4.1305,
+      "step": 4551
+    },
+    {
+      "epoch": 0.04552,
+      "grad_norm": 0.6435329914093018,
+      "learning_rate": 0.003,
+      "loss": 4.1074,
+      "step": 4552
+    },
+    {
+      "epoch": 0.04553,
+      "grad_norm": 0.624059796333313,
+      "learning_rate": 0.003,
+      "loss": 4.0936,
+      "step": 4553
+    },
+    {
+      "epoch": 0.04554,
+      "grad_norm": 0.6374339461326599,
+      "learning_rate": 0.003,
+      "loss": 4.081,
+      "step": 4554
+    },
+    {
+      "epoch": 0.04555,
+      "grad_norm": 0.6004091501235962,
+      "learning_rate": 0.003,
+      "loss": 4.0915,
+      "step": 4555
+    },
+    {
+      "epoch": 0.04556,
+      "grad_norm": 0.6625860333442688,
+      "learning_rate": 0.003,
+      "loss": 4.0942,
+      "step": 4556
+    },
+    {
+      "epoch": 0.04557,
+      "grad_norm": 0.6363006830215454,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 4557
+    },
+    {
+      "epoch": 0.04558,
+      "grad_norm": 0.5139217972755432,
+      "learning_rate": 0.003,
+      "loss": 4.102,
+      "step": 4558
+    },
+    {
+      "epoch": 0.04559,
+      "grad_norm": 0.5389745831489563,
+      "learning_rate": 0.003,
+      "loss": 4.1157,
+      "step": 4559
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.5130912065505981,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 4560
+    },
+    {
+      "epoch": 0.04561,
+      "grad_norm": 0.4410141706466675,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 4561
+    },
+    {
+      "epoch": 0.04562,
+      "grad_norm": 0.4584052860736847,
+      "learning_rate": 0.003,
+      "loss": 4.1027,
+      "step": 4562
+    },
+    {
+      "epoch": 0.04563,
+      "grad_norm": 0.4929635226726532,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 4563
+    },
+    {
+      "epoch": 0.04564,
+      "grad_norm": 0.5389020442962646,
+      "learning_rate": 0.003,
+      "loss": 4.0826,
+      "step": 4564
+    },
+    {
+      "epoch": 0.04565,
+      "grad_norm": 0.5814055800437927,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 4565
+    },
+    {
+      "epoch": 0.04566,
+      "grad_norm": 0.691055953502655,
+      "learning_rate": 0.003,
+      "loss": 4.0887,
+      "step": 4566
+    },
+    {
+      "epoch": 0.04567,
+      "grad_norm": 0.7630618810653687,
+      "learning_rate": 0.003,
+      "loss": 4.0844,
+      "step": 4567
+    },
+    {
+      "epoch": 0.04568,
+      "grad_norm": 0.6998851895332336,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 4568
+    },
+    {
+      "epoch": 0.04569,
+      "grad_norm": 0.641377866268158,
+      "learning_rate": 0.003,
+      "loss": 4.085,
+      "step": 4569
+    },
+    {
+      "epoch": 0.0457,
+      "grad_norm": 0.6208799481391907,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 4570
+    },
+    {
+      "epoch": 0.04571,
+      "grad_norm": 0.6723332405090332,
+      "learning_rate": 0.003,
+      "loss": 4.1044,
+      "step": 4571
+    },
+    {
+      "epoch": 0.04572,
+      "grad_norm": 0.7085137367248535,
+      "learning_rate": 0.003,
+      "loss": 4.0794,
+      "step": 4572
+    },
+    {
+      "epoch": 0.04573,
+      "grad_norm": 0.5809813141822815,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 4573
+    },
+    {
+      "epoch": 0.04574,
+      "grad_norm": 0.638818621635437,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 4574
+    },
+    {
+      "epoch": 0.04575,
+      "grad_norm": 0.6738321781158447,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 4575
+    },
+    {
+      "epoch": 0.04576,
+      "grad_norm": 0.6550054550170898,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 4576
+    },
+    {
+      "epoch": 0.04577,
+      "grad_norm": 0.6867873072624207,
+      "learning_rate": 0.003,
+      "loss": 4.1054,
+      "step": 4577
+    },
+    {
+      "epoch": 0.04578,
+      "grad_norm": 0.7424322962760925,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 4578
+    },
+    {
+      "epoch": 0.04579,
+      "grad_norm": 1.0748231410980225,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 4579
+    },
+    {
+      "epoch": 0.0458,
+      "grad_norm": 1.1119617223739624,
+      "learning_rate": 0.003,
+      "loss": 4.1098,
+      "step": 4580
+    },
+    {
+      "epoch": 0.04581,
+      "grad_norm": 0.816440999507904,
+      "learning_rate": 0.003,
+      "loss": 4.1143,
+      "step": 4581
+    },
+    {
+      "epoch": 0.04582,
+      "grad_norm": 0.9225358963012695,
+      "learning_rate": 0.003,
+      "loss": 4.1256,
+      "step": 4582
+    },
+    {
+      "epoch": 0.04583,
+      "grad_norm": 1.3155148029327393,
+      "learning_rate": 0.003,
+      "loss": 4.11,
+      "step": 4583
+    },
+    {
+      "epoch": 0.04584,
+      "grad_norm": 0.9258817434310913,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 4584
+    },
+    {
+      "epoch": 0.04585,
+      "grad_norm": 1.0143473148345947,
+      "learning_rate": 0.003,
+      "loss": 4.1076,
+      "step": 4585
+    },
+    {
+      "epoch": 0.04586,
+      "grad_norm": 0.9881197214126587,
+      "learning_rate": 0.003,
+      "loss": 4.1142,
+      "step": 4586
+    },
+    {
+      "epoch": 0.04587,
+      "grad_norm": 0.8126780390739441,
+      "learning_rate": 0.003,
+      "loss": 4.1266,
+      "step": 4587
+    },
+    {
+      "epoch": 0.04588,
+      "grad_norm": 0.7166712880134583,
+      "learning_rate": 0.003,
+      "loss": 4.1214,
+      "step": 4588
+    },
+    {
+      "epoch": 0.04589,
+      "grad_norm": 0.652772068977356,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 4589
+    },
+    {
+      "epoch": 0.0459,
+      "grad_norm": 0.671996533870697,
+      "learning_rate": 0.003,
+      "loss": 4.1087,
+      "step": 4590
+    },
+    {
+      "epoch": 0.04591,
+      "grad_norm": 0.7832691669464111,
+      "learning_rate": 0.003,
+      "loss": 4.0765,
+      "step": 4591
+    },
+    {
+      "epoch": 0.04592,
+      "grad_norm": 0.861909031867981,
+      "learning_rate": 0.003,
+      "loss": 4.1029,
+      "step": 4592
+    },
+    {
+      "epoch": 0.04593,
+      "grad_norm": 0.8999555706977844,
+      "learning_rate": 0.003,
+      "loss": 4.1338,
+      "step": 4593
+    },
+    {
+      "epoch": 0.04594,
+      "grad_norm": 0.7495047450065613,
+      "learning_rate": 0.003,
+      "loss": 4.1199,
+      "step": 4594
+    },
+    {
+      "epoch": 0.04595,
+      "grad_norm": 0.6235268712043762,
+      "learning_rate": 0.003,
+      "loss": 4.1032,
+      "step": 4595
+    },
+    {
+      "epoch": 0.04596,
+      "grad_norm": 0.6757757067680359,
+      "learning_rate": 0.003,
+      "loss": 4.1047,
+      "step": 4596
+    },
+    {
+      "epoch": 0.04597,
+      "grad_norm": 0.7834587693214417,
+      "learning_rate": 0.003,
+      "loss": 4.1098,
+      "step": 4597
+    },
+    {
+      "epoch": 0.04598,
+      "grad_norm": 0.7589462399482727,
+      "learning_rate": 0.003,
+      "loss": 4.1431,
+      "step": 4598
+    },
+    {
+      "epoch": 0.04599,
+      "grad_norm": 0.7294259667396545,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 4599
+    },
+    {
+      "epoch": 0.046,
+      "grad_norm": 0.5923947095870972,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 4600
+    },
+    {
+      "epoch": 0.04601,
+      "grad_norm": 0.5730067491531372,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 4601
+    },
+    {
+      "epoch": 0.04602,
+      "grad_norm": 0.5987418293952942,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 4602
+    },
+    {
+      "epoch": 0.04603,
+      "grad_norm": 0.6320177912712097,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 4603
+    },
+    {
+      "epoch": 0.04604,
+      "grad_norm": 0.6222275495529175,
+      "learning_rate": 0.003,
+      "loss": 4.0985,
+      "step": 4604
+    },
+    {
+      "epoch": 0.04605,
+      "grad_norm": 0.5284239053726196,
+      "learning_rate": 0.003,
+      "loss": 4.1079,
+      "step": 4605
+    },
+    {
+      "epoch": 0.04606,
+      "grad_norm": 0.3622366189956665,
+      "learning_rate": 0.003,
+      "loss": 4.0661,
+      "step": 4606
+    },
+    {
+      "epoch": 0.04607,
+      "grad_norm": 0.41870078444480896,
+      "learning_rate": 0.003,
+      "loss": 4.0751,
+      "step": 4607
+    },
+    {
+      "epoch": 0.04608,
+      "grad_norm": 0.4503072500228882,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 4608
+    },
+    {
+      "epoch": 0.04609,
+      "grad_norm": 0.4834275245666504,
+      "learning_rate": 0.003,
+      "loss": 4.0801,
+      "step": 4609
+    },
+    {
+      "epoch": 0.0461,
+      "grad_norm": 0.543554961681366,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 4610
+    },
+    {
+      "epoch": 0.04611,
+      "grad_norm": 0.6071869134902954,
+      "learning_rate": 0.003,
+      "loss": 4.1102,
+      "step": 4611
+    },
+    {
+      "epoch": 0.04612,
+      "grad_norm": 0.6595013737678528,
+      "learning_rate": 0.003,
+      "loss": 4.0935,
+      "step": 4612
+    },
+    {
+      "epoch": 0.04613,
+      "grad_norm": 0.6050468683242798,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 4613
+    },
+    {
+      "epoch": 0.04614,
+      "grad_norm": 0.5637344717979431,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 4614
+    },
+    {
+      "epoch": 0.04615,
+      "grad_norm": 0.5728361010551453,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 4615
+    },
+    {
+      "epoch": 0.04616,
+      "grad_norm": 0.6123324036598206,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 4616
+    },
+    {
+      "epoch": 0.04617,
+      "grad_norm": 0.6856390833854675,
+      "learning_rate": 0.003,
+      "loss": 4.0876,
+      "step": 4617
+    },
+    {
+      "epoch": 0.04618,
+      "grad_norm": 0.7527903318405151,
+      "learning_rate": 0.003,
+      "loss": 4.0757,
+      "step": 4618
+    },
+    {
+      "epoch": 0.04619,
+      "grad_norm": 0.8059439659118652,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 4619
+    },
+    {
+      "epoch": 0.0462,
+      "grad_norm": 0.925609290599823,
+      "learning_rate": 0.003,
+      "loss": 4.1113,
+      "step": 4620
+    },
+    {
+      "epoch": 0.04621,
+      "grad_norm": 1.0445528030395508,
+      "learning_rate": 0.003,
+      "loss": 4.1238,
+      "step": 4621
+    },
+    {
+      "epoch": 0.04622,
+      "grad_norm": 1.04862642288208,
+      "learning_rate": 0.003,
+      "loss": 4.1261,
+      "step": 4622
+    },
+    {
+      "epoch": 0.04623,
+      "grad_norm": 0.8834772706031799,
+      "learning_rate": 0.003,
+      "loss": 4.1049,
+      "step": 4623
+    },
+    {
+      "epoch": 0.04624,
+      "grad_norm": 0.9410740733146667,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 4624
+    },
+    {
+      "epoch": 0.04625,
+      "grad_norm": 0.6689598560333252,
+      "learning_rate": 0.003,
+      "loss": 4.0991,
+      "step": 4625
+    },
+    {
+      "epoch": 0.04626,
+      "grad_norm": 0.6782405376434326,
+      "learning_rate": 0.003,
+      "loss": 4.1038,
+      "step": 4626
+    },
+    {
+      "epoch": 0.04627,
+      "grad_norm": 0.5961948037147522,
+      "learning_rate": 0.003,
+      "loss": 4.0851,
+      "step": 4627
+    },
+    {
+      "epoch": 0.04628,
+      "grad_norm": 0.8130809664726257,
+      "learning_rate": 0.003,
+      "loss": 4.1164,
+      "step": 4628
+    },
+    {
+      "epoch": 0.04629,
+      "grad_norm": 0.8763521313667297,
+      "learning_rate": 0.003,
+      "loss": 4.0941,
+      "step": 4629
+    },
+    {
+      "epoch": 0.0463,
+      "grad_norm": 0.911468505859375,
+      "learning_rate": 0.003,
+      "loss": 4.112,
+      "step": 4630
+    },
+    {
+      "epoch": 0.04631,
+      "grad_norm": 1.0289772748947144,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 4631
+    },
+    {
+      "epoch": 0.04632,
+      "grad_norm": 0.940280556678772,
+      "learning_rate": 0.003,
+      "loss": 4.0895,
+      "step": 4632
+    },
+    {
+      "epoch": 0.04633,
+      "grad_norm": 0.7662308812141418,
+      "learning_rate": 0.003,
+      "loss": 4.0915,
+      "step": 4633
+    },
+    {
+      "epoch": 0.04634,
+      "grad_norm": 0.7879605889320374,
+      "learning_rate": 0.003,
+      "loss": 4.0936,
+      "step": 4634
+    },
+    {
+      "epoch": 0.04635,
+      "grad_norm": 0.8211584687232971,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 4635
+    },
+    {
+      "epoch": 0.04636,
+      "grad_norm": 0.8329793810844421,
+      "learning_rate": 0.003,
+      "loss": 4.1345,
+      "step": 4636
+    },
+    {
+      "epoch": 0.04637,
+      "grad_norm": 0.7591750621795654,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 4637
+    },
+    {
+      "epoch": 0.04638,
+      "grad_norm": 0.8332642912864685,
+      "learning_rate": 0.003,
+      "loss": 4.1152,
+      "step": 4638
+    },
+    {
+      "epoch": 0.04639,
+      "grad_norm": 0.9349274039268494,
+      "learning_rate": 0.003,
+      "loss": 4.087,
+      "step": 4639
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 1.0191375017166138,
+      "learning_rate": 0.003,
+      "loss": 4.1145,
+      "step": 4640
+    },
+    {
+      "epoch": 0.04641,
+      "grad_norm": 0.9005967974662781,
+      "learning_rate": 0.003,
+      "loss": 4.1264,
+      "step": 4641
+    },
+    {
+      "epoch": 0.04642,
+      "grad_norm": 0.7704114317893982,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 4642
+    },
+    {
+      "epoch": 0.04643,
+      "grad_norm": 0.6552042365074158,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 4643
+    },
+    {
+      "epoch": 0.04644,
+      "grad_norm": 0.5903872847557068,
+      "learning_rate": 0.003,
+      "loss": 4.0939,
+      "step": 4644
+    },
+    {
+      "epoch": 0.04645,
+      "grad_norm": 0.5108548998832703,
+      "learning_rate": 0.003,
+      "loss": 4.0824,
+      "step": 4645
+    },
+    {
+      "epoch": 0.04646,
+      "grad_norm": 0.5088199973106384,
+      "learning_rate": 0.003,
+      "loss": 4.0837,
+      "step": 4646
+    },
+    {
+      "epoch": 0.04647,
+      "grad_norm": 0.5300918221473694,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 4647
+    },
+    {
+      "epoch": 0.04648,
+      "grad_norm": 0.4461018443107605,
+      "learning_rate": 0.003,
+      "loss": 4.0942,
+      "step": 4648
+    },
+    {
+      "epoch": 0.04649,
+      "grad_norm": 0.4164982736110687,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 4649
+    },
+    {
+      "epoch": 0.0465,
+      "grad_norm": 0.4256885349750519,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 4650
+    },
+    {
+      "epoch": 0.04651,
+      "grad_norm": 0.4963768720626831,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 4651
+    },
+    {
+      "epoch": 0.04652,
+      "grad_norm": 0.6006476879119873,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4652
+    },
+    {
+      "epoch": 0.04653,
+      "grad_norm": 0.6713421940803528,
+      "learning_rate": 0.003,
+      "loss": 4.0889,
+      "step": 4653
+    },
+    {
+      "epoch": 0.04654,
+      "grad_norm": 0.7249282002449036,
+      "learning_rate": 0.003,
+      "loss": 4.0861,
+      "step": 4654
+    },
+    {
+      "epoch": 0.04655,
+      "grad_norm": 0.6726330518722534,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 4655
+    },
+    {
+      "epoch": 0.04656,
+      "grad_norm": 0.536870002746582,
+      "learning_rate": 0.003,
+      "loss": 4.0603,
+      "step": 4656
+    },
+    {
+      "epoch": 0.04657,
+      "grad_norm": 0.5610942244529724,
+      "learning_rate": 0.003,
+      "loss": 4.0979,
+      "step": 4657
+    },
+    {
+      "epoch": 0.04658,
+      "grad_norm": 0.7012815475463867,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 4658
+    },
+    {
+      "epoch": 0.04659,
+      "grad_norm": 0.7252200245857239,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 4659
+    },
+    {
+      "epoch": 0.0466,
+      "grad_norm": 0.6837133765220642,
+      "learning_rate": 0.003,
+      "loss": 4.0979,
+      "step": 4660
+    },
+    {
+      "epoch": 0.04661,
+      "grad_norm": 0.6772074103355408,
+      "learning_rate": 0.003,
+      "loss": 4.0811,
+      "step": 4661
+    },
+    {
+      "epoch": 0.04662,
+      "grad_norm": 0.6671146154403687,
+      "learning_rate": 0.003,
+      "loss": 4.0668,
+      "step": 4662
+    },
+    {
+      "epoch": 0.04663,
+      "grad_norm": 0.6181881427764893,
+      "learning_rate": 0.003,
+      "loss": 4.0871,
+      "step": 4663
+    },
+    {
+      "epoch": 0.04664,
+      "grad_norm": 0.6068332195281982,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 4664
+    },
+    {
+      "epoch": 0.04665,
+      "grad_norm": 0.5939993858337402,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 4665
+    },
+    {
+      "epoch": 0.04666,
+      "grad_norm": 0.6058046221733093,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 4666
+    },
+    {
+      "epoch": 0.04667,
+      "grad_norm": 0.6686825752258301,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 4667
+    },
+    {
+      "epoch": 0.04668,
+      "grad_norm": 0.7094378471374512,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 4668
+    },
+    {
+      "epoch": 0.04669,
+      "grad_norm": 0.8139551281929016,
+      "learning_rate": 0.003,
+      "loss": 4.098,
+      "step": 4669
+    },
+    {
+      "epoch": 0.0467,
+      "grad_norm": 1.0420807600021362,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 4670
+    },
+    {
+      "epoch": 0.04671,
+      "grad_norm": 1.1174323558807373,
+      "learning_rate": 0.003,
+      "loss": 4.0918,
+      "step": 4671
+    },
+    {
+      "epoch": 0.04672,
+      "grad_norm": 0.8469231724739075,
+      "learning_rate": 0.003,
+      "loss": 4.1005,
+      "step": 4672
+    },
+    {
+      "epoch": 0.04673,
+      "grad_norm": 0.8073984980583191,
+      "learning_rate": 0.003,
+      "loss": 4.1079,
+      "step": 4673
+    },
+    {
+      "epoch": 0.04674,
+      "grad_norm": 0.76691734790802,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 4674
+    },
+    {
+      "epoch": 0.04675,
+      "grad_norm": 0.8686924576759338,
+      "learning_rate": 0.003,
+      "loss": 4.1028,
+      "step": 4675
+    },
+    {
+      "epoch": 0.04676,
+      "grad_norm": 1.0538427829742432,
+      "learning_rate": 0.003,
+      "loss": 4.1144,
+      "step": 4676
+    },
+    {
+      "epoch": 0.04677,
+      "grad_norm": 1.003646731376648,
+      "learning_rate": 0.003,
+      "loss": 4.1167,
+      "step": 4677
+    },
+    {
+      "epoch": 0.04678,
+      "grad_norm": 0.7191857695579529,
+      "learning_rate": 0.003,
+      "loss": 4.0671,
+      "step": 4678
+    },
+    {
+      "epoch": 0.04679,
+      "grad_norm": 0.6493638753890991,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 4679
+    },
+    {
+      "epoch": 0.0468,
+      "grad_norm": 0.6600551605224609,
+      "learning_rate": 0.003,
+      "loss": 4.0645,
+      "step": 4680
+    },
+    {
+      "epoch": 0.04681,
+      "grad_norm": 0.7209051251411438,
+      "learning_rate": 0.003,
+      "loss": 4.1177,
+      "step": 4681
+    },
+    {
+      "epoch": 0.04682,
+      "grad_norm": 0.8192289471626282,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 4682
+    },
+    {
+      "epoch": 0.04683,
+      "grad_norm": 0.7203931212425232,
+      "learning_rate": 0.003,
+      "loss": 4.0776,
+      "step": 4683
+    },
+    {
+      "epoch": 0.04684,
+      "grad_norm": 0.6532163023948669,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 4684
+    },
+    {
+      "epoch": 0.04685,
+      "grad_norm": 0.6800267100334167,
+      "learning_rate": 0.003,
+      "loss": 4.0821,
+      "step": 4685
+    },
+    {
+      "epoch": 0.04686,
+      "grad_norm": 0.6990789771080017,
+      "learning_rate": 0.003,
+      "loss": 4.0892,
+      "step": 4686
+    },
+    {
+      "epoch": 0.04687,
+      "grad_norm": 0.6912499666213989,
+      "learning_rate": 0.003,
+      "loss": 4.119,
+      "step": 4687
+    },
+    {
+      "epoch": 0.04688,
+      "grad_norm": 0.7285837531089783,
+      "learning_rate": 0.003,
+      "loss": 4.1009,
+      "step": 4688
+    },
+    {
+      "epoch": 0.04689,
+      "grad_norm": 0.6683054566383362,
+      "learning_rate": 0.003,
+      "loss": 4.1139,
+      "step": 4689
+    },
+    {
+      "epoch": 0.0469,
+      "grad_norm": 0.5229889154434204,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 4690
+    },
+    {
+      "epoch": 0.04691,
+      "grad_norm": 0.49395525455474854,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 4691
+    },
+    {
+      "epoch": 0.04692,
+      "grad_norm": 0.46230220794677734,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 4692
+    },
+    {
+      "epoch": 0.04693,
+      "grad_norm": 0.4435535967350006,
+      "learning_rate": 0.003,
+      "loss": 4.1002,
+      "step": 4693
+    },
+    {
+      "epoch": 0.04694,
+      "grad_norm": 0.43420031666755676,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 4694
+    },
+    {
+      "epoch": 0.04695,
+      "grad_norm": 0.4660513699054718,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 4695
+    },
+    {
+      "epoch": 0.04696,
+      "grad_norm": 0.5974222421646118,
+      "learning_rate": 0.003,
+      "loss": 4.0964,
+      "step": 4696
+    },
+    {
+      "epoch": 0.04697,
+      "grad_norm": 0.7868569493293762,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 4697
+    },
+    {
+      "epoch": 0.04698,
+      "grad_norm": 0.9865483641624451,
+      "learning_rate": 0.003,
+      "loss": 4.0933,
+      "step": 4698
+    },
+    {
+      "epoch": 0.04699,
+      "grad_norm": 0.9736619591712952,
+      "learning_rate": 0.003,
+      "loss": 4.0855,
+      "step": 4699
+    },
+    {
+      "epoch": 0.047,
+      "grad_norm": 0.7273622155189514,
+      "learning_rate": 0.003,
+      "loss": 4.0936,
+      "step": 4700
+    },
+    {
+      "epoch": 0.04701,
+      "grad_norm": 0.6724737882614136,
+      "learning_rate": 0.003,
+      "loss": 4.1003,
+      "step": 4701
+    },
+    {
+      "epoch": 0.04702,
+      "grad_norm": 0.8422785997390747,
+      "learning_rate": 0.003,
+      "loss": 4.0972,
+      "step": 4702
+    },
+    {
+      "epoch": 0.04703,
+      "grad_norm": 0.743380606174469,
+      "learning_rate": 0.003,
+      "loss": 4.081,
+      "step": 4703
+    },
+    {
+      "epoch": 0.04704,
+      "grad_norm": 0.6584486365318298,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 4704
+    },
+    {
+      "epoch": 0.04705,
+      "grad_norm": 0.6731492877006531,
+      "learning_rate": 0.003,
+      "loss": 4.1078,
+      "step": 4705
+    },
+    {
+      "epoch": 0.04706,
+      "grad_norm": 0.6525211334228516,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 4706
+    },
+    {
+      "epoch": 0.04707,
+      "grad_norm": 0.6952141523361206,
+      "learning_rate": 0.003,
+      "loss": 4.091,
+      "step": 4707
+    },
+    {
+      "epoch": 0.04708,
+      "grad_norm": 0.6215411424636841,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 4708
+    },
+    {
+      "epoch": 0.04709,
+      "grad_norm": 0.5248569250106812,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 4709
+    },
+    {
+      "epoch": 0.0471,
+      "grad_norm": 0.516987681388855,
+      "learning_rate": 0.003,
+      "loss": 4.0854,
+      "step": 4710
+    },
+    {
+      "epoch": 0.04711,
+      "grad_norm": 0.513450562953949,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 4711
+    },
+    {
+      "epoch": 0.04712,
+      "grad_norm": 0.5270048975944519,
+      "learning_rate": 0.003,
+      "loss": 4.0696,
+      "step": 4712
+    },
+    {
+      "epoch": 0.04713,
+      "grad_norm": 0.5057416558265686,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 4713
+    },
+    {
+      "epoch": 0.04714,
+      "grad_norm": 0.5090026259422302,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 4714
+    },
+    {
+      "epoch": 0.04715,
+      "grad_norm": 0.5532310009002686,
+      "learning_rate": 0.003,
+      "loss": 4.0995,
+      "step": 4715
+    },
+    {
+      "epoch": 0.04716,
+      "grad_norm": 0.6358587145805359,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 4716
+    },
+    {
+      "epoch": 0.04717,
+      "grad_norm": 0.7907416224479675,
+      "learning_rate": 0.003,
+      "loss": 4.0975,
+      "step": 4717
+    },
+    {
+      "epoch": 0.04718,
+      "grad_norm": 0.845724880695343,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 4718
+    },
+    {
+      "epoch": 0.04719,
+      "grad_norm": 0.8444932699203491,
+      "learning_rate": 0.003,
+      "loss": 4.0736,
+      "step": 4719
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.8913478255271912,
+      "learning_rate": 0.003,
+      "loss": 4.1054,
+      "step": 4720
+    },
+    {
+      "epoch": 0.04721,
+      "grad_norm": 0.9782090187072754,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 4721
+    },
+    {
+      "epoch": 0.04722,
+      "grad_norm": 1.1551313400268555,
+      "learning_rate": 0.003,
+      "loss": 4.1066,
+      "step": 4722
+    },
+    {
+      "epoch": 0.04723,
+      "grad_norm": 1.027269721031189,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 4723
+    },
+    {
+      "epoch": 0.04724,
+      "grad_norm": 0.9385249018669128,
+      "learning_rate": 0.003,
+      "loss": 4.0926,
+      "step": 4724
+    },
+    {
+      "epoch": 0.04725,
+      "grad_norm": 1.0505073070526123,
+      "learning_rate": 0.003,
+      "loss": 4.1162,
+      "step": 4725
+    },
+    {
+      "epoch": 0.04726,
+      "grad_norm": 1.0679757595062256,
+      "learning_rate": 0.003,
+      "loss": 4.1095,
+      "step": 4726
+    },
+    {
+      "epoch": 0.04727,
+      "grad_norm": 0.9189696907997131,
+      "learning_rate": 0.003,
+      "loss": 4.092,
+      "step": 4727
+    },
+    {
+      "epoch": 0.04728,
+      "grad_norm": 0.9319504499435425,
+      "learning_rate": 0.003,
+      "loss": 4.0993,
+      "step": 4728
+    },
+    {
+      "epoch": 0.04729,
+      "grad_norm": 0.9617239832878113,
+      "learning_rate": 0.003,
+      "loss": 4.1212,
+      "step": 4729
+    },
+    {
+      "epoch": 0.0473,
+      "grad_norm": 0.8712969422340393,
+      "learning_rate": 0.003,
+      "loss": 4.1155,
+      "step": 4730
+    },
+    {
+      "epoch": 0.04731,
+      "grad_norm": 0.8123112320899963,
+      "learning_rate": 0.003,
+      "loss": 4.1049,
+      "step": 4731
+    },
+    {
+      "epoch": 0.04732,
+      "grad_norm": 0.7513419985771179,
+      "learning_rate": 0.003,
+      "loss": 4.1167,
+      "step": 4732
+    },
+    {
+      "epoch": 0.04733,
+      "grad_norm": 0.7118027210235596,
+      "learning_rate": 0.003,
+      "loss": 4.0977,
+      "step": 4733
+    },
+    {
+      "epoch": 0.04734,
+      "grad_norm": 0.6488009691238403,
+      "learning_rate": 0.003,
+      "loss": 4.1033,
+      "step": 4734
+    },
+    {
+      "epoch": 0.04735,
+      "grad_norm": 0.613771378993988,
+      "learning_rate": 0.003,
+      "loss": 4.1174,
+      "step": 4735
+    },
+    {
+      "epoch": 0.04736,
+      "grad_norm": 0.6943199038505554,
+      "learning_rate": 0.003,
+      "loss": 4.104,
+      "step": 4736
+    },
+    {
+      "epoch": 0.04737,
+      "grad_norm": 0.9270548224449158,
+      "learning_rate": 0.003,
+      "loss": 4.0737,
+      "step": 4737
+    },
+    {
+      "epoch": 0.04738,
+      "grad_norm": 1.0554243326187134,
+      "learning_rate": 0.003,
+      "loss": 4.1232,
+      "step": 4738
+    },
+    {
+      "epoch": 0.04739,
+      "grad_norm": 1.0856826305389404,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 4739
+    },
+    {
+      "epoch": 0.0474,
+      "grad_norm": 0.664472758769989,
+      "learning_rate": 0.003,
+      "loss": 4.118,
+      "step": 4740
+    },
+    {
+      "epoch": 0.04741,
+      "grad_norm": 0.576934814453125,
+      "learning_rate": 0.003,
+      "loss": 4.1056,
+      "step": 4741
+    },
+    {
+      "epoch": 0.04742,
+      "grad_norm": 0.6367508172988892,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4742
+    },
+    {
+      "epoch": 0.04743,
+      "grad_norm": 0.669205904006958,
+      "learning_rate": 0.003,
+      "loss": 4.1085,
+      "step": 4743
+    },
+    {
+      "epoch": 0.04744,
+      "grad_norm": 0.67984938621521,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 4744
+    },
+    {
+      "epoch": 0.04745,
+      "grad_norm": 0.6410228610038757,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 4745
+    },
+    {
+      "epoch": 0.04746,
+      "grad_norm": 0.6202641725540161,
+      "learning_rate": 0.003,
+      "loss": 4.0876,
+      "step": 4746
+    },
+    {
+      "epoch": 0.04747,
+      "grad_norm": 0.6701365113258362,
+      "learning_rate": 0.003,
+      "loss": 4.0637,
+      "step": 4747
+    },
+    {
+      "epoch": 0.04748,
+      "grad_norm": 0.5763520002365112,
+      "learning_rate": 0.003,
+      "loss": 4.1103,
+      "step": 4748
+    },
+    {
+      "epoch": 0.04749,
+      "grad_norm": 0.4907684326171875,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 4749
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.507577121257782,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 4750
+    },
+    {
+      "epoch": 0.04751,
+      "grad_norm": 0.47070327401161194,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 4751
+    },
+    {
+      "epoch": 0.04752,
+      "grad_norm": 0.37678685784339905,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 4752
+    },
+    {
+      "epoch": 0.04753,
+      "grad_norm": 0.3762926459312439,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 4753
+    },
+    {
+      "epoch": 0.04754,
+      "grad_norm": 0.38185784220695496,
+      "learning_rate": 0.003,
+      "loss": 4.0935,
+      "step": 4754
+    },
+    {
+      "epoch": 0.04755,
+      "grad_norm": 0.39762863516807556,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 4755
+    },
+    {
+      "epoch": 0.04756,
+      "grad_norm": 0.42808109521865845,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 4756
+    },
+    {
+      "epoch": 0.04757,
+      "grad_norm": 0.4888444244861603,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 4757
+    },
+    {
+      "epoch": 0.04758,
+      "grad_norm": 0.6120839715003967,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 4758
+    },
+    {
+      "epoch": 0.04759,
+      "grad_norm": 0.7273629307746887,
+      "learning_rate": 0.003,
+      "loss": 4.0793,
+      "step": 4759
+    },
+    {
+      "epoch": 0.0476,
+      "grad_norm": 0.7623408436775208,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 4760
+    },
+    {
+      "epoch": 0.04761,
+      "grad_norm": 0.7425439357757568,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 4761
+    },
+    {
+      "epoch": 0.04762,
+      "grad_norm": 0.6609148383140564,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 4762
+    },
+    {
+      "epoch": 0.04763,
+      "grad_norm": 0.5869941711425781,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 4763
+    },
+    {
+      "epoch": 0.04764,
+      "grad_norm": 0.6543363928794861,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 4764
+    },
+    {
+      "epoch": 0.04765,
+      "grad_norm": 0.6935657858848572,
+      "learning_rate": 0.003,
+      "loss": 4.1065,
+      "step": 4765
+    },
+    {
+      "epoch": 0.04766,
+      "grad_norm": 0.6691122651100159,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 4766
+    },
+    {
+      "epoch": 0.04767,
+      "grad_norm": 0.7805852890014648,
+      "learning_rate": 0.003,
+      "loss": 4.0838,
+      "step": 4767
+    },
+    {
+      "epoch": 0.04768,
+      "grad_norm": 0.8912307024002075,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 4768
+    },
+    {
+      "epoch": 0.04769,
+      "grad_norm": 1.0201588869094849,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 4769
+    },
+    {
+      "epoch": 0.0477,
+      "grad_norm": 1.0079450607299805,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 4770
+    },
+    {
+      "epoch": 0.04771,
+      "grad_norm": 0.9444756507873535,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 4771
+    },
+    {
+      "epoch": 0.04772,
+      "grad_norm": 0.9170507788658142,
+      "learning_rate": 0.003,
+      "loss": 4.1,
+      "step": 4772
+    },
+    {
+      "epoch": 0.04773,
+      "grad_norm": 0.7705607414245605,
+      "learning_rate": 0.003,
+      "loss": 4.1188,
+      "step": 4773
+    },
+    {
+      "epoch": 0.04774,
+      "grad_norm": 0.7739026546478271,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 4774
+    },
+    {
+      "epoch": 0.04775,
+      "grad_norm": 0.8026602268218994,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 4775
+    },
+    {
+      "epoch": 0.04776,
+      "grad_norm": 0.9427769780158997,
+      "learning_rate": 0.003,
+      "loss": 4.0921,
+      "step": 4776
+    },
+    {
+      "epoch": 0.04777,
+      "grad_norm": 1.033246397972107,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 4777
+    },
+    {
+      "epoch": 0.04778,
+      "grad_norm": 0.9891014695167542,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 4778
+    },
+    {
+      "epoch": 0.04779,
+      "grad_norm": 0.8989018201828003,
+      "learning_rate": 0.003,
+      "loss": 4.1152,
+      "step": 4779
+    },
+    {
+      "epoch": 0.0478,
+      "grad_norm": 0.7684260010719299,
+      "learning_rate": 0.003,
+      "loss": 4.1128,
+      "step": 4780
+    },
+    {
+      "epoch": 0.04781,
+      "grad_norm": 0.6779473423957825,
+      "learning_rate": 0.003,
+      "loss": 4.0838,
+      "step": 4781
+    },
+    {
+      "epoch": 0.04782,
+      "grad_norm": 0.7075977325439453,
+      "learning_rate": 0.003,
+      "loss": 4.0866,
+      "step": 4782
+    },
+    {
+      "epoch": 0.04783,
+      "grad_norm": 0.7358579635620117,
+      "learning_rate": 0.003,
+      "loss": 4.082,
+      "step": 4783
+    },
+    {
+      "epoch": 0.04784,
+      "grad_norm": 0.8492463231086731,
+      "learning_rate": 0.003,
+      "loss": 4.1067,
+      "step": 4784
+    },
+    {
+      "epoch": 0.04785,
+      "grad_norm": 1.0335021018981934,
+      "learning_rate": 0.003,
+      "loss": 4.1135,
+      "step": 4785
+    },
+    {
+      "epoch": 0.04786,
+      "grad_norm": 1.014879584312439,
+      "learning_rate": 0.003,
+      "loss": 4.1202,
+      "step": 4786
+    },
+    {
+      "epoch": 0.04787,
+      "grad_norm": 0.7597166299819946,
+      "learning_rate": 0.003,
+      "loss": 4.1063,
+      "step": 4787
+    },
+    {
+      "epoch": 0.04788,
+      "grad_norm": 0.7490633130073547,
+      "learning_rate": 0.003,
+      "loss": 4.0916,
+      "step": 4788
+    },
+    {
+      "epoch": 0.04789,
+      "grad_norm": 0.7627069354057312,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 4789
+    },
+    {
+      "epoch": 0.0479,
+      "grad_norm": 0.7988529801368713,
+      "learning_rate": 0.003,
+      "loss": 4.0913,
+      "step": 4790
+    },
+    {
+      "epoch": 0.04791,
+      "grad_norm": 0.7073328495025635,
+      "learning_rate": 0.003,
+      "loss": 4.1077,
+      "step": 4791
+    },
+    {
+      "epoch": 0.04792,
+      "grad_norm": 0.5642073750495911,
+      "learning_rate": 0.003,
+      "loss": 4.1104,
+      "step": 4792
+    },
+    {
+      "epoch": 0.04793,
+      "grad_norm": 0.4884117543697357,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 4793
+    },
+    {
+      "epoch": 0.04794,
+      "grad_norm": 0.5512744188308716,
+      "learning_rate": 0.003,
+      "loss": 4.0904,
+      "step": 4794
+    },
+    {
+      "epoch": 0.04795,
+      "grad_norm": 0.6215276718139648,
+      "learning_rate": 0.003,
+      "loss": 4.0903,
+      "step": 4795
+    },
+    {
+      "epoch": 0.04796,
+      "grad_norm": 0.5437957644462585,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 4796
+    },
+    {
+      "epoch": 0.04797,
+      "grad_norm": 0.475530207157135,
+      "learning_rate": 0.003,
+      "loss": 4.0914,
+      "step": 4797
+    },
+    {
+      "epoch": 0.04798,
+      "grad_norm": 0.4487910866737366,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 4798
+    },
+    {
+      "epoch": 0.04799,
+      "grad_norm": 0.4571172297000885,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 4799
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.4549039602279663,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 4800
+    },
+    {
+      "epoch": 0.04801,
+      "grad_norm": 0.4293798804283142,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 4801
+    },
+    {
+      "epoch": 0.04802,
+      "grad_norm": 0.3992469012737274,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 4802
+    },
+    {
+      "epoch": 0.04803,
+      "grad_norm": 0.39434272050857544,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 4803
+    },
+    {
+      "epoch": 0.04804,
+      "grad_norm": 0.4348972737789154,
+      "learning_rate": 0.003,
+      "loss": 4.0849,
+      "step": 4804
+    },
+    {
+      "epoch": 0.04805,
+      "grad_norm": 0.4008714556694031,
+      "learning_rate": 0.003,
+      "loss": 4.086,
+      "step": 4805
+    },
+    {
+      "epoch": 0.04806,
+      "grad_norm": 0.4113296568393707,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 4806
+    },
+    {
+      "epoch": 0.04807,
+      "grad_norm": 0.4709857106208801,
+      "learning_rate": 0.003,
+      "loss": 4.0863,
+      "step": 4807
+    },
+    {
+      "epoch": 0.04808,
+      "grad_norm": 0.6104065179824829,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 4808
+    },
+    {
+      "epoch": 0.04809,
+      "grad_norm": 0.9111732244491577,
+      "learning_rate": 0.003,
+      "loss": 4.0666,
+      "step": 4809
+    },
+    {
+      "epoch": 0.0481,
+      "grad_norm": 1.2555900812149048,
+      "learning_rate": 0.003,
+      "loss": 4.1023,
+      "step": 4810
+    },
+    {
+      "epoch": 0.04811,
+      "grad_norm": 0.6506162881851196,
+      "learning_rate": 0.003,
+      "loss": 4.0844,
+      "step": 4811
+    },
+    {
+      "epoch": 0.04812,
+      "grad_norm": 0.5703248977661133,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 4812
+    },
+    {
+      "epoch": 0.04813,
+      "grad_norm": 0.7647963166236877,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 4813
+    },
+    {
+      "epoch": 0.04814,
+      "grad_norm": 0.8564413189888,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 4814
+    },
+    {
+      "epoch": 0.04815,
+      "grad_norm": 0.8082033395767212,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 4815
+    },
+    {
+      "epoch": 0.04816,
+      "grad_norm": 0.6857293248176575,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 4816
+    },
+    {
+      "epoch": 0.04817,
+      "grad_norm": 0.5996972918510437,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 4817
+    },
+    {
+      "epoch": 0.04818,
+      "grad_norm": 0.6385634541511536,
+      "learning_rate": 0.003,
+      "loss": 4.0711,
+      "step": 4818
+    },
+    {
+      "epoch": 0.04819,
+      "grad_norm": 0.6788516044616699,
+      "learning_rate": 0.003,
+      "loss": 4.0765,
+      "step": 4819
+    },
+    {
+      "epoch": 0.0482,
+      "grad_norm": 0.6623933911323547,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 4820
+    },
+    {
+      "epoch": 0.04821,
+      "grad_norm": 0.6214218735694885,
+      "learning_rate": 0.003,
+      "loss": 4.0961,
+      "step": 4821
+    },
+    {
+      "epoch": 0.04822,
+      "grad_norm": 0.6991559267044067,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 4822
+    },
+    {
+      "epoch": 0.04823,
+      "grad_norm": 0.7806605100631714,
+      "learning_rate": 0.003,
+      "loss": 4.0756,
+      "step": 4823
+    },
+    {
+      "epoch": 0.04824,
+      "grad_norm": 0.7918847799301147,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 4824
+    },
+    {
+      "epoch": 0.04825,
+      "grad_norm": 0.6949120163917542,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 4825
+    },
+    {
+      "epoch": 0.04826,
+      "grad_norm": 0.6233998537063599,
+      "learning_rate": 0.003,
+      "loss": 4.0788,
+      "step": 4826
+    },
+    {
+      "epoch": 0.04827,
+      "grad_norm": 0.6551531553268433,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 4827
+    },
+    {
+      "epoch": 0.04828,
+      "grad_norm": 0.8566862344741821,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 4828
+    },
+    {
+      "epoch": 0.04829,
+      "grad_norm": 1.0621153116226196,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 4829
+    },
+    {
+      "epoch": 0.0483,
+      "grad_norm": 1.08018159866333,
+      "learning_rate": 0.003,
+      "loss": 4.1039,
+      "step": 4830
+    },
+    {
+      "epoch": 0.04831,
+      "grad_norm": 0.8102112412452698,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 4831
+    },
+    {
+      "epoch": 0.04832,
+      "grad_norm": 0.5810506939888,
+      "learning_rate": 0.003,
+      "loss": 4.0794,
+      "step": 4832
+    },
+    {
+      "epoch": 0.04833,
+      "grad_norm": 0.6366890072822571,
+      "learning_rate": 0.003,
+      "loss": 4.085,
+      "step": 4833
+    },
+    {
+      "epoch": 0.04834,
+      "grad_norm": 0.7475446462631226,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 4834
+    },
+    {
+      "epoch": 0.04835,
+      "grad_norm": 0.8804463148117065,
+      "learning_rate": 0.003,
+      "loss": 4.1016,
+      "step": 4835
+    },
+    {
+      "epoch": 0.04836,
+      "grad_norm": 0.9185641407966614,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 4836
+    },
+    {
+      "epoch": 0.04837,
+      "grad_norm": 0.7357785105705261,
+      "learning_rate": 0.003,
+      "loss": 4.0978,
+      "step": 4837
+    },
+    {
+      "epoch": 0.04838,
+      "grad_norm": 0.6829032897949219,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 4838
+    },
+    {
+      "epoch": 0.04839,
+      "grad_norm": 0.7056359052658081,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 4839
+    },
+    {
+      "epoch": 0.0484,
+      "grad_norm": 0.8075634837150574,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 4840
+    },
+    {
+      "epoch": 0.04841,
+      "grad_norm": 0.8837736248970032,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 4841
+    },
+    {
+      "epoch": 0.04842,
+      "grad_norm": 0.8638008832931519,
+      "learning_rate": 0.003,
+      "loss": 4.1091,
+      "step": 4842
+    },
+    {
+      "epoch": 0.04843,
+      "grad_norm": 0.7591082453727722,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 4843
+    },
+    {
+      "epoch": 0.04844,
+      "grad_norm": 0.6942276954650879,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 4844
+    },
+    {
+      "epoch": 0.04845,
+      "grad_norm": 0.7003408670425415,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 4845
+    },
+    {
+      "epoch": 0.04846,
+      "grad_norm": 0.7513200640678406,
+      "learning_rate": 0.003,
+      "loss": 4.0998,
+      "step": 4846
+    },
+    {
+      "epoch": 0.04847,
+      "grad_norm": 0.8167382478713989,
+      "learning_rate": 0.003,
+      "loss": 4.066,
+      "step": 4847
+    },
+    {
+      "epoch": 0.04848,
+      "grad_norm": 0.8485641479492188,
+      "learning_rate": 0.003,
+      "loss": 4.1097,
+      "step": 4848
+    },
+    {
+      "epoch": 0.04849,
+      "grad_norm": 0.7200870513916016,
+      "learning_rate": 0.003,
+      "loss": 4.0997,
+      "step": 4849
+    },
+    {
+      "epoch": 0.0485,
+      "grad_norm": 0.6998422145843506,
+      "learning_rate": 0.003,
+      "loss": 4.0862,
+      "step": 4850
+    },
+    {
+      "epoch": 0.04851,
+      "grad_norm": 0.6962882280349731,
+      "learning_rate": 0.003,
+      "loss": 4.1018,
+      "step": 4851
+    },
+    {
+      "epoch": 0.04852,
+      "grad_norm": 0.6214674115180969,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 4852
+    },
+    {
+      "epoch": 0.04853,
+      "grad_norm": 0.6190800666809082,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 4853
+    },
+    {
+      "epoch": 0.04854,
+      "grad_norm": 0.5581333637237549,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 4854
+    },
+    {
+      "epoch": 0.04855,
+      "grad_norm": 0.4471603035926819,
+      "learning_rate": 0.003,
+      "loss": 4.0905,
+      "step": 4855
+    },
+    {
+      "epoch": 0.04856,
+      "grad_norm": 0.4486013650894165,
+      "learning_rate": 0.003,
+      "loss": 4.0887,
+      "step": 4856
+    },
+    {
+      "epoch": 0.04857,
+      "grad_norm": 0.4567798376083374,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 4857
+    },
+    {
+      "epoch": 0.04858,
+      "grad_norm": 0.4150858223438263,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 4858
+    },
+    {
+      "epoch": 0.04859,
+      "grad_norm": 0.428250789642334,
+      "learning_rate": 0.003,
+      "loss": 4.0982,
+      "step": 4859
+    },
+    {
+      "epoch": 0.0486,
+      "grad_norm": 0.44279322028160095,
+      "learning_rate": 0.003,
+      "loss": 4.0806,
+      "step": 4860
+    },
+    {
+      "epoch": 0.04861,
+      "grad_norm": 0.5173445343971252,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 4861
+    },
+    {
+      "epoch": 0.04862,
+      "grad_norm": 0.6446529030799866,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 4862
+    },
+    {
+      "epoch": 0.04863,
+      "grad_norm": 0.7789918184280396,
+      "learning_rate": 0.003,
+      "loss": 4.0978,
+      "step": 4863
+    },
+    {
+      "epoch": 0.04864,
+      "grad_norm": 0.8804915547370911,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 4864
+    },
+    {
+      "epoch": 0.04865,
+      "grad_norm": 0.8473218679428101,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 4865
+    },
+    {
+      "epoch": 0.04866,
+      "grad_norm": 0.7426813840866089,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 4866
+    },
+    {
+      "epoch": 0.04867,
+      "grad_norm": 0.708594799041748,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 4867
+    },
+    {
+      "epoch": 0.04868,
+      "grad_norm": 0.8590058088302612,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 4868
+    },
+    {
+      "epoch": 0.04869,
+      "grad_norm": 0.9592583179473877,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 4869
+    },
+    {
+      "epoch": 0.0487,
+      "grad_norm": 0.8780485391616821,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 4870
+    },
+    {
+      "epoch": 0.04871,
+      "grad_norm": 0.7498546838760376,
+      "learning_rate": 0.003,
+      "loss": 4.1046,
+      "step": 4871
+    },
+    {
+      "epoch": 0.04872,
+      "grad_norm": 0.7363294363021851,
+      "learning_rate": 0.003,
+      "loss": 4.0704,
+      "step": 4872
+    },
+    {
+      "epoch": 0.04873,
+      "grad_norm": 0.9078441262245178,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 4873
+    },
+    {
+      "epoch": 0.04874,
+      "grad_norm": 0.9841904044151306,
+      "learning_rate": 0.003,
+      "loss": 4.0922,
+      "step": 4874
+    },
+    {
+      "epoch": 0.04875,
+      "grad_norm": 0.8496304154396057,
+      "learning_rate": 0.003,
+      "loss": 4.0882,
+      "step": 4875
+    },
+    {
+      "epoch": 0.04876,
+      "grad_norm": 0.7967992424964905,
+      "learning_rate": 0.003,
+      "loss": 4.0886,
+      "step": 4876
+    },
+    {
+      "epoch": 0.04877,
+      "grad_norm": 0.8723610639572144,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 4877
+    },
+    {
+      "epoch": 0.04878,
+      "grad_norm": 0.8831827044487,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 4878
+    },
+    {
+      "epoch": 0.04879,
+      "grad_norm": 0.8523681163787842,
+      "learning_rate": 0.003,
+      "loss": 4.0691,
+      "step": 4879
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.8125349283218384,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 4880
+    },
+    {
+      "epoch": 0.04881,
+      "grad_norm": 0.7172712087631226,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 4881
+    },
+    {
+      "epoch": 0.04882,
+      "grad_norm": 0.8336440920829773,
+      "learning_rate": 0.003,
+      "loss": 4.0818,
+      "step": 4882
+    },
+    {
+      "epoch": 0.04883,
+      "grad_norm": 1.029686450958252,
+      "learning_rate": 0.003,
+      "loss": 4.0824,
+      "step": 4883
+    },
+    {
+      "epoch": 0.04884,
+      "grad_norm": 1.030254602432251,
+      "learning_rate": 0.003,
+      "loss": 4.0947,
+      "step": 4884
+    },
+    {
+      "epoch": 0.04885,
+      "grad_norm": 0.8340651392936707,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 4885
+    },
+    {
+      "epoch": 0.04886,
+      "grad_norm": 0.7567631006240845,
+      "learning_rate": 0.003,
+      "loss": 4.0934,
+      "step": 4886
+    },
+    {
+      "epoch": 0.04887,
+      "grad_norm": 0.7703821659088135,
+      "learning_rate": 0.003,
+      "loss": 4.1032,
+      "step": 4887
+    },
+    {
+      "epoch": 0.04888,
+      "grad_norm": 0.7452254295349121,
+      "learning_rate": 0.003,
+      "loss": 4.1062,
+      "step": 4888
+    },
+    {
+      "epoch": 0.04889,
+      "grad_norm": 0.819527804851532,
+      "learning_rate": 0.003,
+      "loss": 4.0903,
+      "step": 4889
+    },
+    {
+      "epoch": 0.0489,
+      "grad_norm": 0.7526473999023438,
+      "learning_rate": 0.003,
+      "loss": 4.0955,
+      "step": 4890
+    },
+    {
+      "epoch": 0.04891,
+      "grad_norm": 0.6946446299552917,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 4891
+    },
+    {
+      "epoch": 0.04892,
+      "grad_norm": 0.7302446365356445,
+      "learning_rate": 0.003,
+      "loss": 4.1035,
+      "step": 4892
+    },
+    {
+      "epoch": 0.04893,
+      "grad_norm": 0.8050813674926758,
+      "learning_rate": 0.003,
+      "loss": 4.1008,
+      "step": 4893
+    },
+    {
+      "epoch": 0.04894,
+      "grad_norm": 0.788234293460846,
+      "learning_rate": 0.003,
+      "loss": 4.1124,
+      "step": 4894
+    },
+    {
+      "epoch": 0.04895,
+      "grad_norm": 0.7052075266838074,
+      "learning_rate": 0.003,
+      "loss": 4.0849,
+      "step": 4895
+    },
+    {
+      "epoch": 0.04896,
+      "grad_norm": 0.6905450820922852,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 4896
+    },
+    {
+      "epoch": 0.04897,
+      "grad_norm": 0.7363834381103516,
+      "learning_rate": 0.003,
+      "loss": 4.1056,
+      "step": 4897
+    },
+    {
+      "epoch": 0.04898,
+      "grad_norm": 0.7060516476631165,
+      "learning_rate": 0.003,
+      "loss": 4.0913,
+      "step": 4898
+    },
+    {
+      "epoch": 0.04899,
+      "grad_norm": 0.6093109846115112,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 4899
+    },
+    {
+      "epoch": 0.049,
+      "grad_norm": 0.5717660784721375,
+      "learning_rate": 0.003,
+      "loss": 4.1034,
+      "step": 4900
+    },
+    {
+      "epoch": 0.04901,
+      "grad_norm": 0.5589826703071594,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 4901
+    },
+    {
+      "epoch": 0.04902,
+      "grad_norm": 0.595604419708252,
+      "learning_rate": 0.003,
+      "loss": 4.0963,
+      "step": 4902
+    },
+    {
+      "epoch": 0.04903,
+      "grad_norm": 0.5431516170501709,
+      "learning_rate": 0.003,
+      "loss": 4.0963,
+      "step": 4903
+    },
+    {
+      "epoch": 0.04904,
+      "grad_norm": 0.49542325735092163,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 4904
+    },
+    {
+      "epoch": 0.04905,
+      "grad_norm": 0.5120609998703003,
+      "learning_rate": 0.003,
+      "loss": 4.0611,
+      "step": 4905
+    },
+    {
+      "epoch": 0.04906,
+      "grad_norm": 0.5707340240478516,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 4906
+    },
+    {
+      "epoch": 0.04907,
+      "grad_norm": 0.6552608609199524,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 4907
+    },
+    {
+      "epoch": 0.04908,
+      "grad_norm": 0.6238669157028198,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 4908
+    },
+    {
+      "epoch": 0.04909,
+      "grad_norm": 0.6134541034698486,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 4909
+    },
+    {
+      "epoch": 0.0491,
+      "grad_norm": 0.6873631477355957,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 4910
+    },
+    {
+      "epoch": 0.04911,
+      "grad_norm": 0.7277111411094666,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 4911
+    },
+    {
+      "epoch": 0.04912,
+      "grad_norm": 0.6708826422691345,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 4912
+    },
+    {
+      "epoch": 0.04913,
+      "grad_norm": 0.5802143812179565,
+      "learning_rate": 0.003,
+      "loss": 4.0638,
+      "step": 4913
+    },
+    {
+      "epoch": 0.04914,
+      "grad_norm": 0.5891876220703125,
+      "learning_rate": 0.003,
+      "loss": 4.0692,
+      "step": 4914
+    },
+    {
+      "epoch": 0.04915,
+      "grad_norm": 0.6115869879722595,
+      "learning_rate": 0.003,
+      "loss": 4.0756,
+      "step": 4915
+    },
+    {
+      "epoch": 0.04916,
+      "grad_norm": 0.6224087476730347,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 4916
+    },
+    {
+      "epoch": 0.04917,
+      "grad_norm": 0.5933270454406738,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 4917
+    },
+    {
+      "epoch": 0.04918,
+      "grad_norm": 0.6610130071640015,
+      "learning_rate": 0.003,
+      "loss": 4.0717,
+      "step": 4918
+    },
+    {
+      "epoch": 0.04919,
+      "grad_norm": 0.6908468008041382,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 4919
+    },
+    {
+      "epoch": 0.0492,
+      "grad_norm": 0.699761152267456,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 4920
+    },
+    {
+      "epoch": 0.04921,
+      "grad_norm": 0.6772635579109192,
+      "learning_rate": 0.003,
+      "loss": 4.0911,
+      "step": 4921
+    },
+    {
+      "epoch": 0.04922,
+      "grad_norm": 0.6279475092887878,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 4922
+    },
+    {
+      "epoch": 0.04923,
+      "grad_norm": 0.5486379861831665,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 4923
+    },
+    {
+      "epoch": 0.04924,
+      "grad_norm": 0.5325474739074707,
+      "learning_rate": 0.003,
+      "loss": 4.0947,
+      "step": 4924
+    },
+    {
+      "epoch": 0.04925,
+      "grad_norm": 0.6036094427108765,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 4925
+    },
+    {
+      "epoch": 0.04926,
+      "grad_norm": 0.6497531533241272,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 4926
+    },
+    {
+      "epoch": 0.04927,
+      "grad_norm": 0.5924155116081238,
+      "learning_rate": 0.003,
+      "loss": 4.0849,
+      "step": 4927
+    },
+    {
+      "epoch": 0.04928,
+      "grad_norm": 0.6325770616531372,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 4928
+    },
+    {
+      "epoch": 0.04929,
+      "grad_norm": 0.6553009152412415,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 4929
+    },
+    {
+      "epoch": 0.0493,
+      "grad_norm": 0.7570836544036865,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 4930
+    },
+    {
+      "epoch": 0.04931,
+      "grad_norm": 0.8154568672180176,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 4931
+    },
+    {
+      "epoch": 0.04932,
+      "grad_norm": 0.7909042239189148,
+      "learning_rate": 0.003,
+      "loss": 4.0965,
+      "step": 4932
+    },
+    {
+      "epoch": 0.04933,
+      "grad_norm": 0.7257623672485352,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 4933
+    },
+    {
+      "epoch": 0.04934,
+      "grad_norm": 0.7812435030937195,
+      "learning_rate": 0.003,
+      "loss": 4.1045,
+      "step": 4934
+    },
+    {
+      "epoch": 0.04935,
+      "grad_norm": 0.8619406819343567,
+      "learning_rate": 0.003,
+      "loss": 4.1059,
+      "step": 4935
+    },
+    {
+      "epoch": 0.04936,
+      "grad_norm": 0.9272763133049011,
+      "learning_rate": 0.003,
+      "loss": 4.0859,
+      "step": 4936
+    },
+    {
+      "epoch": 0.04937,
+      "grad_norm": 0.9949960112571716,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 4937
+    },
+    {
+      "epoch": 0.04938,
+      "grad_norm": 0.9583383202552795,
+      "learning_rate": 0.003,
+      "loss": 4.0957,
+      "step": 4938
+    },
+    {
+      "epoch": 0.04939,
+      "grad_norm": 0.9466757774353027,
+      "learning_rate": 0.003,
+      "loss": 4.1071,
+      "step": 4939
+    },
+    {
+      "epoch": 0.0494,
+      "grad_norm": 0.9211159944534302,
+      "learning_rate": 0.003,
+      "loss": 4.1124,
+      "step": 4940
+    },
+    {
+      "epoch": 0.04941,
+      "grad_norm": 0.7683899402618408,
+      "learning_rate": 0.003,
+      "loss": 4.0757,
+      "step": 4941
+    },
+    {
+      "epoch": 0.04942,
+      "grad_norm": 0.7320684194564819,
+      "learning_rate": 0.003,
+      "loss": 4.0876,
+      "step": 4942
+    },
+    {
+      "epoch": 0.04943,
+      "grad_norm": 0.7409505844116211,
+      "learning_rate": 0.003,
+      "loss": 4.109,
+      "step": 4943
+    },
+    {
+      "epoch": 0.04944,
+      "grad_norm": 0.6583126187324524,
+      "learning_rate": 0.003,
+      "loss": 4.1111,
+      "step": 4944
+    },
+    {
+      "epoch": 0.04945,
+      "grad_norm": 0.7039183974266052,
+      "learning_rate": 0.003,
+      "loss": 4.1004,
+      "step": 4945
+    },
+    {
+      "epoch": 0.04946,
+      "grad_norm": 0.7162462472915649,
+      "learning_rate": 0.003,
+      "loss": 4.1273,
+      "step": 4946
+    },
+    {
+      "epoch": 0.04947,
+      "grad_norm": 0.6111107468605042,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 4947
+    },
+    {
+      "epoch": 0.04948,
+      "grad_norm": 0.6708353161811829,
+      "learning_rate": 0.003,
+      "loss": 4.083,
+      "step": 4948
+    },
+    {
+      "epoch": 0.04949,
+      "grad_norm": 0.9382305145263672,
+      "learning_rate": 0.003,
+      "loss": 4.0938,
+      "step": 4949
+    },
+    {
+      "epoch": 0.0495,
+      "grad_norm": 1.2174804210662842,
+      "learning_rate": 0.003,
+      "loss": 4.0692,
+      "step": 4950
+    },
+    {
+      "epoch": 0.04951,
+      "grad_norm": 0.701120913028717,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 4951
+    },
+    {
+      "epoch": 0.04952,
+      "grad_norm": 0.6446220278739929,
+      "learning_rate": 0.003,
+      "loss": 4.1001,
+      "step": 4952
+    },
+    {
+      "epoch": 0.04953,
+      "grad_norm": 0.7424911856651306,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 4953
+    },
+    {
+      "epoch": 0.04954,
+      "grad_norm": 0.6806883215904236,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 4954
+    },
+    {
+      "epoch": 0.04955,
+      "grad_norm": 0.699410617351532,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 4955
+    },
+    {
+      "epoch": 0.04956,
+      "grad_norm": 0.6064649224281311,
+      "learning_rate": 0.003,
+      "loss": 4.0909,
+      "step": 4956
+    },
+    {
+      "epoch": 0.04957,
+      "grad_norm": 0.5583900809288025,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 4957
+    },
+    {
+      "epoch": 0.04958,
+      "grad_norm": 0.5232359170913696,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 4958
+    },
+    {
+      "epoch": 0.04959,
+      "grad_norm": 0.4963710308074951,
+      "learning_rate": 0.003,
+      "loss": 4.0787,
+      "step": 4959
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.5652657151222229,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 4960
+    },
+    {
+      "epoch": 0.04961,
+      "grad_norm": 0.6090538501739502,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 4961
+    },
+    {
+      "epoch": 0.04962,
+      "grad_norm": 0.6194426417350769,
+      "learning_rate": 0.003,
+      "loss": 4.0711,
+      "step": 4962
+    },
+    {
+      "epoch": 0.04963,
+      "grad_norm": 0.6753765344619751,
+      "learning_rate": 0.003,
+      "loss": 4.1102,
+      "step": 4963
+    },
+    {
+      "epoch": 0.04964,
+      "grad_norm": 0.733012855052948,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 4964
+    },
+    {
+      "epoch": 0.04965,
+      "grad_norm": 0.7559370398521423,
+      "learning_rate": 0.003,
+      "loss": 4.0855,
+      "step": 4965
+    },
+    {
+      "epoch": 0.04966,
+      "grad_norm": 0.75950026512146,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 4966
+    },
+    {
+      "epoch": 0.04967,
+      "grad_norm": 0.697681725025177,
+      "learning_rate": 0.003,
+      "loss": 4.0811,
+      "step": 4967
+    },
+    {
+      "epoch": 0.04968,
+      "grad_norm": 0.7198742628097534,
+      "learning_rate": 0.003,
+      "loss": 4.1061,
+      "step": 4968
+    },
+    {
+      "epoch": 0.04969,
+      "grad_norm": 0.6972935795783997,
+      "learning_rate": 0.003,
+      "loss": 4.0793,
+      "step": 4969
+    },
+    {
+      "epoch": 0.0497,
+      "grad_norm": 0.6653467416763306,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 4970
+    },
+    {
+      "epoch": 0.04971,
+      "grad_norm": 0.6616604328155518,
+      "learning_rate": 0.003,
+      "loss": 4.0928,
+      "step": 4971
+    },
+    {
+      "epoch": 0.04972,
+      "grad_norm": 0.6987863183021545,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 4972
+    },
+    {
+      "epoch": 0.04973,
+      "grad_norm": 0.7355426549911499,
+      "learning_rate": 0.003,
+      "loss": 4.1172,
+      "step": 4973
+    },
+    {
+      "epoch": 0.04974,
+      "grad_norm": 0.8188806176185608,
+      "learning_rate": 0.003,
+      "loss": 4.0828,
+      "step": 4974
+    },
+    {
+      "epoch": 0.04975,
+      "grad_norm": 0.8495981693267822,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 4975
+    },
+    {
+      "epoch": 0.04976,
+      "grad_norm": 0.9018936157226562,
+      "learning_rate": 0.003,
+      "loss": 4.0878,
+      "step": 4976
+    },
+    {
+      "epoch": 0.04977,
+      "grad_norm": 0.8570343852043152,
+      "learning_rate": 0.003,
+      "loss": 4.0972,
+      "step": 4977
+    },
+    {
+      "epoch": 0.04978,
+      "grad_norm": 0.8940666317939758,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 4978
+    },
+    {
+      "epoch": 0.04979,
+      "grad_norm": 0.8460613489151001,
+      "learning_rate": 0.003,
+      "loss": 4.1083,
+      "step": 4979
+    },
+    {
+      "epoch": 0.0498,
+      "grad_norm": 0.6897357702255249,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 4980
+    },
+    {
+      "epoch": 0.04981,
+      "grad_norm": 0.6147297024726868,
+      "learning_rate": 0.003,
+      "loss": 4.0647,
+      "step": 4981
+    },
+    {
+      "epoch": 0.04982,
+      "grad_norm": 0.6292622089385986,
+      "learning_rate": 0.003,
+      "loss": 4.0869,
+      "step": 4982
+    },
+    {
+      "epoch": 0.04983,
+      "grad_norm": 0.6632224321365356,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 4983
+    },
+    {
+      "epoch": 0.04984,
+      "grad_norm": 0.6822007894515991,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 4984
+    },
+    {
+      "epoch": 0.04985,
+      "grad_norm": 0.6867620348930359,
+      "learning_rate": 0.003,
+      "loss": 4.0828,
+      "step": 4985
+    },
+    {
+      "epoch": 0.04986,
+      "grad_norm": 0.7010688185691833,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 4986
+    },
+    {
+      "epoch": 0.04987,
+      "grad_norm": 0.7688935399055481,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 4987
+    },
+    {
+      "epoch": 0.04988,
+      "grad_norm": 0.7406259179115295,
+      "learning_rate": 0.003,
+      "loss": 4.0959,
+      "step": 4988
+    },
+    {
+      "epoch": 0.04989,
+      "grad_norm": 0.7552890181541443,
+      "learning_rate": 0.003,
+      "loss": 4.0986,
+      "step": 4989
+    },
+    {
+      "epoch": 0.0499,
+      "grad_norm": 0.9177321195602417,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 4990
+    },
+    {
+      "epoch": 0.04991,
+      "grad_norm": 1.0167580842971802,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 4991
+    },
+    {
+      "epoch": 0.04992,
+      "grad_norm": 1.1560254096984863,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 4992
+    },
+    {
+      "epoch": 0.04993,
+      "grad_norm": 0.8258231282234192,
+      "learning_rate": 0.003,
+      "loss": 4.0849,
+      "step": 4993
+    },
+    {
+      "epoch": 0.04994,
+      "grad_norm": 0.790297269821167,
+      "learning_rate": 0.003,
+      "loss": 4.0737,
+      "step": 4994
+    },
+    {
+      "epoch": 0.04995,
+      "grad_norm": 0.8769829273223877,
+      "learning_rate": 0.003,
+      "loss": 4.0946,
+      "step": 4995
+    },
+    {
+      "epoch": 0.04996,
+      "grad_norm": 0.8291935324668884,
+      "learning_rate": 0.003,
+      "loss": 4.0762,
+      "step": 4996
+    },
+    {
+      "epoch": 0.04997,
+      "grad_norm": 0.7305048704147339,
+      "learning_rate": 0.003,
+      "loss": 4.0907,
+      "step": 4997
+    },
+    {
+      "epoch": 0.04998,
+      "grad_norm": 0.6434715390205383,
+      "learning_rate": 0.003,
+      "loss": 4.1105,
+      "step": 4998
+    },
+    {
+      "epoch": 0.04999,
+      "grad_norm": 0.6438940167427063,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 4999
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.6508415937423706,
+      "learning_rate": 0.003,
+      "loss": 4.1117,
+      "step": 5000
+    },
+    {
+      "epoch": 0.05001,
+      "grad_norm": 0.7253802418708801,
+      "learning_rate": 0.003,
+      "loss": 4.1089,
+      "step": 5001
+    },
+    {
+      "epoch": 0.05002,
+      "grad_norm": 0.7797998785972595,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 5002
+    },
+    {
+      "epoch": 0.05003,
+      "grad_norm": 0.8626973628997803,
+      "learning_rate": 0.003,
+      "loss": 4.0983,
+      "step": 5003
+    },
+    {
+      "epoch": 0.05004,
+      "grad_norm": 0.8299462199211121,
+      "learning_rate": 0.003,
+      "loss": 4.0816,
+      "step": 5004
+    },
+    {
+      "epoch": 0.05005,
+      "grad_norm": 0.8039361834526062,
+      "learning_rate": 0.003,
+      "loss": 4.1064,
+      "step": 5005
+    },
+    {
+      "epoch": 0.05006,
+      "grad_norm": 0.7754454612731934,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 5006
+    },
+    {
+      "epoch": 0.05007,
+      "grad_norm": 0.6967061161994934,
+      "learning_rate": 0.003,
+      "loss": 4.0964,
+      "step": 5007
+    },
+    {
+      "epoch": 0.05008,
+      "grad_norm": 0.5528774261474609,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 5008
+    },
+    {
+      "epoch": 0.05009,
+      "grad_norm": 0.5234524607658386,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 5009
+    },
+    {
+      "epoch": 0.0501,
+      "grad_norm": 0.46996673941612244,
+      "learning_rate": 0.003,
+      "loss": 4.0786,
+      "step": 5010
+    },
+    {
+      "epoch": 0.05011,
+      "grad_norm": 0.5573607683181763,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 5011
+    },
+    {
+      "epoch": 0.05012,
+      "grad_norm": 0.5557873249053955,
+      "learning_rate": 0.003,
+      "loss": 4.0666,
+      "step": 5012
+    },
+    {
+      "epoch": 0.05013,
+      "grad_norm": 0.5415859818458557,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 5013
+    },
+    {
+      "epoch": 0.05014,
+      "grad_norm": 0.5040212869644165,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 5014
+    },
+    {
+      "epoch": 0.05015,
+      "grad_norm": 0.5970138907432556,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 5015
+    },
+    {
+      "epoch": 0.05016,
+      "grad_norm": 0.641599714756012,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 5016
+    },
+    {
+      "epoch": 0.05017,
+      "grad_norm": 0.6173446178436279,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 5017
+    },
+    {
+      "epoch": 0.05018,
+      "grad_norm": 0.6326936483383179,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 5018
+    },
+    {
+      "epoch": 0.05019,
+      "grad_norm": 0.6489758491516113,
+      "learning_rate": 0.003,
+      "loss": 4.0935,
+      "step": 5019
+    },
+    {
+      "epoch": 0.0502,
+      "grad_norm": 0.6537885665893555,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 5020
+    },
+    {
+      "epoch": 0.05021,
+      "grad_norm": 0.6637057662010193,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 5021
+    },
+    {
+      "epoch": 0.05022,
+      "grad_norm": 0.6988769769668579,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 5022
+    },
+    {
+      "epoch": 0.05023,
+      "grad_norm": 0.6984631419181824,
+      "learning_rate": 0.003,
+      "loss": 4.093,
+      "step": 5023
+    },
+    {
+      "epoch": 0.05024,
+      "grad_norm": 0.7143153548240662,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 5024
+    },
+    {
+      "epoch": 0.05025,
+      "grad_norm": 0.6525802612304688,
+      "learning_rate": 0.003,
+      "loss": 4.0907,
+      "step": 5025
+    },
+    {
+      "epoch": 0.05026,
+      "grad_norm": 0.6056846976280212,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 5026
+    },
+    {
+      "epoch": 0.05027,
+      "grad_norm": 0.6280860304832458,
+      "learning_rate": 0.003,
+      "loss": 4.0857,
+      "step": 5027
+    },
+    {
+      "epoch": 0.05028,
+      "grad_norm": 0.6172341704368591,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 5028
+    },
+    {
+      "epoch": 0.05029,
+      "grad_norm": 0.6776188015937805,
+      "learning_rate": 0.003,
+      "loss": 4.0839,
+      "step": 5029
+    },
+    {
+      "epoch": 0.0503,
+      "grad_norm": 0.7172451615333557,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 5030
+    },
+    {
+      "epoch": 0.05031,
+      "grad_norm": 0.7082464694976807,
+      "learning_rate": 0.003,
+      "loss": 4.0772,
+      "step": 5031
+    },
+    {
+      "epoch": 0.05032,
+      "grad_norm": 0.7410402297973633,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 5032
+    },
+    {
+      "epoch": 0.05033,
+      "grad_norm": 0.7269566059112549,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 5033
+    },
+    {
+      "epoch": 0.05034,
+      "grad_norm": 0.8119834065437317,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 5034
+    },
+    {
+      "epoch": 0.05035,
+      "grad_norm": 0.9735667109489441,
+      "learning_rate": 0.003,
+      "loss": 4.0886,
+      "step": 5035
+    },
+    {
+      "epoch": 0.05036,
+      "grad_norm": 0.886156439781189,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 5036
+    },
+    {
+      "epoch": 0.05037,
+      "grad_norm": 0.9368439316749573,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 5037
+    },
+    {
+      "epoch": 0.05038,
+      "grad_norm": 0.9387019872665405,
+      "learning_rate": 0.003,
+      "loss": 4.0815,
+      "step": 5038
+    },
+    {
+      "epoch": 0.05039,
+      "grad_norm": 1.077192783355713,
+      "learning_rate": 0.003,
+      "loss": 4.1007,
+      "step": 5039
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.8130627274513245,
+      "learning_rate": 0.003,
+      "loss": 4.0894,
+      "step": 5040
+    },
+    {
+      "epoch": 0.05041,
+      "grad_norm": 0.8206466436386108,
+      "learning_rate": 0.003,
+      "loss": 4.097,
+      "step": 5041
+    },
+    {
+      "epoch": 0.05042,
+      "grad_norm": 0.9645205140113831,
+      "learning_rate": 0.003,
+      "loss": 4.0969,
+      "step": 5042
+    },
+    {
+      "epoch": 0.05043,
+      "grad_norm": 1.1380254030227661,
+      "learning_rate": 0.003,
+      "loss": 4.1064,
+      "step": 5043
+    },
+    {
+      "epoch": 0.05044,
+      "grad_norm": 1.002665638923645,
+      "learning_rate": 0.003,
+      "loss": 4.1169,
+      "step": 5044
+    },
+    {
+      "epoch": 0.05045,
+      "grad_norm": 0.9848024845123291,
+      "learning_rate": 0.003,
+      "loss": 4.1341,
+      "step": 5045
+    },
+    {
+      "epoch": 0.05046,
+      "grad_norm": 0.8786163330078125,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 5046
+    },
+    {
+      "epoch": 0.05047,
+      "grad_norm": 0.8327427506446838,
+      "learning_rate": 0.003,
+      "loss": 4.1026,
+      "step": 5047
+    },
+    {
+      "epoch": 0.05048,
+      "grad_norm": 0.7884167432785034,
+      "learning_rate": 0.003,
+      "loss": 4.0987,
+      "step": 5048
+    },
+    {
+      "epoch": 0.05049,
+      "grad_norm": 0.8283893465995789,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 5049
+    },
+    {
+      "epoch": 0.0505,
+      "grad_norm": 0.7522302865982056,
+      "learning_rate": 0.003,
+      "loss": 4.1483,
+      "step": 5050
+    },
+    {
+      "epoch": 0.05051,
+      "grad_norm": 0.7367852926254272,
+      "learning_rate": 0.003,
+      "loss": 4.1075,
+      "step": 5051
+    },
+    {
+      "epoch": 0.05052,
+      "grad_norm": 0.6346535682678223,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 5052
+    },
+    {
+      "epoch": 0.05053,
+      "grad_norm": 0.6007921099662781,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 5053
+    },
+    {
+      "epoch": 0.05054,
+      "grad_norm": 0.5848135948181152,
+      "learning_rate": 0.003,
+      "loss": 4.1045,
+      "step": 5054
+    },
+    {
+      "epoch": 0.05055,
+      "grad_norm": 0.6019494533538818,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 5055
+    },
+    {
+      "epoch": 0.05056,
+      "grad_norm": 0.6064556241035461,
+      "learning_rate": 0.003,
+      "loss": 4.0954,
+      "step": 5056
+    },
+    {
+      "epoch": 0.05057,
+      "grad_norm": 0.6536866426467896,
+      "learning_rate": 0.003,
+      "loss": 4.0992,
+      "step": 5057
+    },
+    {
+      "epoch": 0.05058,
+      "grad_norm": 0.6580118536949158,
+      "learning_rate": 0.003,
+      "loss": 4.0981,
+      "step": 5058
+    },
+    {
+      "epoch": 0.05059,
+      "grad_norm": 0.5559451580047607,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 5059
+    },
+    {
+      "epoch": 0.0506,
+      "grad_norm": 0.512411892414093,
+      "learning_rate": 0.003,
+      "loss": 4.0929,
+      "step": 5060
+    },
+    {
+      "epoch": 0.05061,
+      "grad_norm": 0.5732488632202148,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 5061
+    },
+    {
+      "epoch": 0.05062,
+      "grad_norm": 0.6876088380813599,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 5062
+    },
+    {
+      "epoch": 0.05063,
+      "grad_norm": 0.8171137571334839,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 5063
+    },
+    {
+      "epoch": 0.05064,
+      "grad_norm": 0.9967246055603027,
+      "learning_rate": 0.003,
+      "loss": 4.0904,
+      "step": 5064
+    },
+    {
+      "epoch": 0.05065,
+      "grad_norm": 0.9538764953613281,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 5065
+    },
+    {
+      "epoch": 0.05066,
+      "grad_norm": 0.7090097069740295,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 5066
+    },
+    {
+      "epoch": 0.05067,
+      "grad_norm": 0.62126225233078,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 5067
+    },
+    {
+      "epoch": 0.05068,
+      "grad_norm": 0.6797899603843689,
+      "learning_rate": 0.003,
+      "loss": 4.0921,
+      "step": 5068
+    },
+    {
+      "epoch": 0.05069,
+      "grad_norm": 0.7215432524681091,
+      "learning_rate": 0.003,
+      "loss": 4.0839,
+      "step": 5069
+    },
+    {
+      "epoch": 0.0507,
+      "grad_norm": 0.7022911906242371,
+      "learning_rate": 0.003,
+      "loss": 4.0978,
+      "step": 5070
+    },
+    {
+      "epoch": 0.05071,
+      "grad_norm": 0.6767652034759521,
+      "learning_rate": 0.003,
+      "loss": 4.083,
+      "step": 5071
+    },
+    {
+      "epoch": 0.05072,
+      "grad_norm": 0.6579228043556213,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 5072
+    },
+    {
+      "epoch": 0.05073,
+      "grad_norm": 0.6343953609466553,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 5073
+    },
+    {
+      "epoch": 0.05074,
+      "grad_norm": 0.6530939340591431,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 5074
+    },
+    {
+      "epoch": 0.05075,
+      "grad_norm": 0.5952531099319458,
+      "learning_rate": 0.003,
+      "loss": 4.0733,
+      "step": 5075
+    },
+    {
+      "epoch": 0.05076,
+      "grad_norm": 0.5853413939476013,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 5076
+    },
+    {
+      "epoch": 0.05077,
+      "grad_norm": 0.6744263768196106,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 5077
+    },
+    {
+      "epoch": 0.05078,
+      "grad_norm": 0.7323231101036072,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 5078
+    },
+    {
+      "epoch": 0.05079,
+      "grad_norm": 0.7203153967857361,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 5079
+    },
+    {
+      "epoch": 0.0508,
+      "grad_norm": 0.7360239624977112,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 5080
+    },
+    {
+      "epoch": 0.05081,
+      "grad_norm": 0.7595191597938538,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 5081
+    },
+    {
+      "epoch": 0.05082,
+      "grad_norm": 0.7678768634796143,
+      "learning_rate": 0.003,
+      "loss": 4.0914,
+      "step": 5082
+    },
+    {
+      "epoch": 0.05083,
+      "grad_norm": 0.7995537519454956,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 5083
+    },
+    {
+      "epoch": 0.05084,
+      "grad_norm": 0.8082243800163269,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 5084
+    },
+    {
+      "epoch": 0.05085,
+      "grad_norm": 0.8145765662193298,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 5085
+    },
+    {
+      "epoch": 0.05086,
+      "grad_norm": 0.8743909001350403,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 5086
+    },
+    {
+      "epoch": 0.05087,
+      "grad_norm": 0.986849844455719,
+      "learning_rate": 0.003,
+      "loss": 4.085,
+      "step": 5087
+    },
+    {
+      "epoch": 0.05088,
+      "grad_norm": 0.9816634058952332,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 5088
+    },
+    {
+      "epoch": 0.05089,
+      "grad_norm": 0.962716817855835,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 5089
+    },
+    {
+      "epoch": 0.0509,
+      "grad_norm": 0.8741410970687866,
+      "learning_rate": 0.003,
+      "loss": 4.1174,
+      "step": 5090
+    },
+    {
+      "epoch": 0.05091,
+      "grad_norm": 0.7220102548599243,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 5091
+    },
+    {
+      "epoch": 0.05092,
+      "grad_norm": 0.718839704990387,
+      "learning_rate": 0.003,
+      "loss": 4.0764,
+      "step": 5092
+    },
+    {
+      "epoch": 0.05093,
+      "grad_norm": 0.8266152143478394,
+      "learning_rate": 0.003,
+      "loss": 4.0955,
+      "step": 5093
+    },
+    {
+      "epoch": 0.05094,
+      "grad_norm": 0.8446786999702454,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 5094
+    },
+    {
+      "epoch": 0.05095,
+      "grad_norm": 0.7700090408325195,
+      "learning_rate": 0.003,
+      "loss": 4.0916,
+      "step": 5095
+    },
+    {
+      "epoch": 0.05096,
+      "grad_norm": 0.715460479259491,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 5096
+    },
+    {
+      "epoch": 0.05097,
+      "grad_norm": 0.671999454498291,
+      "learning_rate": 0.003,
+      "loss": 4.0712,
+      "step": 5097
+    },
+    {
+      "epoch": 0.05098,
+      "grad_norm": 0.6299563050270081,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 5098
+    },
+    {
+      "epoch": 0.05099,
+      "grad_norm": 0.5293504595756531,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 5099
+    },
+    {
+      "epoch": 0.051,
+      "grad_norm": 0.5206742882728577,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 5100
+    },
+    {
+      "epoch": 0.05101,
+      "grad_norm": 0.5910031199455261,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 5101
+    },
+    {
+      "epoch": 0.05102,
+      "grad_norm": 0.655666172504425,
+      "learning_rate": 0.003,
+      "loss": 4.0711,
+      "step": 5102
+    },
+    {
+      "epoch": 0.05103,
+      "grad_norm": 0.7769567966461182,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 5103
+    },
+    {
+      "epoch": 0.05104,
+      "grad_norm": 0.8552154898643494,
+      "learning_rate": 0.003,
+      "loss": 4.0684,
+      "step": 5104
+    },
+    {
+      "epoch": 0.05105,
+      "grad_norm": 0.8374227285385132,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 5105
+    },
+    {
+      "epoch": 0.05106,
+      "grad_norm": 0.6673815250396729,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 5106
+    },
+    {
+      "epoch": 0.05107,
+      "grad_norm": 0.5111753940582275,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 5107
+    },
+    {
+      "epoch": 0.05108,
+      "grad_norm": 0.5464461445808411,
+      "learning_rate": 0.003,
+      "loss": 4.1025,
+      "step": 5108
+    },
+    {
+      "epoch": 0.05109,
+      "grad_norm": 0.5811792016029358,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 5109
+    },
+    {
+      "epoch": 0.0511,
+      "grad_norm": 0.5256453156471252,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 5110
+    },
+    {
+      "epoch": 0.05111,
+      "grad_norm": 0.5240735411643982,
+      "learning_rate": 0.003,
+      "loss": 4.0901,
+      "step": 5111
+    },
+    {
+      "epoch": 0.05112,
+      "grad_norm": 0.5387896299362183,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 5112
+    },
+    {
+      "epoch": 0.05113,
+      "grad_norm": 0.6552562117576599,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 5113
+    },
+    {
+      "epoch": 0.05114,
+      "grad_norm": 0.6873152852058411,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 5114
+    },
+    {
+      "epoch": 0.05115,
+      "grad_norm": 0.6076439619064331,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 5115
+    },
+    {
+      "epoch": 0.05116,
+      "grad_norm": 0.536746621131897,
+      "learning_rate": 0.003,
+      "loss": 4.0804,
+      "step": 5116
+    },
+    {
+      "epoch": 0.05117,
+      "grad_norm": 0.5262901186943054,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 5117
+    },
+    {
+      "epoch": 0.05118,
+      "grad_norm": 0.5599290132522583,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 5118
+    },
+    {
+      "epoch": 0.05119,
+      "grad_norm": 0.5657877922058105,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 5119
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.5846731066703796,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 5120
+    },
+    {
+      "epoch": 0.05121,
+      "grad_norm": 0.6840034127235413,
+      "learning_rate": 0.003,
+      "loss": 4.0562,
+      "step": 5121
+    },
+    {
+      "epoch": 0.05122,
+      "grad_norm": 0.9214706420898438,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 5122
+    },
+    {
+      "epoch": 0.05123,
+      "grad_norm": 0.996700644493103,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 5123
+    },
+    {
+      "epoch": 0.05124,
+      "grad_norm": 0.9406014680862427,
+      "learning_rate": 0.003,
+      "loss": 4.1103,
+      "step": 5124
+    },
+    {
+      "epoch": 0.05125,
+      "grad_norm": 0.7598673105239868,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 5125
+    },
+    {
+      "epoch": 0.05126,
+      "grad_norm": 0.6382818818092346,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 5126
+    },
+    {
+      "epoch": 0.05127,
+      "grad_norm": 0.7045958042144775,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 5127
+    },
+    {
+      "epoch": 0.05128,
+      "grad_norm": 0.7263686656951904,
+      "learning_rate": 0.003,
+      "loss": 4.0884,
+      "step": 5128
+    },
+    {
+      "epoch": 0.05129,
+      "grad_norm": 0.7894842624664307,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 5129
+    },
+    {
+      "epoch": 0.0513,
+      "grad_norm": 0.7886913418769836,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 5130
+    },
+    {
+      "epoch": 0.05131,
+      "grad_norm": 0.791679322719574,
+      "learning_rate": 0.003,
+      "loss": 4.0837,
+      "step": 5131
+    },
+    {
+      "epoch": 0.05132,
+      "grad_norm": 0.7831953763961792,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 5132
+    },
+    {
+      "epoch": 0.05133,
+      "grad_norm": 0.9040250778198242,
+      "learning_rate": 0.003,
+      "loss": 4.0878,
+      "step": 5133
+    },
+    {
+      "epoch": 0.05134,
+      "grad_norm": 0.9008460640907288,
+      "learning_rate": 0.003,
+      "loss": 4.1018,
+      "step": 5134
+    },
+    {
+      "epoch": 0.05135,
+      "grad_norm": 0.8832536339759827,
+      "learning_rate": 0.003,
+      "loss": 4.0661,
+      "step": 5135
+    },
+    {
+      "epoch": 0.05136,
+      "grad_norm": 0.886307418346405,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 5136
+    },
+    {
+      "epoch": 0.05137,
+      "grad_norm": 0.9245304465293884,
+      "learning_rate": 0.003,
+      "loss": 4.0988,
+      "step": 5137
+    },
+    {
+      "epoch": 0.05138,
+      "grad_norm": 1.0758816003799438,
+      "learning_rate": 0.003,
+      "loss": 4.0972,
+      "step": 5138
+    },
+    {
+      "epoch": 0.05139,
+      "grad_norm": 0.9390636086463928,
+      "learning_rate": 0.003,
+      "loss": 4.0889,
+      "step": 5139
+    },
+    {
+      "epoch": 0.0514,
+      "grad_norm": 0.8630328178405762,
+      "learning_rate": 0.003,
+      "loss": 4.0824,
+      "step": 5140
+    },
+    {
+      "epoch": 0.05141,
+      "grad_norm": 0.6845744252204895,
+      "learning_rate": 0.003,
+      "loss": 4.1305,
+      "step": 5141
+    },
+    {
+      "epoch": 0.05142,
+      "grad_norm": 0.7005607485771179,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 5142
+    },
+    {
+      "epoch": 0.05143,
+      "grad_norm": 0.6236844062805176,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 5143
+    },
+    {
+      "epoch": 0.05144,
+      "grad_norm": 0.5919453501701355,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 5144
+    },
+    {
+      "epoch": 0.05145,
+      "grad_norm": 0.6221011281013489,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 5145
+    },
+    {
+      "epoch": 0.05146,
+      "grad_norm": 0.6235561370849609,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 5146
+    },
+    {
+      "epoch": 0.05147,
+      "grad_norm": 0.6597413420677185,
+      "learning_rate": 0.003,
+      "loss": 4.0791,
+      "step": 5147
+    },
+    {
+      "epoch": 0.05148,
+      "grad_norm": 0.7255842089653015,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 5148
+    },
+    {
+      "epoch": 0.05149,
+      "grad_norm": 0.7855421900749207,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 5149
+    },
+    {
+      "epoch": 0.0515,
+      "grad_norm": 0.6787280440330505,
+      "learning_rate": 0.003,
+      "loss": 4.0901,
+      "step": 5150
+    },
+    {
+      "epoch": 0.05151,
+      "grad_norm": 0.5714637041091919,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 5151
+    },
+    {
+      "epoch": 0.05152,
+      "grad_norm": 0.6426389813423157,
+      "learning_rate": 0.003,
+      "loss": 4.0929,
+      "step": 5152
+    },
+    {
+      "epoch": 0.05153,
+      "grad_norm": 0.6792200207710266,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 5153
+    },
+    {
+      "epoch": 0.05154,
+      "grad_norm": 0.6108073592185974,
+      "learning_rate": 0.003,
+      "loss": 4.0787,
+      "step": 5154
+    },
+    {
+      "epoch": 0.05155,
+      "grad_norm": 0.6600965261459351,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 5155
+    },
+    {
+      "epoch": 0.05156,
+      "grad_norm": 0.6948540210723877,
+      "learning_rate": 0.003,
+      "loss": 4.104,
+      "step": 5156
+    },
+    {
+      "epoch": 0.05157,
+      "grad_norm": 0.7809789776802063,
+      "learning_rate": 0.003,
+      "loss": 4.0788,
+      "step": 5157
+    },
+    {
+      "epoch": 0.05158,
+      "grad_norm": 0.80213862657547,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 5158
+    },
+    {
+      "epoch": 0.05159,
+      "grad_norm": 0.7642487287521362,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 5159
+    },
+    {
+      "epoch": 0.0516,
+      "grad_norm": 0.7955514192581177,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 5160
+    },
+    {
+      "epoch": 0.05161,
+      "grad_norm": 0.5677341222763062,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 5161
+    },
+    {
+      "epoch": 0.05162,
+      "grad_norm": 0.5948107242584229,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 5162
+    },
+    {
+      "epoch": 0.05163,
+      "grad_norm": 0.7498951554298401,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 5163
+    },
+    {
+      "epoch": 0.05164,
+      "grad_norm": 0.98694908618927,
+      "learning_rate": 0.003,
+      "loss": 4.0668,
+      "step": 5164
+    },
+    {
+      "epoch": 0.05165,
+      "grad_norm": 1.0628858804702759,
+      "learning_rate": 0.003,
+      "loss": 4.093,
+      "step": 5165
+    },
+    {
+      "epoch": 0.05166,
+      "grad_norm": 0.817200243473053,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 5166
+    },
+    {
+      "epoch": 0.05167,
+      "grad_norm": 0.6983803510665894,
+      "learning_rate": 0.003,
+      "loss": 4.0886,
+      "step": 5167
+    },
+    {
+      "epoch": 0.05168,
+      "grad_norm": 0.6873238682746887,
+      "learning_rate": 0.003,
+      "loss": 4.1181,
+      "step": 5168
+    },
+    {
+      "epoch": 0.05169,
+      "grad_norm": 0.6252221465110779,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 5169
+    },
+    {
+      "epoch": 0.0517,
+      "grad_norm": 0.6794670224189758,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 5170
+    },
+    {
+      "epoch": 0.05171,
+      "grad_norm": 0.6629231572151184,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 5171
+    },
+    {
+      "epoch": 0.05172,
+      "grad_norm": 0.7229346036911011,
+      "learning_rate": 0.003,
+      "loss": 4.0826,
+      "step": 5172
+    },
+    {
+      "epoch": 0.05173,
+      "grad_norm": 0.8697673678398132,
+      "learning_rate": 0.003,
+      "loss": 4.098,
+      "step": 5173
+    },
+    {
+      "epoch": 0.05174,
+      "grad_norm": 1.057416319847107,
+      "learning_rate": 0.003,
+      "loss": 4.0949,
+      "step": 5174
+    },
+    {
+      "epoch": 0.05175,
+      "grad_norm": 0.8897588849067688,
+      "learning_rate": 0.003,
+      "loss": 4.1016,
+      "step": 5175
+    },
+    {
+      "epoch": 0.05176,
+      "grad_norm": 0.6620845794677734,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 5176
+    },
+    {
+      "epoch": 0.05177,
+      "grad_norm": 0.659085750579834,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 5177
+    },
+    {
+      "epoch": 0.05178,
+      "grad_norm": 0.7169133424758911,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 5178
+    },
+    {
+      "epoch": 0.05179,
+      "grad_norm": 0.6649430394172668,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 5179
+    },
+    {
+      "epoch": 0.0518,
+      "grad_norm": 0.5839197635650635,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 5180
+    },
+    {
+      "epoch": 0.05181,
+      "grad_norm": 0.44185495376586914,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 5181
+    },
+    {
+      "epoch": 0.05182,
+      "grad_norm": 0.4659267067909241,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 5182
+    },
+    {
+      "epoch": 0.05183,
+      "grad_norm": 0.5202658772468567,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 5183
+    },
+    {
+      "epoch": 0.05184,
+      "grad_norm": 0.5452793836593628,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 5184
+    },
+    {
+      "epoch": 0.05185,
+      "grad_norm": 0.6658215522766113,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 5185
+    },
+    {
+      "epoch": 0.05186,
+      "grad_norm": 0.7893977165222168,
+      "learning_rate": 0.003,
+      "loss": 4.0726,
+      "step": 5186
+    },
+    {
+      "epoch": 0.05187,
+      "grad_norm": 0.8065964579582214,
+      "learning_rate": 0.003,
+      "loss": 4.1136,
+      "step": 5187
+    },
+    {
+      "epoch": 0.05188,
+      "grad_norm": 0.7456889748573303,
+      "learning_rate": 0.003,
+      "loss": 4.0865,
+      "step": 5188
+    },
+    {
+      "epoch": 0.05189,
+      "grad_norm": 0.7495638132095337,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 5189
+    },
+    {
+      "epoch": 0.0519,
+      "grad_norm": 0.8337745666503906,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 5190
+    },
+    {
+      "epoch": 0.05191,
+      "grad_norm": 0.8566077351570129,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 5191
+    },
+    {
+      "epoch": 0.05192,
+      "grad_norm": 0.805960476398468,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 5192
+    },
+    {
+      "epoch": 0.05193,
+      "grad_norm": 0.789035439491272,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 5193
+    },
+    {
+      "epoch": 0.05194,
+      "grad_norm": 0.7752846479415894,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 5194
+    },
+    {
+      "epoch": 0.05195,
+      "grad_norm": 0.711561381816864,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 5195
+    },
+    {
+      "epoch": 0.05196,
+      "grad_norm": 0.6495727896690369,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 5196
+    },
+    {
+      "epoch": 0.05197,
+      "grad_norm": 0.6552587747573853,
+      "learning_rate": 0.003,
+      "loss": 4.0849,
+      "step": 5197
+    },
+    {
+      "epoch": 0.05198,
+      "grad_norm": 0.5346404314041138,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 5198
+    },
+    {
+      "epoch": 0.05199,
+      "grad_norm": 0.5881133079528809,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 5199
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.5984171032905579,
+      "learning_rate": 0.003,
+      "loss": 4.0726,
+      "step": 5200
+    },
+    {
+      "epoch": 0.05201,
+      "grad_norm": 0.5811743140220642,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 5201
+    },
+    {
+      "epoch": 0.05202,
+      "grad_norm": 0.5741499662399292,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 5202
+    },
+    {
+      "epoch": 0.05203,
+      "grad_norm": 0.5631682276725769,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 5203
+    },
+    {
+      "epoch": 0.05204,
+      "grad_norm": 0.6046068668365479,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 5204
+    },
+    {
+      "epoch": 0.05205,
+      "grad_norm": 0.6720539331436157,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 5205
+    },
+    {
+      "epoch": 0.05206,
+      "grad_norm": 0.8187111020088196,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 5206
+    },
+    {
+      "epoch": 0.05207,
+      "grad_norm": 0.9796744585037231,
+      "learning_rate": 0.003,
+      "loss": 4.0629,
+      "step": 5207
+    },
+    {
+      "epoch": 0.05208,
+      "grad_norm": 1.0159255266189575,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 5208
+    },
+    {
+      "epoch": 0.05209,
+      "grad_norm": 0.9315694570541382,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 5209
+    },
+    {
+      "epoch": 0.0521,
+      "grad_norm": 0.932378351688385,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 5210
+    },
+    {
+      "epoch": 0.05211,
+      "grad_norm": 0.8560777902603149,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 5211
+    },
+    {
+      "epoch": 0.05212,
+      "grad_norm": 0.7157209515571594,
+      "learning_rate": 0.003,
+      "loss": 4.1207,
+      "step": 5212
+    },
+    {
+      "epoch": 0.05213,
+      "grad_norm": 0.7000187635421753,
+      "learning_rate": 0.003,
+      "loss": 4.0757,
+      "step": 5213
+    },
+    {
+      "epoch": 0.05214,
+      "grad_norm": 0.7313051223754883,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 5214
+    },
+    {
+      "epoch": 0.05215,
+      "grad_norm": 0.7454226016998291,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 5215
+    },
+    {
+      "epoch": 0.05216,
+      "grad_norm": 0.7446316480636597,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 5216
+    },
+    {
+      "epoch": 0.05217,
+      "grad_norm": 0.892437219619751,
+      "learning_rate": 0.003,
+      "loss": 4.0815,
+      "step": 5217
+    },
+    {
+      "epoch": 0.05218,
+      "grad_norm": 1.0080622434616089,
+      "learning_rate": 0.003,
+      "loss": 4.0757,
+      "step": 5218
+    },
+    {
+      "epoch": 0.05219,
+      "grad_norm": 0.969513475894928,
+      "learning_rate": 0.003,
+      "loss": 4.1046,
+      "step": 5219
+    },
+    {
+      "epoch": 0.0522,
+      "grad_norm": 0.8957400918006897,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 5220
+    },
+    {
+      "epoch": 0.05221,
+      "grad_norm": 0.7468663454055786,
+      "learning_rate": 0.003,
+      "loss": 4.077,
+      "step": 5221
+    },
+    {
+      "epoch": 0.05222,
+      "grad_norm": 0.6890023946762085,
+      "learning_rate": 0.003,
+      "loss": 4.0833,
+      "step": 5222
+    },
+    {
+      "epoch": 0.05223,
+      "grad_norm": 0.7769709825515747,
+      "learning_rate": 0.003,
+      "loss": 4.1028,
+      "step": 5223
+    },
+    {
+      "epoch": 0.05224,
+      "grad_norm": 0.8365834355354309,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 5224
+    },
+    {
+      "epoch": 0.05225,
+      "grad_norm": 0.8886056542396545,
+      "learning_rate": 0.003,
+      "loss": 4.106,
+      "step": 5225
+    },
+    {
+      "epoch": 0.05226,
+      "grad_norm": 1.014122486114502,
+      "learning_rate": 0.003,
+      "loss": 4.1027,
+      "step": 5226
+    },
+    {
+      "epoch": 0.05227,
+      "grad_norm": 0.9814178943634033,
+      "learning_rate": 0.003,
+      "loss": 4.1245,
+      "step": 5227
+    },
+    {
+      "epoch": 0.05228,
+      "grad_norm": 0.8787015080451965,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 5228
+    },
+    {
+      "epoch": 0.05229,
+      "grad_norm": 0.7741072177886963,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 5229
+    },
+    {
+      "epoch": 0.0523,
+      "grad_norm": 0.6090134978294373,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 5230
+    },
+    {
+      "epoch": 0.05231,
+      "grad_norm": 0.588998556137085,
+      "learning_rate": 0.003,
+      "loss": 4.0861,
+      "step": 5231
+    },
+    {
+      "epoch": 0.05232,
+      "grad_norm": 0.5598298907279968,
+      "learning_rate": 0.003,
+      "loss": 4.0726,
+      "step": 5232
+    },
+    {
+      "epoch": 0.05233,
+      "grad_norm": 0.5243165493011475,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 5233
+    },
+    {
+      "epoch": 0.05234,
+      "grad_norm": 0.41240203380584717,
+      "learning_rate": 0.003,
+      "loss": 4.099,
+      "step": 5234
+    },
+    {
+      "epoch": 0.05235,
+      "grad_norm": 0.4857303500175476,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 5235
+    },
+    {
+      "epoch": 0.05236,
+      "grad_norm": 0.5221879482269287,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 5236
+    },
+    {
+      "epoch": 0.05237,
+      "grad_norm": 0.5270381569862366,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 5237
+    },
+    {
+      "epoch": 0.05238,
+      "grad_norm": 0.6028867959976196,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 5238
+    },
+    {
+      "epoch": 0.05239,
+      "grad_norm": 0.6225501298904419,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 5239
+    },
+    {
+      "epoch": 0.0524,
+      "grad_norm": 0.6203356385231018,
+      "learning_rate": 0.003,
+      "loss": 4.0851,
+      "step": 5240
+    },
+    {
+      "epoch": 0.05241,
+      "grad_norm": 0.6311340928077698,
+      "learning_rate": 0.003,
+      "loss": 4.0892,
+      "step": 5241
+    },
+    {
+      "epoch": 0.05242,
+      "grad_norm": 0.6222201585769653,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 5242
+    },
+    {
+      "epoch": 0.05243,
+      "grad_norm": 0.6350955963134766,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 5243
+    },
+    {
+      "epoch": 0.05244,
+      "grad_norm": 0.6730124354362488,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 5244
+    },
+    {
+      "epoch": 0.05245,
+      "grad_norm": 0.8762339353561401,
+      "learning_rate": 0.003,
+      "loss": 4.066,
+      "step": 5245
+    },
+    {
+      "epoch": 0.05246,
+      "grad_norm": 1.018054485321045,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 5246
+    },
+    {
+      "epoch": 0.05247,
+      "grad_norm": 1.032390832901001,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 5247
+    },
+    {
+      "epoch": 0.05248,
+      "grad_norm": 0.6803662180900574,
+      "learning_rate": 0.003,
+      "loss": 4.0788,
+      "step": 5248
+    },
+    {
+      "epoch": 0.05249,
+      "grad_norm": 0.7293293476104736,
+      "learning_rate": 0.003,
+      "loss": 4.0914,
+      "step": 5249
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 1.0431487560272217,
+      "learning_rate": 0.003,
+      "loss": 4.0917,
+      "step": 5250
+    },
+    {
+      "epoch": 0.05251,
+      "grad_norm": 0.9717729687690735,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 5251
+    },
+    {
+      "epoch": 0.05252,
+      "grad_norm": 0.8074532747268677,
+      "learning_rate": 0.003,
+      "loss": 4.1051,
+      "step": 5252
+    },
+    {
+      "epoch": 0.05253,
+      "grad_norm": 0.8047144412994385,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 5253
+    },
+    {
+      "epoch": 0.05254,
+      "grad_norm": 0.7531102895736694,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 5254
+    },
+    {
+      "epoch": 0.05255,
+      "grad_norm": 0.6377032995223999,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 5255
+    },
+    {
+      "epoch": 0.05256,
+      "grad_norm": 0.5738644003868103,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 5256
+    },
+    {
+      "epoch": 0.05257,
+      "grad_norm": 0.5616352558135986,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 5257
+    },
+    {
+      "epoch": 0.05258,
+      "grad_norm": 0.5875840783119202,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 5258
+    },
+    {
+      "epoch": 0.05259,
+      "grad_norm": 0.5328964591026306,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 5259
+    },
+    {
+      "epoch": 0.0526,
+      "grad_norm": 0.5470474362373352,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 5260
+    },
+    {
+      "epoch": 0.05261,
+      "grad_norm": 0.5530418753623962,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 5261
+    },
+    {
+      "epoch": 0.05262,
+      "grad_norm": 0.543959379196167,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 5262
+    },
+    {
+      "epoch": 0.05263,
+      "grad_norm": 0.5292184352874756,
+      "learning_rate": 0.003,
+      "loss": 4.0905,
+      "step": 5263
+    },
+    {
+      "epoch": 0.05264,
+      "grad_norm": 0.5867078900337219,
+      "learning_rate": 0.003,
+      "loss": 4.0895,
+      "step": 5264
+    },
+    {
+      "epoch": 0.05265,
+      "grad_norm": 0.5979050397872925,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 5265
+    },
+    {
+      "epoch": 0.05266,
+      "grad_norm": 0.5827270746231079,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 5266
+    },
+    {
+      "epoch": 0.05267,
+      "grad_norm": 0.7350538969039917,
+      "learning_rate": 0.003,
+      "loss": 4.0909,
+      "step": 5267
+    },
+    {
+      "epoch": 0.05268,
+      "grad_norm": 0.8645079731941223,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 5268
+    },
+    {
+      "epoch": 0.05269,
+      "grad_norm": 0.8950510025024414,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 5269
+    },
+    {
+      "epoch": 0.0527,
+      "grad_norm": 0.8910822868347168,
+      "learning_rate": 0.003,
+      "loss": 4.0757,
+      "step": 5270
+    },
+    {
+      "epoch": 0.05271,
+      "grad_norm": 0.7200420498847961,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 5271
+    },
+    {
+      "epoch": 0.05272,
+      "grad_norm": 0.6258223652839661,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 5272
+    },
+    {
+      "epoch": 0.05273,
+      "grad_norm": 0.7092558741569519,
+      "learning_rate": 0.003,
+      "loss": 4.0771,
+      "step": 5273
+    },
+    {
+      "epoch": 0.05274,
+      "grad_norm": 0.6717981696128845,
+      "learning_rate": 0.003,
+      "loss": 4.0824,
+      "step": 5274
+    },
+    {
+      "epoch": 0.05275,
+      "grad_norm": 0.7079071402549744,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 5275
+    },
+    {
+      "epoch": 0.05276,
+      "grad_norm": 0.6641931533813477,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 5276
+    },
+    {
+      "epoch": 0.05277,
+      "grad_norm": 0.6190426349639893,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 5277
+    },
+    {
+      "epoch": 0.05278,
+      "grad_norm": 0.5594713687896729,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 5278
+    },
+    {
+      "epoch": 0.05279,
+      "grad_norm": 0.6031233668327332,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 5279
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.678126871585846,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 5280
+    },
+    {
+      "epoch": 0.05281,
+      "grad_norm": 0.6510528326034546,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 5281
+    },
+    {
+      "epoch": 0.05282,
+      "grad_norm": 0.6001152992248535,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 5282
+    },
+    {
+      "epoch": 0.05283,
+      "grad_norm": 0.5790733098983765,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 5283
+    },
+    {
+      "epoch": 0.05284,
+      "grad_norm": 0.6320419907569885,
+      "learning_rate": 0.003,
+      "loss": 4.0751,
+      "step": 5284
+    },
+    {
+      "epoch": 0.05285,
+      "grad_norm": 0.7830936312675476,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 5285
+    },
+    {
+      "epoch": 0.05286,
+      "grad_norm": 0.9895585775375366,
+      "learning_rate": 0.003,
+      "loss": 4.1052,
+      "step": 5286
+    },
+    {
+      "epoch": 0.05287,
+      "grad_norm": 1.0884960889816284,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 5287
+    },
+    {
+      "epoch": 0.05288,
+      "grad_norm": 0.9570357203483582,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 5288
+    },
+    {
+      "epoch": 0.05289,
+      "grad_norm": 0.9439939260482788,
+      "learning_rate": 0.003,
+      "loss": 4.0934,
+      "step": 5289
+    },
+    {
+      "epoch": 0.0529,
+      "grad_norm": 0.8485848903656006,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 5290
+    },
+    {
+      "epoch": 0.05291,
+      "grad_norm": 0.9338999390602112,
+      "learning_rate": 0.003,
+      "loss": 4.104,
+      "step": 5291
+    },
+    {
+      "epoch": 0.05292,
+      "grad_norm": 0.8072062730789185,
+      "learning_rate": 0.003,
+      "loss": 4.0966,
+      "step": 5292
+    },
+    {
+      "epoch": 0.05293,
+      "grad_norm": 0.8015543818473816,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 5293
+    },
+    {
+      "epoch": 0.05294,
+      "grad_norm": 0.819395124912262,
+      "learning_rate": 0.003,
+      "loss": 4.081,
+      "step": 5294
+    },
+    {
+      "epoch": 0.05295,
+      "grad_norm": 0.92290198802948,
+      "learning_rate": 0.003,
+      "loss": 4.0921,
+      "step": 5295
+    },
+    {
+      "epoch": 0.05296,
+      "grad_norm": 0.8833643794059753,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 5296
+    },
+    {
+      "epoch": 0.05297,
+      "grad_norm": 0.7732718586921692,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 5297
+    },
+    {
+      "epoch": 0.05298,
+      "grad_norm": 0.7301531434059143,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 5298
+    },
+    {
+      "epoch": 0.05299,
+      "grad_norm": 0.6377935409545898,
+      "learning_rate": 0.003,
+      "loss": 4.0656,
+      "step": 5299
+    },
+    {
+      "epoch": 0.053,
+      "grad_norm": 0.653924286365509,
+      "learning_rate": 0.003,
+      "loss": 4.0821,
+      "step": 5300
+    },
+    {
+      "epoch": 0.05301,
+      "grad_norm": 0.6310106515884399,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 5301
+    },
+    {
+      "epoch": 0.05302,
+      "grad_norm": 0.5855343341827393,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 5302
+    },
+    {
+      "epoch": 0.05303,
+      "grad_norm": 0.6311571598052979,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 5303
+    },
+    {
+      "epoch": 0.05304,
+      "grad_norm": 0.6711182594299316,
+      "learning_rate": 0.003,
+      "loss": 4.0666,
+      "step": 5304
+    },
+    {
+      "epoch": 0.05305,
+      "grad_norm": 0.8134438991546631,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 5305
+    },
+    {
+      "epoch": 0.05306,
+      "grad_norm": 0.8880993723869324,
+      "learning_rate": 0.003,
+      "loss": 4.0883,
+      "step": 5306
+    },
+    {
+      "epoch": 0.05307,
+      "grad_norm": 0.8936927318572998,
+      "learning_rate": 0.003,
+      "loss": 4.0892,
+      "step": 5307
+    },
+    {
+      "epoch": 0.05308,
+      "grad_norm": 0.8056278824806213,
+      "learning_rate": 0.003,
+      "loss": 4.0967,
+      "step": 5308
+    },
+    {
+      "epoch": 0.05309,
+      "grad_norm": 0.6631887555122375,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 5309
+    },
+    {
+      "epoch": 0.0531,
+      "grad_norm": 0.6476406455039978,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 5310
+    },
+    {
+      "epoch": 0.05311,
+      "grad_norm": 0.7253839373588562,
+      "learning_rate": 0.003,
+      "loss": 4.1079,
+      "step": 5311
+    },
+    {
+      "epoch": 0.05312,
+      "grad_norm": 0.7280350923538208,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 5312
+    },
+    {
+      "epoch": 0.05313,
+      "grad_norm": 0.7820828557014465,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 5313
+    },
+    {
+      "epoch": 0.05314,
+      "grad_norm": 0.782772958278656,
+      "learning_rate": 0.003,
+      "loss": 4.0778,
+      "step": 5314
+    },
+    {
+      "epoch": 0.05315,
+      "grad_norm": 0.7675919532775879,
+      "learning_rate": 0.003,
+      "loss": 4.0819,
+      "step": 5315
+    },
+    {
+      "epoch": 0.05316,
+      "grad_norm": 0.7182729244232178,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 5316
+    },
+    {
+      "epoch": 0.05317,
+      "grad_norm": 0.6802192330360413,
+      "learning_rate": 0.003,
+      "loss": 4.0844,
+      "step": 5317
+    },
+    {
+      "epoch": 0.05318,
+      "grad_norm": 0.6332565546035767,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 5318
+    },
+    {
+      "epoch": 0.05319,
+      "grad_norm": 0.5553032159805298,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 5319
+    },
+    {
+      "epoch": 0.0532,
+      "grad_norm": 0.5743739008903503,
+      "learning_rate": 0.003,
+      "loss": 4.0661,
+      "step": 5320
+    },
+    {
+      "epoch": 0.05321,
+      "grad_norm": 0.5735093951225281,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 5321
+    },
+    {
+      "epoch": 0.05322,
+      "grad_norm": 0.5387066006660461,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 5322
+    },
+    {
+      "epoch": 0.05323,
+      "grad_norm": 0.5573674440383911,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 5323
+    },
+    {
+      "epoch": 0.05324,
+      "grad_norm": 0.5417758226394653,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 5324
+    },
+    {
+      "epoch": 0.05325,
+      "grad_norm": 0.5368819832801819,
+      "learning_rate": 0.003,
+      "loss": 4.0675,
+      "step": 5325
+    },
+    {
+      "epoch": 0.05326,
+      "grad_norm": 0.6424285173416138,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 5326
+    },
+    {
+      "epoch": 0.05327,
+      "grad_norm": 0.6725877523422241,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 5327
+    },
+    {
+      "epoch": 0.05328,
+      "grad_norm": 0.8542432188987732,
+      "learning_rate": 0.003,
+      "loss": 4.0838,
+      "step": 5328
+    },
+    {
+      "epoch": 0.05329,
+      "grad_norm": 1.01321542263031,
+      "learning_rate": 0.003,
+      "loss": 4.0771,
+      "step": 5329
+    },
+    {
+      "epoch": 0.0533,
+      "grad_norm": 0.9700354933738708,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 5330
+    },
+    {
+      "epoch": 0.05331,
+      "grad_norm": 0.8577306270599365,
+      "learning_rate": 0.003,
+      "loss": 4.0851,
+      "step": 5331
+    },
+    {
+      "epoch": 0.05332,
+      "grad_norm": 0.9007319808006287,
+      "learning_rate": 0.003,
+      "loss": 4.1053,
+      "step": 5332
+    },
+    {
+      "epoch": 0.05333,
+      "grad_norm": 0.7996346950531006,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 5333
+    },
+    {
+      "epoch": 0.05334,
+      "grad_norm": 0.7633571624755859,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 5334
+    },
+    {
+      "epoch": 0.05335,
+      "grad_norm": 0.7068867683410645,
+      "learning_rate": 0.003,
+      "loss": 4.0898,
+      "step": 5335
+    },
+    {
+      "epoch": 0.05336,
+      "grad_norm": 0.6439290046691895,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 5336
+    },
+    {
+      "epoch": 0.05337,
+      "grad_norm": 0.5971997380256653,
+      "learning_rate": 0.003,
+      "loss": 4.0818,
+      "step": 5337
+    },
+    {
+      "epoch": 0.05338,
+      "grad_norm": 0.6028808355331421,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 5338
+    },
+    {
+      "epoch": 0.05339,
+      "grad_norm": 0.5407232046127319,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 5339
+    },
+    {
+      "epoch": 0.0534,
+      "grad_norm": 0.5392054915428162,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 5340
+    },
+    {
+      "epoch": 0.05341,
+      "grad_norm": 0.5244172215461731,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 5341
+    },
+    {
+      "epoch": 0.05342,
+      "grad_norm": 0.5212708115577698,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 5342
+    },
+    {
+      "epoch": 0.05343,
+      "grad_norm": 0.531294047832489,
+      "learning_rate": 0.003,
+      "loss": 4.0765,
+      "step": 5343
+    },
+    {
+      "epoch": 0.05344,
+      "grad_norm": 0.4932909309864044,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 5344
+    },
+    {
+      "epoch": 0.05345,
+      "grad_norm": 0.5027388334274292,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 5345
+    },
+    {
+      "epoch": 0.05346,
+      "grad_norm": 0.6342618465423584,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 5346
+    },
+    {
+      "epoch": 0.05347,
+      "grad_norm": 0.8551032543182373,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 5347
+    },
+    {
+      "epoch": 0.05348,
+      "grad_norm": 1.2991005182266235,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 5348
+    },
+    {
+      "epoch": 0.05349,
+      "grad_norm": 0.8248401284217834,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 5349
+    },
+    {
+      "epoch": 0.0535,
+      "grad_norm": 0.6131325960159302,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 5350
+    },
+    {
+      "epoch": 0.05351,
+      "grad_norm": 0.7400457859039307,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 5351
+    },
+    {
+      "epoch": 0.05352,
+      "grad_norm": 0.8651459813117981,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 5352
+    },
+    {
+      "epoch": 0.05353,
+      "grad_norm": 0.7526074051856995,
+      "learning_rate": 0.003,
+      "loss": 4.07,
+      "step": 5353
+    },
+    {
+      "epoch": 0.05354,
+      "grad_norm": 0.6973598599433899,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 5354
+    },
+    {
+      "epoch": 0.05355,
+      "grad_norm": 0.7028447985649109,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 5355
+    },
+    {
+      "epoch": 0.05356,
+      "grad_norm": 0.7545419335365295,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 5356
+    },
+    {
+      "epoch": 0.05357,
+      "grad_norm": 0.7842448949813843,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 5357
+    },
+    {
+      "epoch": 0.05358,
+      "grad_norm": 0.8168363571166992,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 5358
+    },
+    {
+      "epoch": 0.05359,
+      "grad_norm": 0.8426969647407532,
+      "learning_rate": 0.003,
+      "loss": 4.0818,
+      "step": 5359
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.7467427849769592,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 5360
+    },
+    {
+      "epoch": 0.05361,
+      "grad_norm": 0.7218089699745178,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 5361
+    },
+    {
+      "epoch": 0.05362,
+      "grad_norm": 0.8307386636734009,
+      "learning_rate": 0.003,
+      "loss": 4.0946,
+      "step": 5362
+    },
+    {
+      "epoch": 0.05363,
+      "grad_norm": 0.8872959017753601,
+      "learning_rate": 0.003,
+      "loss": 4.1001,
+      "step": 5363
+    },
+    {
+      "epoch": 0.05364,
+      "grad_norm": 0.7643839716911316,
+      "learning_rate": 0.003,
+      "loss": 4.0927,
+      "step": 5364
+    },
+    {
+      "epoch": 0.05365,
+      "grad_norm": 0.7082914113998413,
+      "learning_rate": 0.003,
+      "loss": 4.0862,
+      "step": 5365
+    },
+    {
+      "epoch": 0.05366,
+      "grad_norm": 0.8015276789665222,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 5366
+    },
+    {
+      "epoch": 0.05367,
+      "grad_norm": 0.8754156827926636,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 5367
+    },
+    {
+      "epoch": 0.05368,
+      "grad_norm": 1.0555367469787598,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 5368
+    },
+    {
+      "epoch": 0.05369,
+      "grad_norm": 1.0798786878585815,
+      "learning_rate": 0.003,
+      "loss": 4.0871,
+      "step": 5369
+    },
+    {
+      "epoch": 0.0537,
+      "grad_norm": 0.8246363997459412,
+      "learning_rate": 0.003,
+      "loss": 4.0826,
+      "step": 5370
+    },
+    {
+      "epoch": 0.05371,
+      "grad_norm": 0.5798512697219849,
+      "learning_rate": 0.003,
+      "loss": 4.0964,
+      "step": 5371
+    },
+    {
+      "epoch": 0.05372,
+      "grad_norm": 0.6000420451164246,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 5372
+    },
+    {
+      "epoch": 0.05373,
+      "grad_norm": 0.6344287395477295,
+      "learning_rate": 0.003,
+      "loss": 4.0893,
+      "step": 5373
+    },
+    {
+      "epoch": 0.05374,
+      "grad_norm": 0.7042878270149231,
+      "learning_rate": 0.003,
+      "loss": 4.0662,
+      "step": 5374
+    },
+    {
+      "epoch": 0.05375,
+      "grad_norm": 0.728354275226593,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 5375
+    },
+    {
+      "epoch": 0.05376,
+      "grad_norm": 0.6129562854766846,
+      "learning_rate": 0.003,
+      "loss": 4.0872,
+      "step": 5376
+    },
+    {
+      "epoch": 0.05377,
+      "grad_norm": 0.4737764000892639,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 5377
+    },
+    {
+      "epoch": 0.05378,
+      "grad_norm": 0.593817949295044,
+      "learning_rate": 0.003,
+      "loss": 4.0792,
+      "step": 5378
+    },
+    {
+      "epoch": 0.05379,
+      "grad_norm": 0.6451365351676941,
+      "learning_rate": 0.003,
+      "loss": 4.0861,
+      "step": 5379
+    },
+    {
+      "epoch": 0.0538,
+      "grad_norm": 0.5840117335319519,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 5380
+    },
+    {
+      "epoch": 0.05381,
+      "grad_norm": 0.5710886716842651,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 5381
+    },
+    {
+      "epoch": 0.05382,
+      "grad_norm": 0.5268121957778931,
+      "learning_rate": 0.003,
+      "loss": 4.0792,
+      "step": 5382
+    },
+    {
+      "epoch": 0.05383,
+      "grad_norm": 0.4619216322898865,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 5383
+    },
+    {
+      "epoch": 0.05384,
+      "grad_norm": 0.5457313060760498,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 5384
+    },
+    {
+      "epoch": 0.05385,
+      "grad_norm": 0.5551273226737976,
+      "learning_rate": 0.003,
+      "loss": 4.082,
+      "step": 5385
+    },
+    {
+      "epoch": 0.05386,
+      "grad_norm": 0.6332549452781677,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 5386
+    },
+    {
+      "epoch": 0.05387,
+      "grad_norm": 0.7257327437400818,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 5387
+    },
+    {
+      "epoch": 0.05388,
+      "grad_norm": 0.7519972920417786,
+      "learning_rate": 0.003,
+      "loss": 4.0913,
+      "step": 5388
+    },
+    {
+      "epoch": 0.05389,
+      "grad_norm": 0.7855745553970337,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 5389
+    },
+    {
+      "epoch": 0.0539,
+      "grad_norm": 0.8400787115097046,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 5390
+    },
+    {
+      "epoch": 0.05391,
+      "grad_norm": 0.8107534050941467,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 5391
+    },
+    {
+      "epoch": 0.05392,
+      "grad_norm": 0.6744571328163147,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 5392
+    },
+    {
+      "epoch": 0.05393,
+      "grad_norm": 0.5944811105728149,
+      "learning_rate": 0.003,
+      "loss": 4.0815,
+      "step": 5393
+    },
+    {
+      "epoch": 0.05394,
+      "grad_norm": 0.581836998462677,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 5394
+    },
+    {
+      "epoch": 0.05395,
+      "grad_norm": 0.5920911431312561,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 5395
+    },
+    {
+      "epoch": 0.05396,
+      "grad_norm": 0.6112498641014099,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 5396
+    },
+    {
+      "epoch": 0.05397,
+      "grad_norm": 0.6823821067810059,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 5397
+    },
+    {
+      "epoch": 0.05398,
+      "grad_norm": 0.807902991771698,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 5398
+    },
+    {
+      "epoch": 0.05399,
+      "grad_norm": 1.0339542627334595,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 5399
+    },
+    {
+      "epoch": 0.054,
+      "grad_norm": 1.1419845819473267,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 5400
+    },
+    {
+      "epoch": 0.05401,
+      "grad_norm": 0.7827608585357666,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 5401
+    },
+    {
+      "epoch": 0.05402,
+      "grad_norm": 0.725298285484314,
+      "learning_rate": 0.003,
+      "loss": 4.0887,
+      "step": 5402
+    },
+    {
+      "epoch": 0.05403,
+      "grad_norm": 0.7886972427368164,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 5403
+    },
+    {
+      "epoch": 0.05404,
+      "grad_norm": 0.8297346234321594,
+      "learning_rate": 0.003,
+      "loss": 4.0705,
+      "step": 5404
+    },
+    {
+      "epoch": 0.05405,
+      "grad_norm": 0.8761243224143982,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 5405
+    },
+    {
+      "epoch": 0.05406,
+      "grad_norm": 0.9909228086471558,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 5406
+    },
+    {
+      "epoch": 0.05407,
+      "grad_norm": 1.0708101987838745,
+      "learning_rate": 0.003,
+      "loss": 4.0879,
+      "step": 5407
+    },
+    {
+      "epoch": 0.05408,
+      "grad_norm": 0.9255216717720032,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 5408
+    },
+    {
+      "epoch": 0.05409,
+      "grad_norm": 1.0283777713775635,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 5409
+    },
+    {
+      "epoch": 0.0541,
+      "grad_norm": 1.0340330600738525,
+      "learning_rate": 0.003,
+      "loss": 4.1005,
+      "step": 5410
+    },
+    {
+      "epoch": 0.05411,
+      "grad_norm": 0.8593031167984009,
+      "learning_rate": 0.003,
+      "loss": 4.0924,
+      "step": 5411
+    },
+    {
+      "epoch": 0.05412,
+      "grad_norm": 0.7811865210533142,
+      "learning_rate": 0.003,
+      "loss": 4.09,
+      "step": 5412
+    },
+    {
+      "epoch": 0.05413,
+      "grad_norm": 0.8573576211929321,
+      "learning_rate": 0.003,
+      "loss": 4.0854,
+      "step": 5413
+    },
+    {
+      "epoch": 0.05414,
+      "grad_norm": 0.9427306652069092,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 5414
+    },
+    {
+      "epoch": 0.05415,
+      "grad_norm": 1.2074553966522217,
+      "learning_rate": 0.003,
+      "loss": 4.1187,
+      "step": 5415
+    },
+    {
+      "epoch": 0.05416,
+      "grad_norm": 0.827156662940979,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 5416
+    },
+    {
+      "epoch": 0.05417,
+      "grad_norm": 0.715948760509491,
+      "learning_rate": 0.003,
+      "loss": 4.0869,
+      "step": 5417
+    },
+    {
+      "epoch": 0.05418,
+      "grad_norm": 0.6932428479194641,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 5418
+    },
+    {
+      "epoch": 0.05419,
+      "grad_norm": 0.849570095539093,
+      "learning_rate": 0.003,
+      "loss": 4.0952,
+      "step": 5419
+    },
+    {
+      "epoch": 0.0542,
+      "grad_norm": 0.8560599088668823,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 5420
+    },
+    {
+      "epoch": 0.05421,
+      "grad_norm": 0.7644525170326233,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 5421
+    },
+    {
+      "epoch": 0.05422,
+      "grad_norm": 0.652097761631012,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 5422
+    },
+    {
+      "epoch": 0.05423,
+      "grad_norm": 0.5640384554862976,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 5423
+    },
+    {
+      "epoch": 0.05424,
+      "grad_norm": 0.5313403010368347,
+      "learning_rate": 0.003,
+      "loss": 4.1068,
+      "step": 5424
+    },
+    {
+      "epoch": 0.05425,
+      "grad_norm": 0.6055113077163696,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 5425
+    },
+    {
+      "epoch": 0.05426,
+      "grad_norm": 0.5395570993423462,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 5426
+    },
+    {
+      "epoch": 0.05427,
+      "grad_norm": 0.5673665404319763,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 5427
+    },
+    {
+      "epoch": 0.05428,
+      "grad_norm": 0.6410355567932129,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 5428
+    },
+    {
+      "epoch": 0.05429,
+      "grad_norm": 0.7815173268318176,
+      "learning_rate": 0.003,
+      "loss": 4.0808,
+      "step": 5429
+    },
+    {
+      "epoch": 0.0543,
+      "grad_norm": 0.8199576735496521,
+      "learning_rate": 0.003,
+      "loss": 4.1021,
+      "step": 5430
+    },
+    {
+      "epoch": 0.05431,
+      "grad_norm": 0.7338206768035889,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 5431
+    },
+    {
+      "epoch": 0.05432,
+      "grad_norm": 0.6335470676422119,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 5432
+    },
+    {
+      "epoch": 0.05433,
+      "grad_norm": 0.599968433380127,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 5433
+    },
+    {
+      "epoch": 0.05434,
+      "grad_norm": 0.5905585885047913,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 5434
+    },
+    {
+      "epoch": 0.05435,
+      "grad_norm": 0.6752657890319824,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 5435
+    },
+    {
+      "epoch": 0.05436,
+      "grad_norm": 0.6819540858268738,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 5436
+    },
+    {
+      "epoch": 0.05437,
+      "grad_norm": 0.6730635762214661,
+      "learning_rate": 0.003,
+      "loss": 4.0772,
+      "step": 5437
+    },
+    {
+      "epoch": 0.05438,
+      "grad_norm": 0.7070807814598083,
+      "learning_rate": 0.003,
+      "loss": 4.0694,
+      "step": 5438
+    },
+    {
+      "epoch": 0.05439,
+      "grad_norm": 0.722388744354248,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 5439
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.6570700407028198,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 5440
+    },
+    {
+      "epoch": 0.05441,
+      "grad_norm": 0.6882500648498535,
+      "learning_rate": 0.003,
+      "loss": 4.0869,
+      "step": 5441
+    },
+    {
+      "epoch": 0.05442,
+      "grad_norm": 0.7646239995956421,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 5442
+    },
+    {
+      "epoch": 0.05443,
+      "grad_norm": 0.7554492354393005,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 5443
+    },
+    {
+      "epoch": 0.05444,
+      "grad_norm": 0.7602279782295227,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 5444
+    },
+    {
+      "epoch": 0.05445,
+      "grad_norm": 0.6765475273132324,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 5445
+    },
+    {
+      "epoch": 0.05446,
+      "grad_norm": 0.5565710067749023,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 5446
+    },
+    {
+      "epoch": 0.05447,
+      "grad_norm": 0.49256643652915955,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 5447
+    },
+    {
+      "epoch": 0.05448,
+      "grad_norm": 0.5195604562759399,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 5448
+    },
+    {
+      "epoch": 0.05449,
+      "grad_norm": 0.5942506194114685,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 5449
+    },
+    {
+      "epoch": 0.0545,
+      "grad_norm": 0.6499919891357422,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 5450
+    },
+    {
+      "epoch": 0.05451,
+      "grad_norm": 0.6596571803092957,
+      "learning_rate": 0.003,
+      "loss": 4.0955,
+      "step": 5451
+    },
+    {
+      "epoch": 0.05452,
+      "grad_norm": 0.6069369316101074,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 5452
+    },
+    {
+      "epoch": 0.05453,
+      "grad_norm": 0.6619586944580078,
+      "learning_rate": 0.003,
+      "loss": 4.0566,
+      "step": 5453
+    },
+    {
+      "epoch": 0.05454,
+      "grad_norm": 0.6113542318344116,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 5454
+    },
+    {
+      "epoch": 0.05455,
+      "grad_norm": 0.598733127117157,
+      "learning_rate": 0.003,
+      "loss": 4.0882,
+      "step": 5455
+    },
+    {
+      "epoch": 0.05456,
+      "grad_norm": 0.5969057679176331,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 5456
+    },
+    {
+      "epoch": 0.05457,
+      "grad_norm": 0.6789122819900513,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 5457
+    },
+    {
+      "epoch": 0.05458,
+      "grad_norm": 0.805001974105835,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 5458
+    },
+    {
+      "epoch": 0.05459,
+      "grad_norm": 1.0944715738296509,
+      "learning_rate": 0.003,
+      "loss": 4.0821,
+      "step": 5459
+    },
+    {
+      "epoch": 0.0546,
+      "grad_norm": 0.9038564562797546,
+      "learning_rate": 0.003,
+      "loss": 4.07,
+      "step": 5460
+    },
+    {
+      "epoch": 0.05461,
+      "grad_norm": 0.8081420660018921,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 5461
+    },
+    {
+      "epoch": 0.05462,
+      "grad_norm": 0.7987942099571228,
+      "learning_rate": 0.003,
+      "loss": 4.0981,
+      "step": 5462
+    },
+    {
+      "epoch": 0.05463,
+      "grad_norm": 0.8367160558700562,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 5463
+    },
+    {
+      "epoch": 0.05464,
+      "grad_norm": 0.8023653626441956,
+      "learning_rate": 0.003,
+      "loss": 4.1021,
+      "step": 5464
+    },
+    {
+      "epoch": 0.05465,
+      "grad_norm": 0.8824062347412109,
+      "learning_rate": 0.003,
+      "loss": 4.1011,
+      "step": 5465
+    },
+    {
+      "epoch": 0.05466,
+      "grad_norm": 0.8047217130661011,
+      "learning_rate": 0.003,
+      "loss": 4.1078,
+      "step": 5466
+    },
+    {
+      "epoch": 0.05467,
+      "grad_norm": 0.6920082569122314,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 5467
+    },
+    {
+      "epoch": 0.05468,
+      "grad_norm": 0.7836238145828247,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 5468
+    },
+    {
+      "epoch": 0.05469,
+      "grad_norm": 1.0192773342132568,
+      "learning_rate": 0.003,
+      "loss": 4.0633,
+      "step": 5469
+    },
+    {
+      "epoch": 0.0547,
+      "grad_norm": 1.280296802520752,
+      "learning_rate": 0.003,
+      "loss": 4.0765,
+      "step": 5470
+    },
+    {
+      "epoch": 0.05471,
+      "grad_norm": 0.6577931046485901,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 5471
+    },
+    {
+      "epoch": 0.05472,
+      "grad_norm": 0.7552001476287842,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 5472
+    },
+    {
+      "epoch": 0.05473,
+      "grad_norm": 0.7697224020957947,
+      "learning_rate": 0.003,
+      "loss": 4.0893,
+      "step": 5473
+    },
+    {
+      "epoch": 0.05474,
+      "grad_norm": 0.8115640878677368,
+      "learning_rate": 0.003,
+      "loss": 4.064,
+      "step": 5474
+    },
+    {
+      "epoch": 0.05475,
+      "grad_norm": 0.8436885476112366,
+      "learning_rate": 0.003,
+      "loss": 4.1221,
+      "step": 5475
+    },
+    {
+      "epoch": 0.05476,
+      "grad_norm": 0.9373152256011963,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 5476
+    },
+    {
+      "epoch": 0.05477,
+      "grad_norm": 0.8886646032333374,
+      "learning_rate": 0.003,
+      "loss": 4.0839,
+      "step": 5477
+    },
+    {
+      "epoch": 0.05478,
+      "grad_norm": 0.7719184756278992,
+      "learning_rate": 0.003,
+      "loss": 4.0792,
+      "step": 5478
+    },
+    {
+      "epoch": 0.05479,
+      "grad_norm": 0.7790704965591431,
+      "learning_rate": 0.003,
+      "loss": 4.0918,
+      "step": 5479
+    },
+    {
+      "epoch": 0.0548,
+      "grad_norm": 0.7557684779167175,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 5480
+    },
+    {
+      "epoch": 0.05481,
+      "grad_norm": 0.761655867099762,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 5481
+    },
+    {
+      "epoch": 0.05482,
+      "grad_norm": 0.7313814163208008,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 5482
+    },
+    {
+      "epoch": 0.05483,
+      "grad_norm": 0.7328376770019531,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 5483
+    },
+    {
+      "epoch": 0.05484,
+      "grad_norm": 0.6982470750808716,
+      "learning_rate": 0.003,
+      "loss": 4.0938,
+      "step": 5484
+    },
+    {
+      "epoch": 0.05485,
+      "grad_norm": 0.5700132846832275,
+      "learning_rate": 0.003,
+      "loss": 4.0978,
+      "step": 5485
+    },
+    {
+      "epoch": 0.05486,
+      "grad_norm": 0.5434563755989075,
+      "learning_rate": 0.003,
+      "loss": 4.0775,
+      "step": 5486
+    },
+    {
+      "epoch": 0.05487,
+      "grad_norm": 0.48218417167663574,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 5487
+    },
+    {
+      "epoch": 0.05488,
+      "grad_norm": 0.5017120242118835,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 5488
+    },
+    {
+      "epoch": 0.05489,
+      "grad_norm": 0.48504316806793213,
+      "learning_rate": 0.003,
+      "loss": 4.0942,
+      "step": 5489
+    },
+    {
+      "epoch": 0.0549,
+      "grad_norm": 0.5177329778671265,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 5490
+    },
+    {
+      "epoch": 0.05491,
+      "grad_norm": 0.6282394528388977,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 5491
+    },
+    {
+      "epoch": 0.05492,
+      "grad_norm": 0.8392156958580017,
+      "learning_rate": 0.003,
+      "loss": 4.0907,
+      "step": 5492
+    },
+    {
+      "epoch": 0.05493,
+      "grad_norm": 1.1088998317718506,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 5493
+    },
+    {
+      "epoch": 0.05494,
+      "grad_norm": 1.0195647478103638,
+      "learning_rate": 0.003,
+      "loss": 4.0934,
+      "step": 5494
+    },
+    {
+      "epoch": 0.05495,
+      "grad_norm": 0.8553412556648254,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 5495
+    },
+    {
+      "epoch": 0.05496,
+      "grad_norm": 0.7595451474189758,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 5496
+    },
+    {
+      "epoch": 0.05497,
+      "grad_norm": 0.7024115920066833,
+      "learning_rate": 0.003,
+      "loss": 4.0854,
+      "step": 5497
+    },
+    {
+      "epoch": 0.05498,
+      "grad_norm": 0.5727024674415588,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 5498
+    },
+    {
+      "epoch": 0.05499,
+      "grad_norm": 0.5514757037162781,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 5499
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.6886269450187683,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 5500
+    },
+    {
+      "epoch": 0.05501,
+      "grad_norm": 0.8767561912536621,
+      "learning_rate": 0.003,
+      "loss": 4.0665,
+      "step": 5501
+    },
+    {
+      "epoch": 0.05502,
+      "grad_norm": 0.9685607552528381,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 5502
+    },
+    {
+      "epoch": 0.05503,
+      "grad_norm": 0.8435690402984619,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 5503
+    },
+    {
+      "epoch": 0.05504,
+      "grad_norm": 0.6953498721122742,
+      "learning_rate": 0.003,
+      "loss": 4.1101,
+      "step": 5504
+    },
+    {
+      "epoch": 0.05505,
+      "grad_norm": 0.6831181645393372,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 5505
+    },
+    {
+      "epoch": 0.05506,
+      "grad_norm": 0.6843089461326599,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 5506
+    },
+    {
+      "epoch": 0.05507,
+      "grad_norm": 0.5160518884658813,
+      "learning_rate": 0.003,
+      "loss": 4.0918,
+      "step": 5507
+    },
+    {
+      "epoch": 0.05508,
+      "grad_norm": 0.5106964111328125,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 5508
+    },
+    {
+      "epoch": 0.05509,
+      "grad_norm": 0.49985066056251526,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 5509
+    },
+    {
+      "epoch": 0.0551,
+      "grad_norm": 0.511665403842926,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 5510
+    },
+    {
+      "epoch": 0.05511,
+      "grad_norm": 0.4913787543773651,
+      "learning_rate": 0.003,
+      "loss": 4.07,
+      "step": 5511
+    },
+    {
+      "epoch": 0.05512,
+      "grad_norm": 0.5255283713340759,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 5512
+    },
+    {
+      "epoch": 0.05513,
+      "grad_norm": 0.5358270406723022,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 5513
+    },
+    {
+      "epoch": 0.05514,
+      "grad_norm": 0.4987005293369293,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 5514
+    },
+    {
+      "epoch": 0.05515,
+      "grad_norm": 0.43703991174697876,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 5515
+    },
+    {
+      "epoch": 0.05516,
+      "grad_norm": 0.45214366912841797,
+      "learning_rate": 0.003,
+      "loss": 4.0806,
+      "step": 5516
+    },
+    {
+      "epoch": 0.05517,
+      "grad_norm": 0.4733821749687195,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 5517
+    },
+    {
+      "epoch": 0.05518,
+      "grad_norm": 0.5811468958854675,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 5518
+    },
+    {
+      "epoch": 0.05519,
+      "grad_norm": 0.6899372339248657,
+      "learning_rate": 0.003,
+      "loss": 4.0941,
+      "step": 5519
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.7010800838470459,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 5520
+    },
+    {
+      "epoch": 0.05521,
+      "grad_norm": 0.7988923788070679,
+      "learning_rate": 0.003,
+      "loss": 4.0857,
+      "step": 5521
+    },
+    {
+      "epoch": 0.05522,
+      "grad_norm": 0.8185820579528809,
+      "learning_rate": 0.003,
+      "loss": 4.1009,
+      "step": 5522
+    },
+    {
+      "epoch": 0.05523,
+      "grad_norm": 0.6712361574172974,
+      "learning_rate": 0.003,
+      "loss": 4.0737,
+      "step": 5523
+    },
+    {
+      "epoch": 0.05524,
+      "grad_norm": 0.8089494705200195,
+      "learning_rate": 0.003,
+      "loss": 4.0875,
+      "step": 5524
+    },
+    {
+      "epoch": 0.05525,
+      "grad_norm": 0.9261167049407959,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 5525
+    },
+    {
+      "epoch": 0.05526,
+      "grad_norm": 0.9133365154266357,
+      "learning_rate": 0.003,
+      "loss": 4.0691,
+      "step": 5526
+    },
+    {
+      "epoch": 0.05527,
+      "grad_norm": 0.8450931310653687,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 5527
+    },
+    {
+      "epoch": 0.05528,
+      "grad_norm": 0.8984810709953308,
+      "learning_rate": 0.003,
+      "loss": 4.0803,
+      "step": 5528
+    },
+    {
+      "epoch": 0.05529,
+      "grad_norm": 0.9497979283332825,
+      "learning_rate": 0.003,
+      "loss": 4.0965,
+      "step": 5529
+    },
+    {
+      "epoch": 0.0553,
+      "grad_norm": 1.1338160037994385,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 5530
+    },
+    {
+      "epoch": 0.05531,
+      "grad_norm": 0.8146209120750427,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 5531
+    },
+    {
+      "epoch": 0.05532,
+      "grad_norm": 0.6237087249755859,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 5532
+    },
+    {
+      "epoch": 0.05533,
+      "grad_norm": 0.5327185392379761,
+      "learning_rate": 0.003,
+      "loss": 4.0854,
+      "step": 5533
+    },
+    {
+      "epoch": 0.05534,
+      "grad_norm": 0.6118891835212708,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 5534
+    },
+    {
+      "epoch": 0.05535,
+      "grad_norm": 0.6608664989471436,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 5535
+    },
+    {
+      "epoch": 0.05536,
+      "grad_norm": 0.7102659344673157,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 5536
+    },
+    {
+      "epoch": 0.05537,
+      "grad_norm": 0.684587836265564,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 5537
+    },
+    {
+      "epoch": 0.05538,
+      "grad_norm": 0.7321231961250305,
+      "learning_rate": 0.003,
+      "loss": 4.0701,
+      "step": 5538
+    },
+    {
+      "epoch": 0.05539,
+      "grad_norm": 0.798923909664154,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 5539
+    },
+    {
+      "epoch": 0.0554,
+      "grad_norm": 0.93647700548172,
+      "learning_rate": 0.003,
+      "loss": 4.1197,
+      "step": 5540
+    },
+    {
+      "epoch": 0.05541,
+      "grad_norm": 1.2185319662094116,
+      "learning_rate": 0.003,
+      "loss": 4.1074,
+      "step": 5541
+    },
+    {
+      "epoch": 0.05542,
+      "grad_norm": 0.9062533378601074,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 5542
+    },
+    {
+      "epoch": 0.05543,
+      "grad_norm": 0.7092111706733704,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 5543
+    },
+    {
+      "epoch": 0.05544,
+      "grad_norm": 0.5959237217903137,
+      "learning_rate": 0.003,
+      "loss": 4.1011,
+      "step": 5544
+    },
+    {
+      "epoch": 0.05545,
+      "grad_norm": 0.6062164306640625,
+      "learning_rate": 0.003,
+      "loss": 4.0873,
+      "step": 5545
+    },
+    {
+      "epoch": 0.05546,
+      "grad_norm": 0.64149010181427,
+      "learning_rate": 0.003,
+      "loss": 4.0704,
+      "step": 5546
+    },
+    {
+      "epoch": 0.05547,
+      "grad_norm": 0.6897364258766174,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 5547
+    },
+    {
+      "epoch": 0.05548,
+      "grad_norm": 0.7177988886833191,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 5548
+    },
+    {
+      "epoch": 0.05549,
+      "grad_norm": 0.7397828102111816,
+      "learning_rate": 0.003,
+      "loss": 4.0923,
+      "step": 5549
+    },
+    {
+      "epoch": 0.0555,
+      "grad_norm": 0.7000327110290527,
+      "learning_rate": 0.003,
+      "loss": 4.0696,
+      "step": 5550
+    },
+    {
+      "epoch": 0.05551,
+      "grad_norm": 0.657970666885376,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 5551
+    },
+    {
+      "epoch": 0.05552,
+      "grad_norm": 0.5877078771591187,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 5552
+    },
+    {
+      "epoch": 0.05553,
+      "grad_norm": 0.6410927176475525,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 5553
+    },
+    {
+      "epoch": 0.05554,
+      "grad_norm": 0.7099154591560364,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 5554
+    },
+    {
+      "epoch": 0.05555,
+      "grad_norm": 0.9199838042259216,
+      "learning_rate": 0.003,
+      "loss": 4.109,
+      "step": 5555
+    },
+    {
+      "epoch": 0.05556,
+      "grad_norm": 1.068846344947815,
+      "learning_rate": 0.003,
+      "loss": 4.0691,
+      "step": 5556
+    },
+    {
+      "epoch": 0.05557,
+      "grad_norm": 1.1641470193862915,
+      "learning_rate": 0.003,
+      "loss": 4.0991,
+      "step": 5557
+    },
+    {
+      "epoch": 0.05558,
+      "grad_norm": 0.830019474029541,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 5558
+    },
+    {
+      "epoch": 0.05559,
+      "grad_norm": 0.6737862825393677,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 5559
+    },
+    {
+      "epoch": 0.0556,
+      "grad_norm": 0.725947916507721,
+      "learning_rate": 0.003,
+      "loss": 4.1006,
+      "step": 5560
+    },
+    {
+      "epoch": 0.05561,
+      "grad_norm": 0.8011854887008667,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 5561
+    },
+    {
+      "epoch": 0.05562,
+      "grad_norm": 0.900126039981842,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 5562
+    },
+    {
+      "epoch": 0.05563,
+      "grad_norm": 0.9443784952163696,
+      "learning_rate": 0.003,
+      "loss": 4.0871,
+      "step": 5563
+    },
+    {
+      "epoch": 0.05564,
+      "grad_norm": 0.8231141567230225,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 5564
+    },
+    {
+      "epoch": 0.05565,
+      "grad_norm": 0.6643379330635071,
+      "learning_rate": 0.003,
+      "loss": 4.103,
+      "step": 5565
+    },
+    {
+      "epoch": 0.05566,
+      "grad_norm": 0.6313121318817139,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 5566
+    },
+    {
+      "epoch": 0.05567,
+      "grad_norm": 0.651742160320282,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 5567
+    },
+    {
+      "epoch": 0.05568,
+      "grad_norm": 0.6961485743522644,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 5568
+    },
+    {
+      "epoch": 0.05569,
+      "grad_norm": 0.7175015211105347,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 5569
+    },
+    {
+      "epoch": 0.0557,
+      "grad_norm": 0.8055482506752014,
+      "learning_rate": 0.003,
+      "loss": 4.0912,
+      "step": 5570
+    },
+    {
+      "epoch": 0.05571,
+      "grad_norm": 0.8521894812583923,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 5571
+    },
+    {
+      "epoch": 0.05572,
+      "grad_norm": 0.8425765037536621,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 5572
+    },
+    {
+      "epoch": 0.05573,
+      "grad_norm": 0.8626205325126648,
+      "learning_rate": 0.003,
+      "loss": 4.0996,
+      "step": 5573
+    },
+    {
+      "epoch": 0.05574,
+      "grad_norm": 0.9191294312477112,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 5574
+    },
+    {
+      "epoch": 0.05575,
+      "grad_norm": 0.7949203252792358,
+      "learning_rate": 0.003,
+      "loss": 4.0705,
+      "step": 5575
+    },
+    {
+      "epoch": 0.05576,
+      "grad_norm": 0.6600616574287415,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 5576
+    },
+    {
+      "epoch": 0.05577,
+      "grad_norm": 0.5764843821525574,
+      "learning_rate": 0.003,
+      "loss": 4.064,
+      "step": 5577
+    },
+    {
+      "epoch": 0.05578,
+      "grad_norm": 0.5156270265579224,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 5578
+    },
+    {
+      "epoch": 0.05579,
+      "grad_norm": 0.5665589570999146,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 5579
+    },
+    {
+      "epoch": 0.0558,
+      "grad_norm": 0.5314322710037231,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 5580
+    },
+    {
+      "epoch": 0.05581,
+      "grad_norm": 0.5085921287536621,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 5581
+    },
+    {
+      "epoch": 0.05582,
+      "grad_norm": 0.5511303544044495,
+      "learning_rate": 0.003,
+      "loss": 4.0897,
+      "step": 5582
+    },
+    {
+      "epoch": 0.05583,
+      "grad_norm": 0.5478542447090149,
+      "learning_rate": 0.003,
+      "loss": 4.0632,
+      "step": 5583
+    },
+    {
+      "epoch": 0.05584,
+      "grad_norm": 0.601093590259552,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 5584
+    },
+    {
+      "epoch": 0.05585,
+      "grad_norm": 0.6396297812461853,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 5585
+    },
+    {
+      "epoch": 0.05586,
+      "grad_norm": 0.5573460459709167,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 5586
+    },
+    {
+      "epoch": 0.05587,
+      "grad_norm": 0.5427320599555969,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 5587
+    },
+    {
+      "epoch": 0.05588,
+      "grad_norm": 0.5215794444084167,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 5588
+    },
+    {
+      "epoch": 0.05589,
+      "grad_norm": 0.5945990085601807,
+      "learning_rate": 0.003,
+      "loss": 4.0652,
+      "step": 5589
+    },
+    {
+      "epoch": 0.0559,
+      "grad_norm": 0.7637835144996643,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 5590
+    },
+    {
+      "epoch": 0.05591,
+      "grad_norm": 0.8685312271118164,
+      "learning_rate": 0.003,
+      "loss": 4.0751,
+      "step": 5591
+    },
+    {
+      "epoch": 0.05592,
+      "grad_norm": 1.0447856187820435,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 5592
+    },
+    {
+      "epoch": 0.05593,
+      "grad_norm": 0.9590107798576355,
+      "learning_rate": 0.003,
+      "loss": 4.0709,
+      "step": 5593
+    },
+    {
+      "epoch": 0.05594,
+      "grad_norm": 0.6435539722442627,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 5594
+    },
+    {
+      "epoch": 0.05595,
+      "grad_norm": 0.6978614330291748,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 5595
+    },
+    {
+      "epoch": 0.05596,
+      "grad_norm": 0.8227618932723999,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 5596
+    },
+    {
+      "epoch": 0.05597,
+      "grad_norm": 0.8983748555183411,
+      "learning_rate": 0.003,
+      "loss": 4.0877,
+      "step": 5597
+    },
+    {
+      "epoch": 0.05598,
+      "grad_norm": 0.864307165145874,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 5598
+    },
+    {
+      "epoch": 0.05599,
+      "grad_norm": 0.7344439029693604,
+      "learning_rate": 0.003,
+      "loss": 4.0959,
+      "step": 5599
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.5893545746803284,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 5600
+    },
+    {
+      "epoch": 0.05601,
+      "grad_norm": 0.6228591203689575,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 5601
+    },
+    {
+      "epoch": 0.05602,
+      "grad_norm": 0.7110965847969055,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 5602
+    },
+    {
+      "epoch": 0.05603,
+      "grad_norm": 0.6936246752738953,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 5603
+    },
+    {
+      "epoch": 0.05604,
+      "grad_norm": 0.696966290473938,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 5604
+    },
+    {
+      "epoch": 0.05605,
+      "grad_norm": 0.7240187525749207,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 5605
+    },
+    {
+      "epoch": 0.05606,
+      "grad_norm": 0.807013750076294,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 5606
+    },
+    {
+      "epoch": 0.05607,
+      "grad_norm": 0.8161320090293884,
+      "learning_rate": 0.003,
+      "loss": 4.0985,
+      "step": 5607
+    },
+    {
+      "epoch": 0.05608,
+      "grad_norm": 0.78641277551651,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 5608
+    },
+    {
+      "epoch": 0.05609,
+      "grad_norm": 0.6392374038696289,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 5609
+    },
+    {
+      "epoch": 0.0561,
+      "grad_norm": 0.7331043481826782,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 5610
+    },
+    {
+      "epoch": 0.05611,
+      "grad_norm": 0.7653400301933289,
+      "learning_rate": 0.003,
+      "loss": 4.0726,
+      "step": 5611
+    },
+    {
+      "epoch": 0.05612,
+      "grad_norm": 0.8886222243309021,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 5612
+    },
+    {
+      "epoch": 0.05613,
+      "grad_norm": 1.155055284500122,
+      "learning_rate": 0.003,
+      "loss": 4.1033,
+      "step": 5613
+    },
+    {
+      "epoch": 0.05614,
+      "grad_norm": 0.8169151544570923,
+      "learning_rate": 0.003,
+      "loss": 4.0775,
+      "step": 5614
+    },
+    {
+      "epoch": 0.05615,
+      "grad_norm": 0.7616031765937805,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 5615
+    },
+    {
+      "epoch": 0.05616,
+      "grad_norm": 0.7899324297904968,
+      "learning_rate": 0.003,
+      "loss": 4.0603,
+      "step": 5616
+    },
+    {
+      "epoch": 0.05617,
+      "grad_norm": 0.7144830226898193,
+      "learning_rate": 0.003,
+      "loss": 4.0826,
+      "step": 5617
+    },
+    {
+      "epoch": 0.05618,
+      "grad_norm": 0.6747182011604309,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 5618
+    },
+    {
+      "epoch": 0.05619,
+      "grad_norm": 0.6562235951423645,
+      "learning_rate": 0.003,
+      "loss": 4.1157,
+      "step": 5619
+    },
+    {
+      "epoch": 0.0562,
+      "grad_norm": 0.5861763954162598,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 5620
+    },
+    {
+      "epoch": 0.05621,
+      "grad_norm": 0.5951058864593506,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 5621
+    },
+    {
+      "epoch": 0.05622,
+      "grad_norm": 0.5614970922470093,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 5622
+    },
+    {
+      "epoch": 0.05623,
+      "grad_norm": 0.5402902364730835,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 5623
+    },
+    {
+      "epoch": 0.05624,
+      "grad_norm": 0.5211602449417114,
+      "learning_rate": 0.003,
+      "loss": 4.0944,
+      "step": 5624
+    },
+    {
+      "epoch": 0.05625,
+      "grad_norm": 0.5597396492958069,
+      "learning_rate": 0.003,
+      "loss": 4.0873,
+      "step": 5625
+    },
+    {
+      "epoch": 0.05626,
+      "grad_norm": 0.6586695909500122,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 5626
+    },
+    {
+      "epoch": 0.05627,
+      "grad_norm": 0.8000897765159607,
+      "learning_rate": 0.003,
+      "loss": 4.0712,
+      "step": 5627
+    },
+    {
+      "epoch": 0.05628,
+      "grad_norm": 0.8259612321853638,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 5628
+    },
+    {
+      "epoch": 0.05629,
+      "grad_norm": 0.9033710956573486,
+      "learning_rate": 0.003,
+      "loss": 4.081,
+      "step": 5629
+    },
+    {
+      "epoch": 0.0563,
+      "grad_norm": 0.9907213449478149,
+      "learning_rate": 0.003,
+      "loss": 4.066,
+      "step": 5630
+    },
+    {
+      "epoch": 0.05631,
+      "grad_norm": 0.9617007970809937,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 5631
+    },
+    {
+      "epoch": 0.05632,
+      "grad_norm": 1.084578275680542,
+      "learning_rate": 0.003,
+      "loss": 4.0884,
+      "step": 5632
+    },
+    {
+      "epoch": 0.05633,
+      "grad_norm": 0.8903836607933044,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 5633
+    },
+    {
+      "epoch": 0.05634,
+      "grad_norm": 0.8303310871124268,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 5634
+    },
+    {
+      "epoch": 0.05635,
+      "grad_norm": 0.8191870450973511,
+      "learning_rate": 0.003,
+      "loss": 4.0842,
+      "step": 5635
+    },
+    {
+      "epoch": 0.05636,
+      "grad_norm": 0.8257316946983337,
+      "learning_rate": 0.003,
+      "loss": 4.077,
+      "step": 5636
+    },
+    {
+      "epoch": 0.05637,
+      "grad_norm": 0.8293499946594238,
+      "learning_rate": 0.003,
+      "loss": 4.1177,
+      "step": 5637
+    },
+    {
+      "epoch": 0.05638,
+      "grad_norm": 0.7823120355606079,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 5638
+    },
+    {
+      "epoch": 0.05639,
+      "grad_norm": 0.7697476148605347,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 5639
+    },
+    {
+      "epoch": 0.0564,
+      "grad_norm": 0.783143937587738,
+      "learning_rate": 0.003,
+      "loss": 4.0819,
+      "step": 5640
+    },
+    {
+      "epoch": 0.05641,
+      "grad_norm": 0.7607929110527039,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 5641
+    },
+    {
+      "epoch": 0.05642,
+      "grad_norm": 0.8684077858924866,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 5642
+    },
+    {
+      "epoch": 0.05643,
+      "grad_norm": 0.8539713025093079,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 5643
+    },
+    {
+      "epoch": 0.05644,
+      "grad_norm": 0.7667094469070435,
+      "learning_rate": 0.003,
+      "loss": 4.0766,
+      "step": 5644
+    },
+    {
+      "epoch": 0.05645,
+      "grad_norm": 0.6266912817955017,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 5645
+    },
+    {
+      "epoch": 0.05646,
+      "grad_norm": 0.5951451659202576,
+      "learning_rate": 0.003,
+      "loss": 4.0695,
+      "step": 5646
+    },
+    {
+      "epoch": 0.05647,
+      "grad_norm": 0.5667705535888672,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 5647
+    },
+    {
+      "epoch": 0.05648,
+      "grad_norm": 0.5914098024368286,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 5648
+    },
+    {
+      "epoch": 0.05649,
+      "grad_norm": 0.6961567401885986,
+      "learning_rate": 0.003,
+      "loss": 4.0748,
+      "step": 5649
+    },
+    {
+      "epoch": 0.0565,
+      "grad_norm": 0.6270180940628052,
+      "learning_rate": 0.003,
+      "loss": 4.0876,
+      "step": 5650
+    },
+    {
+      "epoch": 0.05651,
+      "grad_norm": 0.5370350480079651,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 5651
+    },
+    {
+      "epoch": 0.05652,
+      "grad_norm": 0.43584731221199036,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 5652
+    },
+    {
+      "epoch": 0.05653,
+      "grad_norm": 0.4690995514392853,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 5653
+    },
+    {
+      "epoch": 0.05654,
+      "grad_norm": 0.48374971747398376,
+      "learning_rate": 0.003,
+      "loss": 4.0816,
+      "step": 5654
+    },
+    {
+      "epoch": 0.05655,
+      "grad_norm": 0.6023554801940918,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 5655
+    },
+    {
+      "epoch": 0.05656,
+      "grad_norm": 0.8215942978858948,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 5656
+    },
+    {
+      "epoch": 0.05657,
+      "grad_norm": 1.086574912071228,
+      "learning_rate": 0.003,
+      "loss": 4.0816,
+      "step": 5657
+    },
+    {
+      "epoch": 0.05658,
+      "grad_norm": 0.9434396028518677,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 5658
+    },
+    {
+      "epoch": 0.05659,
+      "grad_norm": 0.8089240789413452,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 5659
+    },
+    {
+      "epoch": 0.0566,
+      "grad_norm": 0.6663325428962708,
+      "learning_rate": 0.003,
+      "loss": 4.0733,
+      "step": 5660
+    },
+    {
+      "epoch": 0.05661,
+      "grad_norm": 0.7506017088890076,
+      "learning_rate": 0.003,
+      "loss": 4.1112,
+      "step": 5661
+    },
+    {
+      "epoch": 0.05662,
+      "grad_norm": 0.8057636618614197,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 5662
+    },
+    {
+      "epoch": 0.05663,
+      "grad_norm": 0.7259244322776794,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 5663
+    },
+    {
+      "epoch": 0.05664,
+      "grad_norm": 0.6917450428009033,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 5664
+    },
+    {
+      "epoch": 0.05665,
+      "grad_norm": 0.7202029824256897,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 5665
+    },
+    {
+      "epoch": 0.05666,
+      "grad_norm": 0.7679628133773804,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 5666
+    },
+    {
+      "epoch": 0.05667,
+      "grad_norm": 0.725118100643158,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 5667
+    },
+    {
+      "epoch": 0.05668,
+      "grad_norm": 0.6143912672996521,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 5668
+    },
+    {
+      "epoch": 0.05669,
+      "grad_norm": 0.5653799772262573,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 5669
+    },
+    {
+      "epoch": 0.0567,
+      "grad_norm": 0.6119061708450317,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 5670
+    },
+    {
+      "epoch": 0.05671,
+      "grad_norm": 0.735202968120575,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 5671
+    },
+    {
+      "epoch": 0.05672,
+      "grad_norm": 0.7487754225730896,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 5672
+    },
+    {
+      "epoch": 0.05673,
+      "grad_norm": 0.7744391560554504,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 5673
+    },
+    {
+      "epoch": 0.05674,
+      "grad_norm": 0.7870099544525146,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 5674
+    },
+    {
+      "epoch": 0.05675,
+      "grad_norm": 0.888359785079956,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 5675
+    },
+    {
+      "epoch": 0.05676,
+      "grad_norm": 0.7705642580986023,
+      "learning_rate": 0.003,
+      "loss": 4.0782,
+      "step": 5676
+    },
+    {
+      "epoch": 0.05677,
+      "grad_norm": 0.5689957737922668,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 5677
+    },
+    {
+      "epoch": 0.05678,
+      "grad_norm": 0.5393115282058716,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 5678
+    },
+    {
+      "epoch": 0.05679,
+      "grad_norm": 0.5655098557472229,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 5679
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.6075883507728577,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 5680
+    },
+    {
+      "epoch": 0.05681,
+      "grad_norm": 0.6243870854377747,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 5681
+    },
+    {
+      "epoch": 0.05682,
+      "grad_norm": 0.6489701867103577,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 5682
+    },
+    {
+      "epoch": 0.05683,
+      "grad_norm": 0.6358552575111389,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 5683
+    },
+    {
+      "epoch": 0.05684,
+      "grad_norm": 0.6328252553939819,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 5684
+    },
+    {
+      "epoch": 0.05685,
+      "grad_norm": 0.596019446849823,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 5685
+    },
+    {
+      "epoch": 0.05686,
+      "grad_norm": 0.6575425863265991,
+      "learning_rate": 0.003,
+      "loss": 4.0834,
+      "step": 5686
+    },
+    {
+      "epoch": 0.05687,
+      "grad_norm": 0.6724589467048645,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 5687
+    },
+    {
+      "epoch": 0.05688,
+      "grad_norm": 0.6699223518371582,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 5688
+    },
+    {
+      "epoch": 0.05689,
+      "grad_norm": 0.7112768292427063,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 5689
+    },
+    {
+      "epoch": 0.0569,
+      "grad_norm": 0.8006166815757751,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 5690
+    },
+    {
+      "epoch": 0.05691,
+      "grad_norm": 0.9695540070533752,
+      "learning_rate": 0.003,
+      "loss": 4.0979,
+      "step": 5691
+    },
+    {
+      "epoch": 0.05692,
+      "grad_norm": 1.068497896194458,
+      "learning_rate": 0.003,
+      "loss": 4.0794,
+      "step": 5692
+    },
+    {
+      "epoch": 0.05693,
+      "grad_norm": 0.9201273918151855,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 5693
+    },
+    {
+      "epoch": 0.05694,
+      "grad_norm": 0.8582456111907959,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 5694
+    },
+    {
+      "epoch": 0.05695,
+      "grad_norm": 0.933272123336792,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 5695
+    },
+    {
+      "epoch": 0.05696,
+      "grad_norm": 0.8945035338401794,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 5696
+    },
+    {
+      "epoch": 0.05697,
+      "grad_norm": 0.8932650685310364,
+      "learning_rate": 0.003,
+      "loss": 4.1102,
+      "step": 5697
+    },
+    {
+      "epoch": 0.05698,
+      "grad_norm": 1.038320541381836,
+      "learning_rate": 0.003,
+      "loss": 4.1129,
+      "step": 5698
+    },
+    {
+      "epoch": 0.05699,
+      "grad_norm": 1.0705705881118774,
+      "learning_rate": 0.003,
+      "loss": 4.1001,
+      "step": 5699
+    },
+    {
+      "epoch": 0.057,
+      "grad_norm": 0.9729557037353516,
+      "learning_rate": 0.003,
+      "loss": 4.0801,
+      "step": 5700
+    },
+    {
+      "epoch": 0.05701,
+      "grad_norm": 0.9241772294044495,
+      "learning_rate": 0.003,
+      "loss": 4.1023,
+      "step": 5701
+    },
+    {
+      "epoch": 0.05702,
+      "grad_norm": 0.9331616759300232,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 5702
+    },
+    {
+      "epoch": 0.05703,
+      "grad_norm": 0.8024580478668213,
+      "learning_rate": 0.003,
+      "loss": 4.1206,
+      "step": 5703
+    },
+    {
+      "epoch": 0.05704,
+      "grad_norm": 0.726856529712677,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 5704
+    },
+    {
+      "epoch": 0.05705,
+      "grad_norm": 0.6963912844657898,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 5705
+    },
+    {
+      "epoch": 0.05706,
+      "grad_norm": 0.6920854449272156,
+      "learning_rate": 0.003,
+      "loss": 4.0704,
+      "step": 5706
+    },
+    {
+      "epoch": 0.05707,
+      "grad_norm": 0.7907909154891968,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 5707
+    },
+    {
+      "epoch": 0.05708,
+      "grad_norm": 0.8948379755020142,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 5708
+    },
+    {
+      "epoch": 0.05709,
+      "grad_norm": 0.7752033472061157,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 5709
+    },
+    {
+      "epoch": 0.0571,
+      "grad_norm": 0.577285885810852,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 5710
+    },
+    {
+      "epoch": 0.05711,
+      "grad_norm": 0.5138233304023743,
+      "learning_rate": 0.003,
+      "loss": 4.0834,
+      "step": 5711
+    },
+    {
+      "epoch": 0.05712,
+      "grad_norm": 0.5642086267471313,
+      "learning_rate": 0.003,
+      "loss": 4.0588,
+      "step": 5712
+    },
+    {
+      "epoch": 0.05713,
+      "grad_norm": 0.6007573008537292,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 5713
+    },
+    {
+      "epoch": 0.05714,
+      "grad_norm": 0.5792896151542664,
+      "learning_rate": 0.003,
+      "loss": 4.1123,
+      "step": 5714
+    },
+    {
+      "epoch": 0.05715,
+      "grad_norm": 0.551459550857544,
+      "learning_rate": 0.003,
+      "loss": 4.1101,
+      "step": 5715
+    },
+    {
+      "epoch": 0.05716,
+      "grad_norm": 0.6014549136161804,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 5716
+    },
+    {
+      "epoch": 0.05717,
+      "grad_norm": 0.6075748801231384,
+      "learning_rate": 0.003,
+      "loss": 4.0926,
+      "step": 5717
+    },
+    {
+      "epoch": 0.05718,
+      "grad_norm": 0.7294437885284424,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 5718
+    },
+    {
+      "epoch": 0.05719,
+      "grad_norm": 0.8543415069580078,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 5719
+    },
+    {
+      "epoch": 0.0572,
+      "grad_norm": 0.8952844738960266,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 5720
+    },
+    {
+      "epoch": 0.05721,
+      "grad_norm": 0.8221727013587952,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 5721
+    },
+    {
+      "epoch": 0.05722,
+      "grad_norm": 0.8208548426628113,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 5722
+    },
+    {
+      "epoch": 0.05723,
+      "grad_norm": 0.8748874664306641,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 5723
+    },
+    {
+      "epoch": 0.05724,
+      "grad_norm": 0.747161328792572,
+      "learning_rate": 0.003,
+      "loss": 4.0665,
+      "step": 5724
+    },
+    {
+      "epoch": 0.05725,
+      "grad_norm": 0.6817445158958435,
+      "learning_rate": 0.003,
+      "loss": 4.0789,
+      "step": 5725
+    },
+    {
+      "epoch": 0.05726,
+      "grad_norm": 0.6044837832450867,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 5726
+    },
+    {
+      "epoch": 0.05727,
+      "grad_norm": 0.5440679788589478,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 5727
+    },
+    {
+      "epoch": 0.05728,
+      "grad_norm": 0.5482798218727112,
+      "learning_rate": 0.003,
+      "loss": 4.0797,
+      "step": 5728
+    },
+    {
+      "epoch": 0.05729,
+      "grad_norm": 0.5878225564956665,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 5729
+    },
+    {
+      "epoch": 0.0573,
+      "grad_norm": 0.5776148438453674,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 5730
+    },
+    {
+      "epoch": 0.05731,
+      "grad_norm": 0.6275573968887329,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 5731
+    },
+    {
+      "epoch": 0.05732,
+      "grad_norm": 0.6557134985923767,
+      "learning_rate": 0.003,
+      "loss": 4.0797,
+      "step": 5732
+    },
+    {
+      "epoch": 0.05733,
+      "grad_norm": 0.6577096581459045,
+      "learning_rate": 0.003,
+      "loss": 4.0872,
+      "step": 5733
+    },
+    {
+      "epoch": 0.05734,
+      "grad_norm": 0.7086383104324341,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 5734
+    },
+    {
+      "epoch": 0.05735,
+      "grad_norm": 0.7373378276824951,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 5735
+    },
+    {
+      "epoch": 0.05736,
+      "grad_norm": 0.8890331983566284,
+      "learning_rate": 0.003,
+      "loss": 4.0799,
+      "step": 5736
+    },
+    {
+      "epoch": 0.05737,
+      "grad_norm": 1.07709801197052,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 5737
+    },
+    {
+      "epoch": 0.05738,
+      "grad_norm": 0.7901797294616699,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 5738
+    },
+    {
+      "epoch": 0.05739,
+      "grad_norm": 0.596919596195221,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 5739
+    },
+    {
+      "epoch": 0.0574,
+      "grad_norm": 0.7254593372344971,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 5740
+    },
+    {
+      "epoch": 0.05741,
+      "grad_norm": 0.7549036741256714,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 5741
+    },
+    {
+      "epoch": 0.05742,
+      "grad_norm": 0.8220978379249573,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 5742
+    },
+    {
+      "epoch": 0.05743,
+      "grad_norm": 0.8430297374725342,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 5743
+    },
+    {
+      "epoch": 0.05744,
+      "grad_norm": 0.9336887001991272,
+      "learning_rate": 0.003,
+      "loss": 4.0808,
+      "step": 5744
+    },
+    {
+      "epoch": 0.05745,
+      "grad_norm": 0.9208829402923584,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 5745
+    },
+    {
+      "epoch": 0.05746,
+      "grad_norm": 0.8885453939437866,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 5746
+    },
+    {
+      "epoch": 0.05747,
+      "grad_norm": 0.8862046003341675,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 5747
+    },
+    {
+      "epoch": 0.05748,
+      "grad_norm": 0.8300103545188904,
+      "learning_rate": 0.003,
+      "loss": 4.1143,
+      "step": 5748
+    },
+    {
+      "epoch": 0.05749,
+      "grad_norm": 0.747201681137085,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 5749
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.7727035880088806,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 5750
+    },
+    {
+      "epoch": 0.05751,
+      "grad_norm": 0.7511826753616333,
+      "learning_rate": 0.003,
+      "loss": 4.0988,
+      "step": 5751
+    },
+    {
+      "epoch": 0.05752,
+      "grad_norm": 0.9084669947624207,
+      "learning_rate": 0.003,
+      "loss": 4.0943,
+      "step": 5752
+    },
+    {
+      "epoch": 0.05753,
+      "grad_norm": 0.8014945983886719,
+      "learning_rate": 0.003,
+      "loss": 4.0806,
+      "step": 5753
+    },
+    {
+      "epoch": 0.05754,
+      "grad_norm": 0.8008480072021484,
+      "learning_rate": 0.003,
+      "loss": 4.0792,
+      "step": 5754
+    },
+    {
+      "epoch": 0.05755,
+      "grad_norm": 0.8061231970787048,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 5755
+    },
+    {
+      "epoch": 0.05756,
+      "grad_norm": 0.8867525458335876,
+      "learning_rate": 0.003,
+      "loss": 4.0986,
+      "step": 5756
+    },
+    {
+      "epoch": 0.05757,
+      "grad_norm": 0.9424989223480225,
+      "learning_rate": 0.003,
+      "loss": 4.0848,
+      "step": 5757
+    },
+    {
+      "epoch": 0.05758,
+      "grad_norm": 0.9214147329330444,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 5758
+    },
+    {
+      "epoch": 0.05759,
+      "grad_norm": 0.8139206767082214,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 5759
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.7159584760665894,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 5760
+    },
+    {
+      "epoch": 0.05761,
+      "grad_norm": 0.7513906955718994,
+      "learning_rate": 0.003,
+      "loss": 4.0696,
+      "step": 5761
+    },
+    {
+      "epoch": 0.05762,
+      "grad_norm": 0.7571491003036499,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 5762
+    },
+    {
+      "epoch": 0.05763,
+      "grad_norm": 0.7423854470252991,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 5763
+    },
+    {
+      "epoch": 0.05764,
+      "grad_norm": 0.7205909490585327,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 5764
+    },
+    {
+      "epoch": 0.05765,
+      "grad_norm": 0.672808051109314,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 5765
+    },
+    {
+      "epoch": 0.05766,
+      "grad_norm": 0.5956676006317139,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 5766
+    },
+    {
+      "epoch": 0.05767,
+      "grad_norm": 0.6225743889808655,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 5767
+    },
+    {
+      "epoch": 0.05768,
+      "grad_norm": 0.6613940000534058,
+      "learning_rate": 0.003,
+      "loss": 4.0841,
+      "step": 5768
+    },
+    {
+      "epoch": 0.05769,
+      "grad_norm": 0.6825332641601562,
+      "learning_rate": 0.003,
+      "loss": 4.0692,
+      "step": 5769
+    },
+    {
+      "epoch": 0.0577,
+      "grad_norm": 0.6207984089851379,
+      "learning_rate": 0.003,
+      "loss": 4.0701,
+      "step": 5770
+    },
+    {
+      "epoch": 0.05771,
+      "grad_norm": 0.6129540801048279,
+      "learning_rate": 0.003,
+      "loss": 4.0751,
+      "step": 5771
+    },
+    {
+      "epoch": 0.05772,
+      "grad_norm": 0.6390023827552795,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 5772
+    },
+    {
+      "epoch": 0.05773,
+      "grad_norm": 0.7322221994400024,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 5773
+    },
+    {
+      "epoch": 0.05774,
+      "grad_norm": 0.7655969262123108,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 5774
+    },
+    {
+      "epoch": 0.05775,
+      "grad_norm": 0.7013616561889648,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 5775
+    },
+    {
+      "epoch": 0.05776,
+      "grad_norm": 0.5979982018470764,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 5776
+    },
+    {
+      "epoch": 0.05777,
+      "grad_norm": 0.4852938652038574,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 5777
+    },
+    {
+      "epoch": 0.05778,
+      "grad_norm": 0.5266335010528564,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 5778
+    },
+    {
+      "epoch": 0.05779,
+      "grad_norm": 0.676627516746521,
+      "learning_rate": 0.003,
+      "loss": 4.0748,
+      "step": 5779
+    },
+    {
+      "epoch": 0.0578,
+      "grad_norm": 0.8605179190635681,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 5780
+    },
+    {
+      "epoch": 0.05781,
+      "grad_norm": 0.9018716216087341,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 5781
+    },
+    {
+      "epoch": 0.05782,
+      "grad_norm": 0.7411509156227112,
+      "learning_rate": 0.003,
+      "loss": 4.0825,
+      "step": 5782
+    },
+    {
+      "epoch": 0.05783,
+      "grad_norm": 0.5312531590461731,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 5783
+    },
+    {
+      "epoch": 0.05784,
+      "grad_norm": 0.5841469764709473,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 5784
+    },
+    {
+      "epoch": 0.05785,
+      "grad_norm": 0.7773861289024353,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 5785
+    },
+    {
+      "epoch": 0.05786,
+      "grad_norm": 0.7710773944854736,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 5786
+    },
+    {
+      "epoch": 0.05787,
+      "grad_norm": 0.6817703247070312,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 5787
+    },
+    {
+      "epoch": 0.05788,
+      "grad_norm": 0.7051790952682495,
+      "learning_rate": 0.003,
+      "loss": 4.0599,
+      "step": 5788
+    },
+    {
+      "epoch": 0.05789,
+      "grad_norm": 0.7617840766906738,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 5789
+    },
+    {
+      "epoch": 0.0579,
+      "grad_norm": 0.7429543733596802,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 5790
+    },
+    {
+      "epoch": 0.05791,
+      "grad_norm": 0.6779160499572754,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 5791
+    },
+    {
+      "epoch": 0.05792,
+      "grad_norm": 0.7390137314796448,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 5792
+    },
+    {
+      "epoch": 0.05793,
+      "grad_norm": 0.64161217212677,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 5793
+    },
+    {
+      "epoch": 0.05794,
+      "grad_norm": 0.6376305818557739,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 5794
+    },
+    {
+      "epoch": 0.05795,
+      "grad_norm": 0.5993477702140808,
+      "learning_rate": 0.003,
+      "loss": 4.0926,
+      "step": 5795
+    },
+    {
+      "epoch": 0.05796,
+      "grad_norm": 0.6392378211021423,
+      "learning_rate": 0.003,
+      "loss": 4.0637,
+      "step": 5796
+    },
+    {
+      "epoch": 0.05797,
+      "grad_norm": 0.6526309251785278,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 5797
+    },
+    {
+      "epoch": 0.05798,
+      "grad_norm": 0.7773505449295044,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 5798
+    },
+    {
+      "epoch": 0.05799,
+      "grad_norm": 0.9118677377700806,
+      "learning_rate": 0.003,
+      "loss": 4.0826,
+      "step": 5799
+    },
+    {
+      "epoch": 0.058,
+      "grad_norm": 1.0093053579330444,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 5800
+    },
+    {
+      "epoch": 0.05801,
+      "grad_norm": 1.0082321166992188,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 5801
+    },
+    {
+      "epoch": 0.05802,
+      "grad_norm": 0.8237224221229553,
+      "learning_rate": 0.003,
+      "loss": 4.0725,
+      "step": 5802
+    },
+    {
+      "epoch": 0.05803,
+      "grad_norm": 0.8256529569625854,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 5803
+    },
+    {
+      "epoch": 0.05804,
+      "grad_norm": 0.8300235867500305,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 5804
+    },
+    {
+      "epoch": 0.05805,
+      "grad_norm": 0.8762092590332031,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 5805
+    },
+    {
+      "epoch": 0.05806,
+      "grad_norm": 0.8307374715805054,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 5806
+    },
+    {
+      "epoch": 0.05807,
+      "grad_norm": 0.8506011962890625,
+      "learning_rate": 0.003,
+      "loss": 4.0879,
+      "step": 5807
+    },
+    {
+      "epoch": 0.05808,
+      "grad_norm": 0.8518326282501221,
+      "learning_rate": 0.003,
+      "loss": 4.0994,
+      "step": 5808
+    },
+    {
+      "epoch": 0.05809,
+      "grad_norm": 0.9622186422348022,
+      "learning_rate": 0.003,
+      "loss": 4.1058,
+      "step": 5809
+    },
+    {
+      "epoch": 0.0581,
+      "grad_norm": 1.2263989448547363,
+      "learning_rate": 0.003,
+      "loss": 4.0692,
+      "step": 5810
+    },
+    {
+      "epoch": 0.05811,
+      "grad_norm": 0.9196377396583557,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 5811
+    },
+    {
+      "epoch": 0.05812,
+      "grad_norm": 0.8726242184638977,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 5812
+    },
+    {
+      "epoch": 0.05813,
+      "grad_norm": 0.8841713070869446,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 5813
+    },
+    {
+      "epoch": 0.05814,
+      "grad_norm": 0.851186215877533,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 5814
+    },
+    {
+      "epoch": 0.05815,
+      "grad_norm": 0.8571911454200745,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 5815
+    },
+    {
+      "epoch": 0.05816,
+      "grad_norm": 0.7161019444465637,
+      "learning_rate": 0.003,
+      "loss": 4.0888,
+      "step": 5816
+    },
+    {
+      "epoch": 0.05817,
+      "grad_norm": 0.7098151445388794,
+      "learning_rate": 0.003,
+      "loss": 4.1024,
+      "step": 5817
+    },
+    {
+      "epoch": 0.05818,
+      "grad_norm": 0.6854085922241211,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 5818
+    },
+    {
+      "epoch": 0.05819,
+      "grad_norm": 0.6578537225723267,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 5819
+    },
+    {
+      "epoch": 0.0582,
+      "grad_norm": 0.5946629643440247,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 5820
+    },
+    {
+      "epoch": 0.05821,
+      "grad_norm": 0.5087273716926575,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 5821
+    },
+    {
+      "epoch": 0.05822,
+      "grad_norm": 0.4627154767513275,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 5822
+    },
+    {
+      "epoch": 0.05823,
+      "grad_norm": 0.3847285509109497,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 5823
+    },
+    {
+      "epoch": 0.05824,
+      "grad_norm": 0.3624761998653412,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 5824
+    },
+    {
+      "epoch": 0.05825,
+      "grad_norm": 0.3458106219768524,
+      "learning_rate": 0.003,
+      "loss": 4.0925,
+      "step": 5825
+    },
+    {
+      "epoch": 0.05826,
+      "grad_norm": 0.3547622561454773,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 5826
+    },
+    {
+      "epoch": 0.05827,
+      "grad_norm": 0.3819022476673126,
+      "learning_rate": 0.003,
+      "loss": 4.0671,
+      "step": 5827
+    },
+    {
+      "epoch": 0.05828,
+      "grad_norm": 0.38096553087234497,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 5828
+    },
+    {
+      "epoch": 0.05829,
+      "grad_norm": 0.41201043128967285,
+      "learning_rate": 0.003,
+      "loss": 4.0661,
+      "step": 5829
+    },
+    {
+      "epoch": 0.0583,
+      "grad_norm": 0.5885247588157654,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 5830
+    },
+    {
+      "epoch": 0.05831,
+      "grad_norm": 0.8928015828132629,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 5831
+    },
+    {
+      "epoch": 0.05832,
+      "grad_norm": 1.3094123601913452,
+      "learning_rate": 0.003,
+      "loss": 4.0959,
+      "step": 5832
+    },
+    {
+      "epoch": 0.05833,
+      "grad_norm": 0.48254647850990295,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 5833
+    },
+    {
+      "epoch": 0.05834,
+      "grad_norm": 0.7129554748535156,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 5834
+    },
+    {
+      "epoch": 0.05835,
+      "grad_norm": 1.0598682165145874,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 5835
+    },
+    {
+      "epoch": 0.05836,
+      "grad_norm": 0.7417224645614624,
+      "learning_rate": 0.003,
+      "loss": 4.0769,
+      "step": 5836
+    },
+    {
+      "epoch": 0.05837,
+      "grad_norm": 0.6337145566940308,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 5837
+    },
+    {
+      "epoch": 0.05838,
+      "grad_norm": 0.7507103085517883,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 5838
+    },
+    {
+      "epoch": 0.05839,
+      "grad_norm": 0.9548166990280151,
+      "learning_rate": 0.003,
+      "loss": 4.0835,
+      "step": 5839
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.9799003005027771,
+      "learning_rate": 0.003,
+      "loss": 4.106,
+      "step": 5840
+    },
+    {
+      "epoch": 0.05841,
+      "grad_norm": 0.9687362909317017,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 5841
+    },
+    {
+      "epoch": 0.05842,
+      "grad_norm": 0.8212959170341492,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 5842
+    },
+    {
+      "epoch": 0.05843,
+      "grad_norm": 0.8485821485519409,
+      "learning_rate": 0.003,
+      "loss": 4.0762,
+      "step": 5843
+    },
+    {
+      "epoch": 0.05844,
+      "grad_norm": 1.0464115142822266,
+      "learning_rate": 0.003,
+      "loss": 4.0844,
+      "step": 5844
+    },
+    {
+      "epoch": 0.05845,
+      "grad_norm": 0.9341375231742859,
+      "learning_rate": 0.003,
+      "loss": 4.0862,
+      "step": 5845
+    },
+    {
+      "epoch": 0.05846,
+      "grad_norm": 0.9866040349006653,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 5846
+    },
+    {
+      "epoch": 0.05847,
+      "grad_norm": 0.9109794497489929,
+      "learning_rate": 0.003,
+      "loss": 4.1139,
+      "step": 5847
+    },
+    {
+      "epoch": 0.05848,
+      "grad_norm": 0.715682864189148,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 5848
+    },
+    {
+      "epoch": 0.05849,
+      "grad_norm": 0.618606686592102,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 5849
+    },
+    {
+      "epoch": 0.0585,
+      "grad_norm": 0.6357572078704834,
+      "learning_rate": 0.003,
+      "loss": 4.0695,
+      "step": 5850
+    },
+    {
+      "epoch": 0.05851,
+      "grad_norm": 0.6797095537185669,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 5851
+    },
+    {
+      "epoch": 0.05852,
+      "grad_norm": 0.6348204612731934,
+      "learning_rate": 0.003,
+      "loss": 4.0675,
+      "step": 5852
+    },
+    {
+      "epoch": 0.05853,
+      "grad_norm": 0.5872939229011536,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 5853
+    },
+    {
+      "epoch": 0.05854,
+      "grad_norm": 0.6542626619338989,
+      "learning_rate": 0.003,
+      "loss": 4.1206,
+      "step": 5854
+    },
+    {
+      "epoch": 0.05855,
+      "grad_norm": 0.6466689705848694,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 5855
+    },
+    {
+      "epoch": 0.05856,
+      "grad_norm": 0.7588222622871399,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 5856
+    },
+    {
+      "epoch": 0.05857,
+      "grad_norm": 1.0768864154815674,
+      "learning_rate": 0.003,
+      "loss": 4.0929,
+      "step": 5857
+    },
+    {
+      "epoch": 0.05858,
+      "grad_norm": 0.9665427803993225,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 5858
+    },
+    {
+      "epoch": 0.05859,
+      "grad_norm": 0.7883841395378113,
+      "learning_rate": 0.003,
+      "loss": 4.0803,
+      "step": 5859
+    },
+    {
+      "epoch": 0.0586,
+      "grad_norm": 0.7918685078620911,
+      "learning_rate": 0.003,
+      "loss": 4.087,
+      "step": 5860
+    },
+    {
+      "epoch": 0.05861,
+      "grad_norm": 0.711283802986145,
+      "learning_rate": 0.003,
+      "loss": 4.0981,
+      "step": 5861
+    },
+    {
+      "epoch": 0.05862,
+      "grad_norm": 0.7785149812698364,
+      "learning_rate": 0.003,
+      "loss": 4.0911,
+      "step": 5862
+    },
+    {
+      "epoch": 0.05863,
+      "grad_norm": 0.6729006171226501,
+      "learning_rate": 0.003,
+      "loss": 4.0865,
+      "step": 5863
+    },
+    {
+      "epoch": 0.05864,
+      "grad_norm": 0.5089790225028992,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 5864
+    },
+    {
+      "epoch": 0.05865,
+      "grad_norm": 0.5055457949638367,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 5865
+    },
+    {
+      "epoch": 0.05866,
+      "grad_norm": 0.5680795311927795,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 5866
+    },
+    {
+      "epoch": 0.05867,
+      "grad_norm": 0.6734050512313843,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 5867
+    },
+    {
+      "epoch": 0.05868,
+      "grad_norm": 0.68412184715271,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 5868
+    },
+    {
+      "epoch": 0.05869,
+      "grad_norm": 0.6876507997512817,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 5869
+    },
+    {
+      "epoch": 0.0587,
+      "grad_norm": 0.6638537645339966,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 5870
+    },
+    {
+      "epoch": 0.05871,
+      "grad_norm": 0.6519478559494019,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 5871
+    },
+    {
+      "epoch": 0.05872,
+      "grad_norm": 0.5919815897941589,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 5872
+    },
+    {
+      "epoch": 0.05873,
+      "grad_norm": 0.5140731930732727,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 5873
+    },
+    {
+      "epoch": 0.05874,
+      "grad_norm": 0.5663528442382812,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 5874
+    },
+    {
+      "epoch": 0.05875,
+      "grad_norm": 0.6782262921333313,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 5875
+    },
+    {
+      "epoch": 0.05876,
+      "grad_norm": 0.8224900364875793,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 5876
+    },
+    {
+      "epoch": 0.05877,
+      "grad_norm": 0.8717034459114075,
+      "learning_rate": 0.003,
+      "loss": 4.0775,
+      "step": 5877
+    },
+    {
+      "epoch": 0.05878,
+      "grad_norm": 0.8781859874725342,
+      "learning_rate": 0.003,
+      "loss": 4.0908,
+      "step": 5878
+    },
+    {
+      "epoch": 0.05879,
+      "grad_norm": 0.6580458879470825,
+      "learning_rate": 0.003,
+      "loss": 4.0949,
+      "step": 5879
+    },
+    {
+      "epoch": 0.0588,
+      "grad_norm": 0.7042397856712341,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 5880
+    },
+    {
+      "epoch": 0.05881,
+      "grad_norm": 0.6979352831840515,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 5881
+    },
+    {
+      "epoch": 0.05882,
+      "grad_norm": 0.7222563624382019,
+      "learning_rate": 0.003,
+      "loss": 4.0815,
+      "step": 5882
+    },
+    {
+      "epoch": 0.05883,
+      "grad_norm": 0.7145103216171265,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 5883
+    },
+    {
+      "epoch": 0.05884,
+      "grad_norm": 0.6383862495422363,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 5884
+    },
+    {
+      "epoch": 0.05885,
+      "grad_norm": 0.5817567706108093,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 5885
+    },
+    {
+      "epoch": 0.05886,
+      "grad_norm": 0.6241205930709839,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 5886
+    },
+    {
+      "epoch": 0.05887,
+      "grad_norm": 0.702506959438324,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 5887
+    },
+    {
+      "epoch": 0.05888,
+      "grad_norm": 0.6908167600631714,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 5888
+    },
+    {
+      "epoch": 0.05889,
+      "grad_norm": 0.7523607015609741,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 5889
+    },
+    {
+      "epoch": 0.0589,
+      "grad_norm": 0.8569447994232178,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 5890
+    },
+    {
+      "epoch": 0.05891,
+      "grad_norm": 1.0577572584152222,
+      "learning_rate": 0.003,
+      "loss": 4.064,
+      "step": 5891
+    },
+    {
+      "epoch": 0.05892,
+      "grad_norm": 0.9264476895332336,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 5892
+    },
+    {
+      "epoch": 0.05893,
+      "grad_norm": 0.7721839547157288,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 5893
+    },
+    {
+      "epoch": 0.05894,
+      "grad_norm": 0.7444570660591125,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 5894
+    },
+    {
+      "epoch": 0.05895,
+      "grad_norm": 0.7716593146324158,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 5895
+    },
+    {
+      "epoch": 0.05896,
+      "grad_norm": 0.834312915802002,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 5896
+    },
+    {
+      "epoch": 0.05897,
+      "grad_norm": 0.9077129364013672,
+      "learning_rate": 0.003,
+      "loss": 4.0929,
+      "step": 5897
+    },
+    {
+      "epoch": 0.05898,
+      "grad_norm": 0.7383908629417419,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 5898
+    },
+    {
+      "epoch": 0.05899,
+      "grad_norm": 0.7159144878387451,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 5899
+    },
+    {
+      "epoch": 0.059,
+      "grad_norm": 0.8070700168609619,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 5900
+    },
+    {
+      "epoch": 0.05901,
+      "grad_norm": 0.9503213763237,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 5901
+    },
+    {
+      "epoch": 0.05902,
+      "grad_norm": 0.9452478885650635,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 5902
+    },
+    {
+      "epoch": 0.05903,
+      "grad_norm": 0.9364940524101257,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 5903
+    },
+    {
+      "epoch": 0.05904,
+      "grad_norm": 0.710010290145874,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 5904
+    },
+    {
+      "epoch": 0.05905,
+      "grad_norm": 0.7259859442710876,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 5905
+    },
+    {
+      "epoch": 0.05906,
+      "grad_norm": 0.7474007606506348,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 5906
+    },
+    {
+      "epoch": 0.05907,
+      "grad_norm": 0.7143236994743347,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 5907
+    },
+    {
+      "epoch": 0.05908,
+      "grad_norm": 0.846874475479126,
+      "learning_rate": 0.003,
+      "loss": 4.0847,
+      "step": 5908
+    },
+    {
+      "epoch": 0.05909,
+      "grad_norm": 0.8068088889122009,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 5909
+    },
+    {
+      "epoch": 0.0591,
+      "grad_norm": 0.681771993637085,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 5910
+    },
+    {
+      "epoch": 0.05911,
+      "grad_norm": 0.7519658207893372,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 5911
+    },
+    {
+      "epoch": 0.05912,
+      "grad_norm": 0.7764798998832703,
+      "learning_rate": 0.003,
+      "loss": 4.0984,
+      "step": 5912
+    },
+    {
+      "epoch": 0.05913,
+      "grad_norm": 0.757898211479187,
+      "learning_rate": 0.003,
+      "loss": 4.1192,
+      "step": 5913
+    },
+    {
+      "epoch": 0.05914,
+      "grad_norm": 0.7228046655654907,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 5914
+    },
+    {
+      "epoch": 0.05915,
+      "grad_norm": 0.6196589469909668,
+      "learning_rate": 0.003,
+      "loss": 4.0846,
+      "step": 5915
+    },
+    {
+      "epoch": 0.05916,
+      "grad_norm": 0.5313118696212769,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 5916
+    },
+    {
+      "epoch": 0.05917,
+      "grad_norm": 0.536577045917511,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 5917
+    },
+    {
+      "epoch": 0.05918,
+      "grad_norm": 0.5481544733047485,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 5918
+    },
+    {
+      "epoch": 0.05919,
+      "grad_norm": 0.6215094327926636,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 5919
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.6950171589851379,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 5920
+    },
+    {
+      "epoch": 0.05921,
+      "grad_norm": 0.7951127290725708,
+      "learning_rate": 0.003,
+      "loss": 4.0794,
+      "step": 5921
+    },
+    {
+      "epoch": 0.05922,
+      "grad_norm": 0.9624848961830139,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 5922
+    },
+    {
+      "epoch": 0.05923,
+      "grad_norm": 1.0542023181915283,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 5923
+    },
+    {
+      "epoch": 0.05924,
+      "grad_norm": 0.8465194702148438,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 5924
+    },
+    {
+      "epoch": 0.05925,
+      "grad_norm": 0.7096172571182251,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 5925
+    },
+    {
+      "epoch": 0.05926,
+      "grad_norm": 0.7524508833885193,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 5926
+    },
+    {
+      "epoch": 0.05927,
+      "grad_norm": 0.6621628403663635,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 5927
+    },
+    {
+      "epoch": 0.05928,
+      "grad_norm": 0.5687286853790283,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 5928
+    },
+    {
+      "epoch": 0.05929,
+      "grad_norm": 0.4693851172924042,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 5929
+    },
+    {
+      "epoch": 0.0593,
+      "grad_norm": 0.4943394064903259,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 5930
+    },
+    {
+      "epoch": 0.05931,
+      "grad_norm": 0.5767983198165894,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 5931
+    },
+    {
+      "epoch": 0.05932,
+      "grad_norm": 0.7448903918266296,
+      "learning_rate": 0.003,
+      "loss": 4.0846,
+      "step": 5932
+    },
+    {
+      "epoch": 0.05933,
+      "grad_norm": 0.9883130788803101,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 5933
+    },
+    {
+      "epoch": 0.05934,
+      "grad_norm": 1.1292710304260254,
+      "learning_rate": 0.003,
+      "loss": 4.1067,
+      "step": 5934
+    },
+    {
+      "epoch": 0.05935,
+      "grad_norm": 0.6654629707336426,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 5935
+    },
+    {
+      "epoch": 0.05936,
+      "grad_norm": 0.7037827372550964,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 5936
+    },
+    {
+      "epoch": 0.05937,
+      "grad_norm": 0.928344190120697,
+      "learning_rate": 0.003,
+      "loss": 4.0786,
+      "step": 5937
+    },
+    {
+      "epoch": 0.05938,
+      "grad_norm": 0.9594924449920654,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 5938
+    },
+    {
+      "epoch": 0.05939,
+      "grad_norm": 0.8684089183807373,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 5939
+    },
+    {
+      "epoch": 0.0594,
+      "grad_norm": 0.7445632815361023,
+      "learning_rate": 0.003,
+      "loss": 4.0844,
+      "step": 5940
+    },
+    {
+      "epoch": 0.05941,
+      "grad_norm": 0.7455309629440308,
+      "learning_rate": 0.003,
+      "loss": 4.088,
+      "step": 5941
+    },
+    {
+      "epoch": 0.05942,
+      "grad_norm": 0.6535532474517822,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 5942
+    },
+    {
+      "epoch": 0.05943,
+      "grad_norm": 0.7083020210266113,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 5943
+    },
+    {
+      "epoch": 0.05944,
+      "grad_norm": 0.5781853795051575,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 5944
+    },
+    {
+      "epoch": 0.05945,
+      "grad_norm": 0.4758516550064087,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 5945
+    },
+    {
+      "epoch": 0.05946,
+      "grad_norm": 0.43648025393486023,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 5946
+    },
+    {
+      "epoch": 0.05947,
+      "grad_norm": 0.40230774879455566,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 5947
+    },
+    {
+      "epoch": 0.05948,
+      "grad_norm": 0.43578827381134033,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 5948
+    },
+    {
+      "epoch": 0.05949,
+      "grad_norm": 0.4666615426540375,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 5949
+    },
+    {
+      "epoch": 0.0595,
+      "grad_norm": 0.5350773334503174,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 5950
+    },
+    {
+      "epoch": 0.05951,
+      "grad_norm": 0.7080516815185547,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 5951
+    },
+    {
+      "epoch": 0.05952,
+      "grad_norm": 0.7631043791770935,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 5952
+    },
+    {
+      "epoch": 0.05953,
+      "grad_norm": 0.7321561574935913,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 5953
+    },
+    {
+      "epoch": 0.05954,
+      "grad_norm": 0.7224215269088745,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 5954
+    },
+    {
+      "epoch": 0.05955,
+      "grad_norm": 0.6791732311248779,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 5955
+    },
+    {
+      "epoch": 0.05956,
+      "grad_norm": 0.6897500157356262,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 5956
+    },
+    {
+      "epoch": 0.05957,
+      "grad_norm": 0.9236707091331482,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 5957
+    },
+    {
+      "epoch": 0.05958,
+      "grad_norm": 1.0184153318405151,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 5958
+    },
+    {
+      "epoch": 0.05959,
+      "grad_norm": 0.9222090840339661,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 5959
+    },
+    {
+      "epoch": 0.0596,
+      "grad_norm": 0.823390007019043,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 5960
+    },
+    {
+      "epoch": 0.05961,
+      "grad_norm": 0.85313880443573,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 5961
+    },
+    {
+      "epoch": 0.05962,
+      "grad_norm": 0.9738097786903381,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 5962
+    },
+    {
+      "epoch": 0.05963,
+      "grad_norm": 0.9151560664176941,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 5963
+    },
+    {
+      "epoch": 0.05964,
+      "grad_norm": 0.9065539240837097,
+      "learning_rate": 0.003,
+      "loss": 4.0737,
+      "step": 5964
+    },
+    {
+      "epoch": 0.05965,
+      "grad_norm": 0.7944349050521851,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 5965
+    },
+    {
+      "epoch": 0.05966,
+      "grad_norm": 0.7729898691177368,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 5966
+    },
+    {
+      "epoch": 0.05967,
+      "grad_norm": 0.8545308113098145,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 5967
+    },
+    {
+      "epoch": 0.05968,
+      "grad_norm": 0.9394176006317139,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 5968
+    },
+    {
+      "epoch": 0.05969,
+      "grad_norm": 0.8269079327583313,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 5969
+    },
+    {
+      "epoch": 0.0597,
+      "grad_norm": 0.7771570086479187,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 5970
+    },
+    {
+      "epoch": 0.05971,
+      "grad_norm": 0.7587796449661255,
+      "learning_rate": 0.003,
+      "loss": 4.0838,
+      "step": 5971
+    },
+    {
+      "epoch": 0.05972,
+      "grad_norm": 0.8089188933372498,
+      "learning_rate": 0.003,
+      "loss": 4.097,
+      "step": 5972
+    },
+    {
+      "epoch": 0.05973,
+      "grad_norm": 0.7736026048660278,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 5973
+    },
+    {
+      "epoch": 0.05974,
+      "grad_norm": 0.7009705305099487,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 5974
+    },
+    {
+      "epoch": 0.05975,
+      "grad_norm": 0.6236057281494141,
+      "learning_rate": 0.003,
+      "loss": 4.0837,
+      "step": 5975
+    },
+    {
+      "epoch": 0.05976,
+      "grad_norm": 0.6456733345985413,
+      "learning_rate": 0.003,
+      "loss": 4.0709,
+      "step": 5976
+    },
+    {
+      "epoch": 0.05977,
+      "grad_norm": 0.5764129757881165,
+      "learning_rate": 0.003,
+      "loss": 4.076,
+      "step": 5977
+    },
+    {
+      "epoch": 0.05978,
+      "grad_norm": 0.5077314972877502,
+      "learning_rate": 0.003,
+      "loss": 4.0612,
+      "step": 5978
+    },
+    {
+      "epoch": 0.05979,
+      "grad_norm": 0.4247589111328125,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 5979
+    },
+    {
+      "epoch": 0.0598,
+      "grad_norm": 0.42051416635513306,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 5980
+    },
+    {
+      "epoch": 0.05981,
+      "grad_norm": 0.42619195580482483,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 5981
+    },
+    {
+      "epoch": 0.05982,
+      "grad_norm": 0.5673835873603821,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 5982
+    },
+    {
+      "epoch": 0.05983,
+      "grad_norm": 0.9145967364311218,
+      "learning_rate": 0.003,
+      "loss": 4.0769,
+      "step": 5983
+    },
+    {
+      "epoch": 0.05984,
+      "grad_norm": 1.3005692958831787,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 5984
+    },
+    {
+      "epoch": 0.05985,
+      "grad_norm": 0.5140734910964966,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 5985
+    },
+    {
+      "epoch": 0.05986,
+      "grad_norm": 0.6605982184410095,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 5986
+    },
+    {
+      "epoch": 0.05987,
+      "grad_norm": 0.8068457841873169,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 5987
+    },
+    {
+      "epoch": 0.05988,
+      "grad_norm": 0.7576589584350586,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 5988
+    },
+    {
+      "epoch": 0.05989,
+      "grad_norm": 0.6234875321388245,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 5989
+    },
+    {
+      "epoch": 0.0599,
+      "grad_norm": 0.5436728596687317,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 5990
+    },
+    {
+      "epoch": 0.05991,
+      "grad_norm": 0.5909923911094666,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 5991
+    },
+    {
+      "epoch": 0.05992,
+      "grad_norm": 0.6115368008613586,
+      "learning_rate": 0.003,
+      "loss": 4.0532,
+      "step": 5992
+    },
+    {
+      "epoch": 0.05993,
+      "grad_norm": 0.5838632583618164,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 5993
+    },
+    {
+      "epoch": 0.05994,
+      "grad_norm": 0.6138798594474792,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 5994
+    },
+    {
+      "epoch": 0.05995,
+      "grad_norm": 0.6672993302345276,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 5995
+    },
+    {
+      "epoch": 0.05996,
+      "grad_norm": 0.7187176942825317,
+      "learning_rate": 0.003,
+      "loss": 4.0879,
+      "step": 5996
+    },
+    {
+      "epoch": 0.05997,
+      "grad_norm": 0.7012836933135986,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 5997
+    },
+    {
+      "epoch": 0.05998,
+      "grad_norm": 0.6116321086883545,
+      "learning_rate": 0.003,
+      "loss": 4.083,
+      "step": 5998
+    },
+    {
+      "epoch": 0.05999,
+      "grad_norm": 0.5162574648857117,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 5999
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.6102678179740906,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6000
+    },
+    {
+      "epoch": 0.06001,
+      "grad_norm": 0.5848944187164307,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 6001
+    },
+    {
+      "epoch": 0.06002,
+      "grad_norm": 0.5892003178596497,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 6002
+    },
+    {
+      "epoch": 0.06003,
+      "grad_norm": 0.5941994190216064,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 6003
+    },
+    {
+      "epoch": 0.06004,
+      "grad_norm": 0.6040821075439453,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 6004
+    },
+    {
+      "epoch": 0.06005,
+      "grad_norm": 0.6764587163925171,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 6005
+    },
+    {
+      "epoch": 0.06006,
+      "grad_norm": 0.689892590045929,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 6006
+    },
+    {
+      "epoch": 0.06007,
+      "grad_norm": 0.6645123958587646,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 6007
+    },
+    {
+      "epoch": 0.06008,
+      "grad_norm": 0.780821681022644,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 6008
+    },
+    {
+      "epoch": 0.06009,
+      "grad_norm": 1.0214614868164062,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 6009
+    },
+    {
+      "epoch": 0.0601,
+      "grad_norm": 1.1233888864517212,
+      "learning_rate": 0.003,
+      "loss": 4.0699,
+      "step": 6010
+    },
+    {
+      "epoch": 0.06011,
+      "grad_norm": 0.8268139958381653,
+      "learning_rate": 0.003,
+      "loss": 4.0701,
+      "step": 6011
+    },
+    {
+      "epoch": 0.06012,
+      "grad_norm": 0.8503698706626892,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 6012
+    },
+    {
+      "epoch": 0.06013,
+      "grad_norm": 0.9249141216278076,
+      "learning_rate": 0.003,
+      "loss": 4.0819,
+      "step": 6013
+    },
+    {
+      "epoch": 0.06014,
+      "grad_norm": 1.1101765632629395,
+      "learning_rate": 0.003,
+      "loss": 4.076,
+      "step": 6014
+    },
+    {
+      "epoch": 0.06015,
+      "grad_norm": 1.0137884616851807,
+      "learning_rate": 0.003,
+      "loss": 4.0916,
+      "step": 6015
+    },
+    {
+      "epoch": 0.06016,
+      "grad_norm": 0.7965986132621765,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 6016
+    },
+    {
+      "epoch": 0.06017,
+      "grad_norm": 0.7965149879455566,
+      "learning_rate": 0.003,
+      "loss": 4.092,
+      "step": 6017
+    },
+    {
+      "epoch": 0.06018,
+      "grad_norm": 0.8309733271598816,
+      "learning_rate": 0.003,
+      "loss": 4.1102,
+      "step": 6018
+    },
+    {
+      "epoch": 0.06019,
+      "grad_norm": 0.768108606338501,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 6019
+    },
+    {
+      "epoch": 0.0602,
+      "grad_norm": 0.8096131086349487,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 6020
+    },
+    {
+      "epoch": 0.06021,
+      "grad_norm": 0.8794146776199341,
+      "learning_rate": 0.003,
+      "loss": 4.092,
+      "step": 6021
+    },
+    {
+      "epoch": 0.06022,
+      "grad_norm": 0.9591919183731079,
+      "learning_rate": 0.003,
+      "loss": 4.1037,
+      "step": 6022
+    },
+    {
+      "epoch": 0.06023,
+      "grad_norm": 0.8065183162689209,
+      "learning_rate": 0.003,
+      "loss": 4.0842,
+      "step": 6023
+    },
+    {
+      "epoch": 0.06024,
+      "grad_norm": 0.8542725443840027,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 6024
+    },
+    {
+      "epoch": 0.06025,
+      "grad_norm": 0.8307952284812927,
+      "learning_rate": 0.003,
+      "loss": 4.1148,
+      "step": 6025
+    },
+    {
+      "epoch": 0.06026,
+      "grad_norm": 0.8478344678878784,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 6026
+    },
+    {
+      "epoch": 0.06027,
+      "grad_norm": 0.8286115527153015,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 6027
+    },
+    {
+      "epoch": 0.06028,
+      "grad_norm": 0.9184243083000183,
+      "learning_rate": 0.003,
+      "loss": 4.1052,
+      "step": 6028
+    },
+    {
+      "epoch": 0.06029,
+      "grad_norm": 0.8957465887069702,
+      "learning_rate": 0.003,
+      "loss": 4.0967,
+      "step": 6029
+    },
+    {
+      "epoch": 0.0603,
+      "grad_norm": 1.0010573863983154,
+      "learning_rate": 0.003,
+      "loss": 4.0921,
+      "step": 6030
+    },
+    {
+      "epoch": 0.06031,
+      "grad_norm": 1.060820460319519,
+      "learning_rate": 0.003,
+      "loss": 4.0928,
+      "step": 6031
+    },
+    {
+      "epoch": 0.06032,
+      "grad_norm": 0.9341599941253662,
+      "learning_rate": 0.003,
+      "loss": 4.0874,
+      "step": 6032
+    },
+    {
+      "epoch": 0.06033,
+      "grad_norm": 0.7799586653709412,
+      "learning_rate": 0.003,
+      "loss": 4.0923,
+      "step": 6033
+    },
+    {
+      "epoch": 0.06034,
+      "grad_norm": 0.6523258686065674,
+      "learning_rate": 0.003,
+      "loss": 4.1033,
+      "step": 6034
+    },
+    {
+      "epoch": 0.06035,
+      "grad_norm": 0.4869402050971985,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 6035
+    },
+    {
+      "epoch": 0.06036,
+      "grad_norm": 0.5165879726409912,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 6036
+    },
+    {
+      "epoch": 0.06037,
+      "grad_norm": 0.5209839940071106,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 6037
+    },
+    {
+      "epoch": 0.06038,
+      "grad_norm": 0.5267767310142517,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 6038
+    },
+    {
+      "epoch": 0.06039,
+      "grad_norm": 0.5402923822402954,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 6039
+    },
+    {
+      "epoch": 0.0604,
+      "grad_norm": 0.5481833815574646,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 6040
+    },
+    {
+      "epoch": 0.06041,
+      "grad_norm": 0.5473806262016296,
+      "learning_rate": 0.003,
+      "loss": 4.0788,
+      "step": 6041
+    },
+    {
+      "epoch": 0.06042,
+      "grad_norm": 0.5405387878417969,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 6042
+    },
+    {
+      "epoch": 0.06043,
+      "grad_norm": 0.5141624808311462,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 6043
+    },
+    {
+      "epoch": 0.06044,
+      "grad_norm": 0.4394950270652771,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 6044
+    },
+    {
+      "epoch": 0.06045,
+      "grad_norm": 0.49056580662727356,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 6045
+    },
+    {
+      "epoch": 0.06046,
+      "grad_norm": 0.5915701389312744,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 6046
+    },
+    {
+      "epoch": 0.06047,
+      "grad_norm": 0.7667026519775391,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 6047
+    },
+    {
+      "epoch": 0.06048,
+      "grad_norm": 0.9280398488044739,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 6048
+    },
+    {
+      "epoch": 0.06049,
+      "grad_norm": 0.8987364768981934,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 6049
+    },
+    {
+      "epoch": 0.0605,
+      "grad_norm": 0.6798592209815979,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 6050
+    },
+    {
+      "epoch": 0.06051,
+      "grad_norm": 0.5418064594268799,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 6051
+    },
+    {
+      "epoch": 0.06052,
+      "grad_norm": 0.6874141693115234,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 6052
+    },
+    {
+      "epoch": 0.06053,
+      "grad_norm": 0.7224387526512146,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 6053
+    },
+    {
+      "epoch": 0.06054,
+      "grad_norm": 0.6871939897537231,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 6054
+    },
+    {
+      "epoch": 0.06055,
+      "grad_norm": 0.5547940135002136,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 6055
+    },
+    {
+      "epoch": 0.06056,
+      "grad_norm": 0.46671125292778015,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 6056
+    },
+    {
+      "epoch": 0.06057,
+      "grad_norm": 0.5383384823799133,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 6057
+    },
+    {
+      "epoch": 0.06058,
+      "grad_norm": 0.5768425464630127,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 6058
+    },
+    {
+      "epoch": 0.06059,
+      "grad_norm": 0.6451521515846252,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 6059
+    },
+    {
+      "epoch": 0.0606,
+      "grad_norm": 0.7362764477729797,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 6060
+    },
+    {
+      "epoch": 0.06061,
+      "grad_norm": 0.7581610083580017,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 6061
+    },
+    {
+      "epoch": 0.06062,
+      "grad_norm": 0.7688402533531189,
+      "learning_rate": 0.003,
+      "loss": 4.0762,
+      "step": 6062
+    },
+    {
+      "epoch": 0.06063,
+      "grad_norm": 0.7899247407913208,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 6063
+    },
+    {
+      "epoch": 0.06064,
+      "grad_norm": 0.6973926424980164,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 6064
+    },
+    {
+      "epoch": 0.06065,
+      "grad_norm": 0.6818444132804871,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 6065
+    },
+    {
+      "epoch": 0.06066,
+      "grad_norm": 0.7340236902236938,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 6066
+    },
+    {
+      "epoch": 0.06067,
+      "grad_norm": 0.8874813914299011,
+      "learning_rate": 0.003,
+      "loss": 4.0736,
+      "step": 6067
+    },
+    {
+      "epoch": 0.06068,
+      "grad_norm": 0.9855170845985413,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 6068
+    },
+    {
+      "epoch": 0.06069,
+      "grad_norm": 1.1523756980895996,
+      "learning_rate": 0.003,
+      "loss": 4.0848,
+      "step": 6069
+    },
+    {
+      "epoch": 0.0607,
+      "grad_norm": 0.873957097530365,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 6070
+    },
+    {
+      "epoch": 0.06071,
+      "grad_norm": 0.848626971244812,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 6071
+    },
+    {
+      "epoch": 0.06072,
+      "grad_norm": 0.830126166343689,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 6072
+    },
+    {
+      "epoch": 0.06073,
+      "grad_norm": 0.7734904289245605,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 6073
+    },
+    {
+      "epoch": 0.06074,
+      "grad_norm": 0.8461311459541321,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 6074
+    },
+    {
+      "epoch": 0.06075,
+      "grad_norm": 1.0410598516464233,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 6075
+    },
+    {
+      "epoch": 0.06076,
+      "grad_norm": 1.1896835565567017,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 6076
+    },
+    {
+      "epoch": 0.06077,
+      "grad_norm": 0.8349819779396057,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 6077
+    },
+    {
+      "epoch": 0.06078,
+      "grad_norm": 0.7226845026016235,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 6078
+    },
+    {
+      "epoch": 0.06079,
+      "grad_norm": 0.7609370350837708,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 6079
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.7053298354148865,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 6080
+    },
+    {
+      "epoch": 0.06081,
+      "grad_norm": 0.6266418099403381,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 6081
+    },
+    {
+      "epoch": 0.06082,
+      "grad_norm": 0.5934705138206482,
+      "learning_rate": 0.003,
+      "loss": 4.0951,
+      "step": 6082
+    },
+    {
+      "epoch": 0.06083,
+      "grad_norm": 0.6792622804641724,
+      "learning_rate": 0.003,
+      "loss": 4.0792,
+      "step": 6083
+    },
+    {
+      "epoch": 0.06084,
+      "grad_norm": 0.7161068916320801,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 6084
+    },
+    {
+      "epoch": 0.06085,
+      "grad_norm": 0.6832570433616638,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 6085
+    },
+    {
+      "epoch": 0.06086,
+      "grad_norm": 0.6206290125846863,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 6086
+    },
+    {
+      "epoch": 0.06087,
+      "grad_norm": 0.5676894783973694,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 6087
+    },
+    {
+      "epoch": 0.06088,
+      "grad_norm": 0.6253965497016907,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 6088
+    },
+    {
+      "epoch": 0.06089,
+      "grad_norm": 0.5663153529167175,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 6089
+    },
+    {
+      "epoch": 0.0609,
+      "grad_norm": 0.5186519026756287,
+      "learning_rate": 0.003,
+      "loss": 4.0855,
+      "step": 6090
+    },
+    {
+      "epoch": 0.06091,
+      "grad_norm": 0.5059489011764526,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 6091
+    },
+    {
+      "epoch": 0.06092,
+      "grad_norm": 0.4492594599723816,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 6092
+    },
+    {
+      "epoch": 0.06093,
+      "grad_norm": 0.48771554231643677,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 6093
+    },
+    {
+      "epoch": 0.06094,
+      "grad_norm": 0.4371291399002075,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 6094
+    },
+    {
+      "epoch": 0.06095,
+      "grad_norm": 0.4618400037288666,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 6095
+    },
+    {
+      "epoch": 0.06096,
+      "grad_norm": 0.42540499567985535,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 6096
+    },
+    {
+      "epoch": 0.06097,
+      "grad_norm": 0.45694103837013245,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 6097
+    },
+    {
+      "epoch": 0.06098,
+      "grad_norm": 0.5427758693695068,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 6098
+    },
+    {
+      "epoch": 0.06099,
+      "grad_norm": 0.578697919845581,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 6099
+    },
+    {
+      "epoch": 0.061,
+      "grad_norm": 0.7686467170715332,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 6100
+    },
+    {
+      "epoch": 0.06101,
+      "grad_norm": 1.0454940795898438,
+      "learning_rate": 0.003,
+      "loss": 4.0944,
+      "step": 6101
+    },
+    {
+      "epoch": 0.06102,
+      "grad_norm": 1.1773312091827393,
+      "learning_rate": 0.003,
+      "loss": 4.0793,
+      "step": 6102
+    },
+    {
+      "epoch": 0.06103,
+      "grad_norm": 0.805872917175293,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 6103
+    },
+    {
+      "epoch": 0.06104,
+      "grad_norm": 0.7373183369636536,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 6104
+    },
+    {
+      "epoch": 0.06105,
+      "grad_norm": 0.6697652339935303,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 6105
+    },
+    {
+      "epoch": 0.06106,
+      "grad_norm": 0.6978040337562561,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 6106
+    },
+    {
+      "epoch": 0.06107,
+      "grad_norm": 0.6311635375022888,
+      "learning_rate": 0.003,
+      "loss": 4.0873,
+      "step": 6107
+    },
+    {
+      "epoch": 0.06108,
+      "grad_norm": 0.6601365208625793,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 6108
+    },
+    {
+      "epoch": 0.06109,
+      "grad_norm": 0.5691725015640259,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 6109
+    },
+    {
+      "epoch": 0.0611,
+      "grad_norm": 0.5813072919845581,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 6110
+    },
+    {
+      "epoch": 0.06111,
+      "grad_norm": 0.5668128728866577,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 6111
+    },
+    {
+      "epoch": 0.06112,
+      "grad_norm": 0.5596557855606079,
+      "learning_rate": 0.003,
+      "loss": 4.0566,
+      "step": 6112
+    },
+    {
+      "epoch": 0.06113,
+      "grad_norm": 0.6563438773155212,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 6113
+    },
+    {
+      "epoch": 0.06114,
+      "grad_norm": 0.7970550060272217,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 6114
+    },
+    {
+      "epoch": 0.06115,
+      "grad_norm": 0.8884240388870239,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 6115
+    },
+    {
+      "epoch": 0.06116,
+      "grad_norm": 1.06717848777771,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 6116
+    },
+    {
+      "epoch": 0.06117,
+      "grad_norm": 1.105543613433838,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 6117
+    },
+    {
+      "epoch": 0.06118,
+      "grad_norm": 0.8822982907295227,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 6118
+    },
+    {
+      "epoch": 0.06119,
+      "grad_norm": 0.8931971192359924,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 6119
+    },
+    {
+      "epoch": 0.0612,
+      "grad_norm": 0.9181088209152222,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 6120
+    },
+    {
+      "epoch": 0.06121,
+      "grad_norm": 0.8232288360595703,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 6121
+    },
+    {
+      "epoch": 0.06122,
+      "grad_norm": 0.662312924861908,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 6122
+    },
+    {
+      "epoch": 0.06123,
+      "grad_norm": 0.7112892270088196,
+      "learning_rate": 0.003,
+      "loss": 4.0733,
+      "step": 6123
+    },
+    {
+      "epoch": 0.06124,
+      "grad_norm": 0.7430440783500671,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 6124
+    },
+    {
+      "epoch": 0.06125,
+      "grad_norm": 0.8577669262886047,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 6125
+    },
+    {
+      "epoch": 0.06126,
+      "grad_norm": 0.9018658399581909,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 6126
+    },
+    {
+      "epoch": 0.06127,
+      "grad_norm": 0.9262766242027283,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 6127
+    },
+    {
+      "epoch": 0.06128,
+      "grad_norm": 0.9801653027534485,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 6128
+    },
+    {
+      "epoch": 0.06129,
+      "grad_norm": 0.9130566120147705,
+      "learning_rate": 0.003,
+      "loss": 4.0793,
+      "step": 6129
+    },
+    {
+      "epoch": 0.0613,
+      "grad_norm": 0.7119159698486328,
+      "learning_rate": 0.003,
+      "loss": 4.0808,
+      "step": 6130
+    },
+    {
+      "epoch": 0.06131,
+      "grad_norm": 0.721723198890686,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 6131
+    },
+    {
+      "epoch": 0.06132,
+      "grad_norm": 0.6934512257575989,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 6132
+    },
+    {
+      "epoch": 0.06133,
+      "grad_norm": 0.8672846555709839,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 6133
+    },
+    {
+      "epoch": 0.06134,
+      "grad_norm": 1.000223159790039,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 6134
+    },
+    {
+      "epoch": 0.06135,
+      "grad_norm": 1.0614702701568604,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 6135
+    },
+    {
+      "epoch": 0.06136,
+      "grad_norm": 0.8382878303527832,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 6136
+    },
+    {
+      "epoch": 0.06137,
+      "grad_norm": 0.7506641745567322,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 6137
+    },
+    {
+      "epoch": 0.06138,
+      "grad_norm": 0.7175465822219849,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 6138
+    },
+    {
+      "epoch": 0.06139,
+      "grad_norm": 0.7327171564102173,
+      "learning_rate": 0.003,
+      "loss": 4.0806,
+      "step": 6139
+    },
+    {
+      "epoch": 0.0614,
+      "grad_norm": 0.7524228096008301,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 6140
+    },
+    {
+      "epoch": 0.06141,
+      "grad_norm": 0.8113182783126831,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 6141
+    },
+    {
+      "epoch": 0.06142,
+      "grad_norm": 0.8624609708786011,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 6142
+    },
+    {
+      "epoch": 0.06143,
+      "grad_norm": 0.849567711353302,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 6143
+    },
+    {
+      "epoch": 0.06144,
+      "grad_norm": 0.8006253838539124,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 6144
+    },
+    {
+      "epoch": 0.06145,
+      "grad_norm": 0.6653422713279724,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 6145
+    },
+    {
+      "epoch": 0.06146,
+      "grad_norm": 0.5989100337028503,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 6146
+    },
+    {
+      "epoch": 0.06147,
+      "grad_norm": 0.6381635069847107,
+      "learning_rate": 0.003,
+      "loss": 4.0919,
+      "step": 6147
+    },
+    {
+      "epoch": 0.06148,
+      "grad_norm": 0.6760680079460144,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 6148
+    },
+    {
+      "epoch": 0.06149,
+      "grad_norm": 0.7898040413856506,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 6149
+    },
+    {
+      "epoch": 0.0615,
+      "grad_norm": 0.8890254497528076,
+      "learning_rate": 0.003,
+      "loss": 4.0776,
+      "step": 6150
+    },
+    {
+      "epoch": 0.06151,
+      "grad_norm": 0.7810690999031067,
+      "learning_rate": 0.003,
+      "loss": 4.0692,
+      "step": 6151
+    },
+    {
+      "epoch": 0.06152,
+      "grad_norm": 0.7254818677902222,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 6152
+    },
+    {
+      "epoch": 0.06153,
+      "grad_norm": 0.7281543612480164,
+      "learning_rate": 0.003,
+      "loss": 4.0799,
+      "step": 6153
+    },
+    {
+      "epoch": 0.06154,
+      "grad_norm": 0.7368656992912292,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 6154
+    },
+    {
+      "epoch": 0.06155,
+      "grad_norm": 0.6993001699447632,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 6155
+    },
+    {
+      "epoch": 0.06156,
+      "grad_norm": 0.5923306345939636,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6156
+    },
+    {
+      "epoch": 0.06157,
+      "grad_norm": 0.45892608165740967,
+      "learning_rate": 0.003,
+      "loss": 4.0656,
+      "step": 6157
+    },
+    {
+      "epoch": 0.06158,
+      "grad_norm": 0.41562631726264954,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 6158
+    },
+    {
+      "epoch": 0.06159,
+      "grad_norm": 0.5908848643302917,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 6159
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.7091723084449768,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 6160
+    },
+    {
+      "epoch": 0.06161,
+      "grad_norm": 0.7991834282875061,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 6161
+    },
+    {
+      "epoch": 0.06162,
+      "grad_norm": 0.8020153045654297,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 6162
+    },
+    {
+      "epoch": 0.06163,
+      "grad_norm": 0.6712086200714111,
+      "learning_rate": 0.003,
+      "loss": 4.0881,
+      "step": 6163
+    },
+    {
+      "epoch": 0.06164,
+      "grad_norm": 0.6103070378303528,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 6164
+    },
+    {
+      "epoch": 0.06165,
+      "grad_norm": 0.7152314782142639,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 6165
+    },
+    {
+      "epoch": 0.06166,
+      "grad_norm": 0.8041547536849976,
+      "learning_rate": 0.003,
+      "loss": 4.0787,
+      "step": 6166
+    },
+    {
+      "epoch": 0.06167,
+      "grad_norm": 0.7833403944969177,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 6167
+    },
+    {
+      "epoch": 0.06168,
+      "grad_norm": 0.7658514976501465,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 6168
+    },
+    {
+      "epoch": 0.06169,
+      "grad_norm": 0.6437884569168091,
+      "learning_rate": 0.003,
+      "loss": 4.0566,
+      "step": 6169
+    },
+    {
+      "epoch": 0.0617,
+      "grad_norm": 0.5259288549423218,
+      "learning_rate": 0.003,
+      "loss": 4.0532,
+      "step": 6170
+    },
+    {
+      "epoch": 0.06171,
+      "grad_norm": 0.6789013147354126,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 6171
+    },
+    {
+      "epoch": 0.06172,
+      "grad_norm": 0.6907355189323425,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 6172
+    },
+    {
+      "epoch": 0.06173,
+      "grad_norm": 0.8013864755630493,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 6173
+    },
+    {
+      "epoch": 0.06174,
+      "grad_norm": 0.7729394435882568,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 6174
+    },
+    {
+      "epoch": 0.06175,
+      "grad_norm": 0.6916416883468628,
+      "learning_rate": 0.003,
+      "loss": 4.0633,
+      "step": 6175
+    },
+    {
+      "epoch": 0.06176,
+      "grad_norm": 0.7110108137130737,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6176
+    },
+    {
+      "epoch": 0.06177,
+      "grad_norm": 0.7319554686546326,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 6177
+    },
+    {
+      "epoch": 0.06178,
+      "grad_norm": 0.7097826600074768,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 6178
+    },
+    {
+      "epoch": 0.06179,
+      "grad_norm": 0.6155682802200317,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 6179
+    },
+    {
+      "epoch": 0.0618,
+      "grad_norm": 0.6252792477607727,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 6180
+    },
+    {
+      "epoch": 0.06181,
+      "grad_norm": 0.6055827140808105,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 6181
+    },
+    {
+      "epoch": 0.06182,
+      "grad_norm": 0.6797299981117249,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 6182
+    },
+    {
+      "epoch": 0.06183,
+      "grad_norm": 0.740317702293396,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 6183
+    },
+    {
+      "epoch": 0.06184,
+      "grad_norm": 0.9291132092475891,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 6184
+    },
+    {
+      "epoch": 0.06185,
+      "grad_norm": 0.8949309587478638,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 6185
+    },
+    {
+      "epoch": 0.06186,
+      "grad_norm": 0.703565239906311,
+      "learning_rate": 0.003,
+      "loss": 4.0699,
+      "step": 6186
+    },
+    {
+      "epoch": 0.06187,
+      "grad_norm": 0.7581156492233276,
+      "learning_rate": 0.003,
+      "loss": 4.0771,
+      "step": 6187
+    },
+    {
+      "epoch": 0.06188,
+      "grad_norm": 0.7350788712501526,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 6188
+    },
+    {
+      "epoch": 0.06189,
+      "grad_norm": 0.7255872488021851,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 6189
+    },
+    {
+      "epoch": 0.0619,
+      "grad_norm": 0.8740925788879395,
+      "learning_rate": 0.003,
+      "loss": 4.0906,
+      "step": 6190
+    },
+    {
+      "epoch": 0.06191,
+      "grad_norm": 1.0470556020736694,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 6191
+    },
+    {
+      "epoch": 0.06192,
+      "grad_norm": 0.9209544658660889,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 6192
+    },
+    {
+      "epoch": 0.06193,
+      "grad_norm": 0.8478721976280212,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 6193
+    },
+    {
+      "epoch": 0.06194,
+      "grad_norm": 0.9317265748977661,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 6194
+    },
+    {
+      "epoch": 0.06195,
+      "grad_norm": 0.90468829870224,
+      "learning_rate": 0.003,
+      "loss": 4.0748,
+      "step": 6195
+    },
+    {
+      "epoch": 0.06196,
+      "grad_norm": 0.8252323269844055,
+      "learning_rate": 0.003,
+      "loss": 4.0835,
+      "step": 6196
+    },
+    {
+      "epoch": 0.06197,
+      "grad_norm": 0.7629353404045105,
+      "learning_rate": 0.003,
+      "loss": 4.0701,
+      "step": 6197
+    },
+    {
+      "epoch": 0.06198,
+      "grad_norm": 0.7873762845993042,
+      "learning_rate": 0.003,
+      "loss": 4.0992,
+      "step": 6198
+    },
+    {
+      "epoch": 0.06199,
+      "grad_norm": 0.9207159876823425,
+      "learning_rate": 0.003,
+      "loss": 4.096,
+      "step": 6199
+    },
+    {
+      "epoch": 0.062,
+      "grad_norm": 0.9485105872154236,
+      "learning_rate": 0.003,
+      "loss": 4.0999,
+      "step": 6200
+    },
+    {
+      "epoch": 0.06201,
+      "grad_norm": 0.8545713424682617,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 6201
+    },
+    {
+      "epoch": 0.06202,
+      "grad_norm": 0.7046213150024414,
+      "learning_rate": 0.003,
+      "loss": 4.1067,
+      "step": 6202
+    },
+    {
+      "epoch": 0.06203,
+      "grad_norm": 0.7020225524902344,
+      "learning_rate": 0.003,
+      "loss": 4.0909,
+      "step": 6203
+    },
+    {
+      "epoch": 0.06204,
+      "grad_norm": 0.7241159677505493,
+      "learning_rate": 0.003,
+      "loss": 4.0856,
+      "step": 6204
+    },
+    {
+      "epoch": 0.06205,
+      "grad_norm": 0.6984315514564514,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 6205
+    },
+    {
+      "epoch": 0.06206,
+      "grad_norm": 0.7261056900024414,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 6206
+    },
+    {
+      "epoch": 0.06207,
+      "grad_norm": 0.7522352337837219,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 6207
+    },
+    {
+      "epoch": 0.06208,
+      "grad_norm": 0.7981951832771301,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 6208
+    },
+    {
+      "epoch": 0.06209,
+      "grad_norm": 0.9211035370826721,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 6209
+    },
+    {
+      "epoch": 0.0621,
+      "grad_norm": 1.0060456991195679,
+      "learning_rate": 0.003,
+      "loss": 4.1163,
+      "step": 6210
+    },
+    {
+      "epoch": 0.06211,
+      "grad_norm": 0.993887722492218,
+      "learning_rate": 0.003,
+      "loss": 4.077,
+      "step": 6211
+    },
+    {
+      "epoch": 0.06212,
+      "grad_norm": 0.9443503618240356,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 6212
+    },
+    {
+      "epoch": 0.06213,
+      "grad_norm": 0.9290711879730225,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 6213
+    },
+    {
+      "epoch": 0.06214,
+      "grad_norm": 0.8242254257202148,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 6214
+    },
+    {
+      "epoch": 0.06215,
+      "grad_norm": 0.7361489534378052,
+      "learning_rate": 0.003,
+      "loss": 4.0798,
+      "step": 6215
+    },
+    {
+      "epoch": 0.06216,
+      "grad_norm": 0.720862627029419,
+      "learning_rate": 0.003,
+      "loss": 4.1078,
+      "step": 6216
+    },
+    {
+      "epoch": 0.06217,
+      "grad_norm": 0.612246572971344,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 6217
+    },
+    {
+      "epoch": 0.06218,
+      "grad_norm": 0.43744581937789917,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 6218
+    },
+    {
+      "epoch": 0.06219,
+      "grad_norm": 0.46168094873428345,
+      "learning_rate": 0.003,
+      "loss": 4.0765,
+      "step": 6219
+    },
+    {
+      "epoch": 0.0622,
+      "grad_norm": 0.5252222418785095,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 6220
+    },
+    {
+      "epoch": 0.06221,
+      "grad_norm": 0.6057314276695251,
+      "learning_rate": 0.003,
+      "loss": 4.0747,
+      "step": 6221
+    },
+    {
+      "epoch": 0.06222,
+      "grad_norm": 0.6545212268829346,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 6222
+    },
+    {
+      "epoch": 0.06223,
+      "grad_norm": 0.6369545459747314,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 6223
+    },
+    {
+      "epoch": 0.06224,
+      "grad_norm": 0.69230717420578,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 6224
+    },
+    {
+      "epoch": 0.06225,
+      "grad_norm": 0.7959716320037842,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 6225
+    },
+    {
+      "epoch": 0.06226,
+      "grad_norm": 0.9329782724380493,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6226
+    },
+    {
+      "epoch": 0.06227,
+      "grad_norm": 0.9592517018318176,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 6227
+    },
+    {
+      "epoch": 0.06228,
+      "grad_norm": 0.8127028942108154,
+      "learning_rate": 0.003,
+      "loss": 4.0699,
+      "step": 6228
+    },
+    {
+      "epoch": 0.06229,
+      "grad_norm": 0.5880432724952698,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 6229
+    },
+    {
+      "epoch": 0.0623,
+      "grad_norm": 0.5531348586082458,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 6230
+    },
+    {
+      "epoch": 0.06231,
+      "grad_norm": 0.6375777721405029,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 6231
+    },
+    {
+      "epoch": 0.06232,
+      "grad_norm": 0.6568375825881958,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 6232
+    },
+    {
+      "epoch": 0.06233,
+      "grad_norm": 0.6916172504425049,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 6233
+    },
+    {
+      "epoch": 0.06234,
+      "grad_norm": 0.6829954981803894,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 6234
+    },
+    {
+      "epoch": 0.06235,
+      "grad_norm": 0.6556631326675415,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 6235
+    },
+    {
+      "epoch": 0.06236,
+      "grad_norm": 0.6663537621498108,
+      "learning_rate": 0.003,
+      "loss": 4.0641,
+      "step": 6236
+    },
+    {
+      "epoch": 0.06237,
+      "grad_norm": 0.6312961578369141,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 6237
+    },
+    {
+      "epoch": 0.06238,
+      "grad_norm": 0.5864769220352173,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 6238
+    },
+    {
+      "epoch": 0.06239,
+      "grad_norm": 0.607796311378479,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 6239
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.6244125962257385,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 6240
+    },
+    {
+      "epoch": 0.06241,
+      "grad_norm": 0.5208604335784912,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 6241
+    },
+    {
+      "epoch": 0.06242,
+      "grad_norm": 0.570060670375824,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 6242
+    },
+    {
+      "epoch": 0.06243,
+      "grad_norm": 0.5775870084762573,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 6243
+    },
+    {
+      "epoch": 0.06244,
+      "grad_norm": 0.6506668329238892,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 6244
+    },
+    {
+      "epoch": 0.06245,
+      "grad_norm": 0.7539433836936951,
+      "learning_rate": 0.003,
+      "loss": 4.0562,
+      "step": 6245
+    },
+    {
+      "epoch": 0.06246,
+      "grad_norm": 0.9184788465499878,
+      "learning_rate": 0.003,
+      "loss": 4.0786,
+      "step": 6246
+    },
+    {
+      "epoch": 0.06247,
+      "grad_norm": 1.0040006637573242,
+      "learning_rate": 0.003,
+      "loss": 4.066,
+      "step": 6247
+    },
+    {
+      "epoch": 0.06248,
+      "grad_norm": 0.9109286069869995,
+      "learning_rate": 0.003,
+      "loss": 4.064,
+      "step": 6248
+    },
+    {
+      "epoch": 0.06249,
+      "grad_norm": 0.7625318765640259,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 6249
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.6695582270622253,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 6250
+    },
+    {
+      "epoch": 0.06251,
+      "grad_norm": 0.6111177802085876,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 6251
+    },
+    {
+      "epoch": 0.06252,
+      "grad_norm": 0.6622089743614197,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 6252
+    },
+    {
+      "epoch": 0.06253,
+      "grad_norm": 0.6060909628868103,
+      "learning_rate": 0.003,
+      "loss": 4.0584,
+      "step": 6253
+    },
+    {
+      "epoch": 0.06254,
+      "grad_norm": 0.5808566212654114,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 6254
+    },
+    {
+      "epoch": 0.06255,
+      "grad_norm": 0.5286803245544434,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 6255
+    },
+    {
+      "epoch": 0.06256,
+      "grad_norm": 0.5413640141487122,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 6256
+    },
+    {
+      "epoch": 0.06257,
+      "grad_norm": 0.6904173493385315,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 6257
+    },
+    {
+      "epoch": 0.06258,
+      "grad_norm": 0.7716007828712463,
+      "learning_rate": 0.003,
+      "loss": 4.0772,
+      "step": 6258
+    },
+    {
+      "epoch": 0.06259,
+      "grad_norm": 0.9134862422943115,
+      "learning_rate": 0.003,
+      "loss": 4.0953,
+      "step": 6259
+    },
+    {
+      "epoch": 0.0626,
+      "grad_norm": 0.9206618070602417,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 6260
+    },
+    {
+      "epoch": 0.06261,
+      "grad_norm": 0.776254415512085,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 6261
+    },
+    {
+      "epoch": 0.06262,
+      "grad_norm": 0.7170522212982178,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 6262
+    },
+    {
+      "epoch": 0.06263,
+      "grad_norm": 0.7091271281242371,
+      "learning_rate": 0.003,
+      "loss": 4.0845,
+      "step": 6263
+    },
+    {
+      "epoch": 0.06264,
+      "grad_norm": 0.7496375441551208,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 6264
+    },
+    {
+      "epoch": 0.06265,
+      "grad_norm": 0.8993237018585205,
+      "learning_rate": 0.003,
+      "loss": 4.0858,
+      "step": 6265
+    },
+    {
+      "epoch": 0.06266,
+      "grad_norm": 0.9477318525314331,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 6266
+    },
+    {
+      "epoch": 0.06267,
+      "grad_norm": 1.139681339263916,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 6267
+    },
+    {
+      "epoch": 0.06268,
+      "grad_norm": 1.0855191946029663,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 6268
+    },
+    {
+      "epoch": 0.06269,
+      "grad_norm": 0.8395910859107971,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 6269
+    },
+    {
+      "epoch": 0.0627,
+      "grad_norm": 0.9118239879608154,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 6270
+    },
+    {
+      "epoch": 0.06271,
+      "grad_norm": 0.8857484459877014,
+      "learning_rate": 0.003,
+      "loss": 4.0885,
+      "step": 6271
+    },
+    {
+      "epoch": 0.06272,
+      "grad_norm": 0.8165472745895386,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 6272
+    },
+    {
+      "epoch": 0.06273,
+      "grad_norm": 0.8503358960151672,
+      "learning_rate": 0.003,
+      "loss": 4.0945,
+      "step": 6273
+    },
+    {
+      "epoch": 0.06274,
+      "grad_norm": 0.9717170000076294,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 6274
+    },
+    {
+      "epoch": 0.06275,
+      "grad_norm": 0.9205477833747864,
+      "learning_rate": 0.003,
+      "loss": 4.0664,
+      "step": 6275
+    },
+    {
+      "epoch": 0.06276,
+      "grad_norm": 0.8088390827178955,
+      "learning_rate": 0.003,
+      "loss": 4.0805,
+      "step": 6276
+    },
+    {
+      "epoch": 0.06277,
+      "grad_norm": 0.6606585383415222,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 6277
+    },
+    {
+      "epoch": 0.06278,
+      "grad_norm": 0.5426492094993591,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 6278
+    },
+    {
+      "epoch": 0.06279,
+      "grad_norm": 0.5785556435585022,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 6279
+    },
+    {
+      "epoch": 0.0628,
+      "grad_norm": 0.6185020208358765,
+      "learning_rate": 0.003,
+      "loss": 4.0584,
+      "step": 6280
+    },
+    {
+      "epoch": 0.06281,
+      "grad_norm": 0.671174943447113,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 6281
+    },
+    {
+      "epoch": 0.06282,
+      "grad_norm": 0.7437264323234558,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 6282
+    },
+    {
+      "epoch": 0.06283,
+      "grad_norm": 0.8574413657188416,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 6283
+    },
+    {
+      "epoch": 0.06284,
+      "grad_norm": 0.8529795408248901,
+      "learning_rate": 0.003,
+      "loss": 4.0717,
+      "step": 6284
+    },
+    {
+      "epoch": 0.06285,
+      "grad_norm": 0.6762763261795044,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 6285
+    },
+    {
+      "epoch": 0.06286,
+      "grad_norm": 0.6096449494361877,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 6286
+    },
+    {
+      "epoch": 0.06287,
+      "grad_norm": 0.625961184501648,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 6287
+    },
+    {
+      "epoch": 0.06288,
+      "grad_norm": 0.5842055678367615,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 6288
+    },
+    {
+      "epoch": 0.06289,
+      "grad_norm": 0.5878425240516663,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 6289
+    },
+    {
+      "epoch": 0.0629,
+      "grad_norm": 0.5359255075454712,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 6290
+    },
+    {
+      "epoch": 0.06291,
+      "grad_norm": 0.5622683167457581,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 6291
+    },
+    {
+      "epoch": 0.06292,
+      "grad_norm": 0.6570186614990234,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 6292
+    },
+    {
+      "epoch": 0.06293,
+      "grad_norm": 0.709516704082489,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 6293
+    },
+    {
+      "epoch": 0.06294,
+      "grad_norm": 0.709321141242981,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 6294
+    },
+    {
+      "epoch": 0.06295,
+      "grad_norm": 0.702165424823761,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 6295
+    },
+    {
+      "epoch": 0.06296,
+      "grad_norm": 0.5981850028038025,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 6296
+    },
+    {
+      "epoch": 0.06297,
+      "grad_norm": 0.5463345050811768,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 6297
+    },
+    {
+      "epoch": 0.06298,
+      "grad_norm": 0.5999792218208313,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 6298
+    },
+    {
+      "epoch": 0.06299,
+      "grad_norm": 0.7810006737709045,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 6299
+    },
+    {
+      "epoch": 0.063,
+      "grad_norm": 0.8830216526985168,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 6300
+    },
+    {
+      "epoch": 0.06301,
+      "grad_norm": 0.9055691361427307,
+      "learning_rate": 0.003,
+      "loss": 4.0873,
+      "step": 6301
+    },
+    {
+      "epoch": 0.06302,
+      "grad_norm": 0.9385949969291687,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 6302
+    },
+    {
+      "epoch": 0.06303,
+      "grad_norm": 0.7440510988235474,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 6303
+    },
+    {
+      "epoch": 0.06304,
+      "grad_norm": 0.6754160523414612,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 6304
+    },
+    {
+      "epoch": 0.06305,
+      "grad_norm": 0.7607747316360474,
+      "learning_rate": 0.003,
+      "loss": 4.0918,
+      "step": 6305
+    },
+    {
+      "epoch": 0.06306,
+      "grad_norm": 0.8998769521713257,
+      "learning_rate": 0.003,
+      "loss": 4.0562,
+      "step": 6306
+    },
+    {
+      "epoch": 0.06307,
+      "grad_norm": 0.9299960136413574,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 6307
+    },
+    {
+      "epoch": 0.06308,
+      "grad_norm": 0.8129688501358032,
+      "learning_rate": 0.003,
+      "loss": 4.0745,
+      "step": 6308
+    },
+    {
+      "epoch": 0.06309,
+      "grad_norm": 0.7249149084091187,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6309
+    },
+    {
+      "epoch": 0.0631,
+      "grad_norm": 0.827510416507721,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 6310
+    },
+    {
+      "epoch": 0.06311,
+      "grad_norm": 0.8704838752746582,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 6311
+    },
+    {
+      "epoch": 0.06312,
+      "grad_norm": 0.8416996598243713,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 6312
+    },
+    {
+      "epoch": 0.06313,
+      "grad_norm": 0.7403684854507446,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 6313
+    },
+    {
+      "epoch": 0.06314,
+      "grad_norm": 0.6789849400520325,
+      "learning_rate": 0.003,
+      "loss": 4.0824,
+      "step": 6314
+    },
+    {
+      "epoch": 0.06315,
+      "grad_norm": 0.6259597539901733,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 6315
+    },
+    {
+      "epoch": 0.06316,
+      "grad_norm": 0.5615050196647644,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 6316
+    },
+    {
+      "epoch": 0.06317,
+      "grad_norm": 0.6137986183166504,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 6317
+    },
+    {
+      "epoch": 0.06318,
+      "grad_norm": 0.7117832899093628,
+      "learning_rate": 0.003,
+      "loss": 4.0965,
+      "step": 6318
+    },
+    {
+      "epoch": 0.06319,
+      "grad_norm": 0.9356580972671509,
+      "learning_rate": 0.003,
+      "loss": 4.1038,
+      "step": 6319
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 1.1936414241790771,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 6320
+    },
+    {
+      "epoch": 0.06321,
+      "grad_norm": 0.993614673614502,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 6321
+    },
+    {
+      "epoch": 0.06322,
+      "grad_norm": 0.857085645198822,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 6322
+    },
+    {
+      "epoch": 0.06323,
+      "grad_norm": 0.8355134725570679,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 6323
+    },
+    {
+      "epoch": 0.06324,
+      "grad_norm": 0.8579593896865845,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 6324
+    },
+    {
+      "epoch": 0.06325,
+      "grad_norm": 0.8484706878662109,
+      "learning_rate": 0.003,
+      "loss": 4.0656,
+      "step": 6325
+    },
+    {
+      "epoch": 0.06326,
+      "grad_norm": 0.689569354057312,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 6326
+    },
+    {
+      "epoch": 0.06327,
+      "grad_norm": 0.5412641763687134,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 6327
+    },
+    {
+      "epoch": 0.06328,
+      "grad_norm": 0.5388414859771729,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 6328
+    },
+    {
+      "epoch": 0.06329,
+      "grad_norm": 0.5315625071525574,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 6329
+    },
+    {
+      "epoch": 0.0633,
+      "grad_norm": 0.4952152669429779,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 6330
+    },
+    {
+      "epoch": 0.06331,
+      "grad_norm": 0.5569480657577515,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 6331
+    },
+    {
+      "epoch": 0.06332,
+      "grad_norm": 0.5562824606895447,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 6332
+    },
+    {
+      "epoch": 0.06333,
+      "grad_norm": 0.6068150997161865,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 6333
+    },
+    {
+      "epoch": 0.06334,
+      "grad_norm": 0.7447565197944641,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 6334
+    },
+    {
+      "epoch": 0.06335,
+      "grad_norm": 0.7546894550323486,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 6335
+    },
+    {
+      "epoch": 0.06336,
+      "grad_norm": 0.8086697459220886,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 6336
+    },
+    {
+      "epoch": 0.06337,
+      "grad_norm": 0.8379433751106262,
+      "learning_rate": 0.003,
+      "loss": 4.0836,
+      "step": 6337
+    },
+    {
+      "epoch": 0.06338,
+      "grad_norm": 0.8580986857414246,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 6338
+    },
+    {
+      "epoch": 0.06339,
+      "grad_norm": 0.7837094664573669,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 6339
+    },
+    {
+      "epoch": 0.0634,
+      "grad_norm": 0.6586499810218811,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 6340
+    },
+    {
+      "epoch": 0.06341,
+      "grad_norm": 0.6290658116340637,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 6341
+    },
+    {
+      "epoch": 0.06342,
+      "grad_norm": 0.6797477006912231,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 6342
+    },
+    {
+      "epoch": 0.06343,
+      "grad_norm": 0.64532470703125,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 6343
+    },
+    {
+      "epoch": 0.06344,
+      "grad_norm": 0.5823304653167725,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 6344
+    },
+    {
+      "epoch": 0.06345,
+      "grad_norm": 0.6109977960586548,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 6345
+    },
+    {
+      "epoch": 0.06346,
+      "grad_norm": 0.72659832239151,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6346
+    },
+    {
+      "epoch": 0.06347,
+      "grad_norm": 0.9619790315628052,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 6347
+    },
+    {
+      "epoch": 0.06348,
+      "grad_norm": 0.9899624586105347,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 6348
+    },
+    {
+      "epoch": 0.06349,
+      "grad_norm": 0.9189885854721069,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6349
+    },
+    {
+      "epoch": 0.0635,
+      "grad_norm": 0.8876418471336365,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 6350
+    },
+    {
+      "epoch": 0.06351,
+      "grad_norm": 0.7306442260742188,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 6351
+    },
+    {
+      "epoch": 0.06352,
+      "grad_norm": 0.6585521697998047,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 6352
+    },
+    {
+      "epoch": 0.06353,
+      "grad_norm": 0.5543274879455566,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 6353
+    },
+    {
+      "epoch": 0.06354,
+      "grad_norm": 0.5330720543861389,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 6354
+    },
+    {
+      "epoch": 0.06355,
+      "grad_norm": 0.5855516791343689,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 6355
+    },
+    {
+      "epoch": 0.06356,
+      "grad_norm": 0.5944929122924805,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 6356
+    },
+    {
+      "epoch": 0.06357,
+      "grad_norm": 0.5652433633804321,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 6357
+    },
+    {
+      "epoch": 0.06358,
+      "grad_norm": 0.5926536917686462,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 6358
+    },
+    {
+      "epoch": 0.06359,
+      "grad_norm": 0.6881373524665833,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 6359
+    },
+    {
+      "epoch": 0.0636,
+      "grad_norm": 0.8648481369018555,
+      "learning_rate": 0.003,
+      "loss": 4.0736,
+      "step": 6360
+    },
+    {
+      "epoch": 0.06361,
+      "grad_norm": 0.8661714196205139,
+      "learning_rate": 0.003,
+      "loss": 4.0661,
+      "step": 6361
+    },
+    {
+      "epoch": 0.06362,
+      "grad_norm": 0.7547315955162048,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 6362
+    },
+    {
+      "epoch": 0.06363,
+      "grad_norm": 0.6218990683555603,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 6363
+    },
+    {
+      "epoch": 0.06364,
+      "grad_norm": 0.6733616590499878,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 6364
+    },
+    {
+      "epoch": 0.06365,
+      "grad_norm": 0.848198652267456,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 6365
+    },
+    {
+      "epoch": 0.06366,
+      "grad_norm": 1.0001298189163208,
+      "learning_rate": 0.003,
+      "loss": 4.0782,
+      "step": 6366
+    },
+    {
+      "epoch": 0.06367,
+      "grad_norm": 1.0065937042236328,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 6367
+    },
+    {
+      "epoch": 0.06368,
+      "grad_norm": 0.7900940179824829,
+      "learning_rate": 0.003,
+      "loss": 4.0904,
+      "step": 6368
+    },
+    {
+      "epoch": 0.06369,
+      "grad_norm": 0.6981412768363953,
+      "learning_rate": 0.003,
+      "loss": 4.0917,
+      "step": 6369
+    },
+    {
+      "epoch": 0.0637,
+      "grad_norm": 0.7063578367233276,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 6370
+    },
+    {
+      "epoch": 0.06371,
+      "grad_norm": 0.7151117920875549,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 6371
+    },
+    {
+      "epoch": 0.06372,
+      "grad_norm": 0.6243462562561035,
+      "learning_rate": 0.003,
+      "loss": 4.0695,
+      "step": 6372
+    },
+    {
+      "epoch": 0.06373,
+      "grad_norm": 0.6759061813354492,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 6373
+    },
+    {
+      "epoch": 0.06374,
+      "grad_norm": 0.7649587988853455,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 6374
+    },
+    {
+      "epoch": 0.06375,
+      "grad_norm": 0.8787922859191895,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 6375
+    },
+    {
+      "epoch": 0.06376,
+      "grad_norm": 0.8679553866386414,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 6376
+    },
+    {
+      "epoch": 0.06377,
+      "grad_norm": 0.7838705778121948,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 6377
+    },
+    {
+      "epoch": 0.06378,
+      "grad_norm": 0.6770272850990295,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 6378
+    },
+    {
+      "epoch": 0.06379,
+      "grad_norm": 0.7824772000312805,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 6379
+    },
+    {
+      "epoch": 0.0638,
+      "grad_norm": 0.9476855993270874,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 6380
+    },
+    {
+      "epoch": 0.06381,
+      "grad_norm": 0.968838095664978,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 6381
+    },
+    {
+      "epoch": 0.06382,
+      "grad_norm": 0.7842888832092285,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 6382
+    },
+    {
+      "epoch": 0.06383,
+      "grad_norm": 0.6376921534538269,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 6383
+    },
+    {
+      "epoch": 0.06384,
+      "grad_norm": 0.5353909134864807,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 6384
+    },
+    {
+      "epoch": 0.06385,
+      "grad_norm": 0.5485050082206726,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 6385
+    },
+    {
+      "epoch": 0.06386,
+      "grad_norm": 0.6523105502128601,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 6386
+    },
+    {
+      "epoch": 0.06387,
+      "grad_norm": 0.7976499795913696,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 6387
+    },
+    {
+      "epoch": 0.06388,
+      "grad_norm": 0.8359301686286926,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 6388
+    },
+    {
+      "epoch": 0.06389,
+      "grad_norm": 0.7813435792922974,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 6389
+    },
+    {
+      "epoch": 0.0639,
+      "grad_norm": 0.7872916460037231,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 6390
+    },
+    {
+      "epoch": 0.06391,
+      "grad_norm": 0.7407468557357788,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 6391
+    },
+    {
+      "epoch": 0.06392,
+      "grad_norm": 0.8617271184921265,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 6392
+    },
+    {
+      "epoch": 0.06393,
+      "grad_norm": 0.8929271101951599,
+      "learning_rate": 0.003,
+      "loss": 4.0669,
+      "step": 6393
+    },
+    {
+      "epoch": 0.06394,
+      "grad_norm": 0.8621600270271301,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6394
+    },
+    {
+      "epoch": 0.06395,
+      "grad_norm": 0.8182836771011353,
+      "learning_rate": 0.003,
+      "loss": 4.0881,
+      "step": 6395
+    },
+    {
+      "epoch": 0.06396,
+      "grad_norm": 0.644528329372406,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 6396
+    },
+    {
+      "epoch": 0.06397,
+      "grad_norm": 0.5910365581512451,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 6397
+    },
+    {
+      "epoch": 0.06398,
+      "grad_norm": 0.6243775486946106,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 6398
+    },
+    {
+      "epoch": 0.06399,
+      "grad_norm": 0.6281679272651672,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 6399
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.7241624593734741,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 6400
+    },
+    {
+      "epoch": 0.06401,
+      "grad_norm": 0.8978641033172607,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 6401
+    },
+    {
+      "epoch": 0.06402,
+      "grad_norm": 0.8401793837547302,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 6402
+    },
+    {
+      "epoch": 0.06403,
+      "grad_norm": 0.704843282699585,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 6403
+    },
+    {
+      "epoch": 0.06404,
+      "grad_norm": 0.7263625264167786,
+      "learning_rate": 0.003,
+      "loss": 4.1055,
+      "step": 6404
+    },
+    {
+      "epoch": 0.06405,
+      "grad_norm": 0.7337961196899414,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 6405
+    },
+    {
+      "epoch": 0.06406,
+      "grad_norm": 0.729682445526123,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 6406
+    },
+    {
+      "epoch": 0.06407,
+      "grad_norm": 0.7377498149871826,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6407
+    },
+    {
+      "epoch": 0.06408,
+      "grad_norm": 0.7378864884376526,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 6408
+    },
+    {
+      "epoch": 0.06409,
+      "grad_norm": 0.6734524369239807,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 6409
+    },
+    {
+      "epoch": 0.0641,
+      "grad_norm": 0.5793408751487732,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 6410
+    },
+    {
+      "epoch": 0.06411,
+      "grad_norm": 0.5833590626716614,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6411
+    },
+    {
+      "epoch": 0.06412,
+      "grad_norm": 0.6013709902763367,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 6412
+    },
+    {
+      "epoch": 0.06413,
+      "grad_norm": 0.7732909321784973,
+      "learning_rate": 0.003,
+      "loss": 4.0828,
+      "step": 6413
+    },
+    {
+      "epoch": 0.06414,
+      "grad_norm": 0.8160001039505005,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 6414
+    },
+    {
+      "epoch": 0.06415,
+      "grad_norm": 0.7408420443534851,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 6415
+    },
+    {
+      "epoch": 0.06416,
+      "grad_norm": 0.7382277846336365,
+      "learning_rate": 0.003,
+      "loss": 4.0748,
+      "step": 6416
+    },
+    {
+      "epoch": 0.06417,
+      "grad_norm": 0.7491039037704468,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 6417
+    },
+    {
+      "epoch": 0.06418,
+      "grad_norm": 0.8163646459579468,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 6418
+    },
+    {
+      "epoch": 0.06419,
+      "grad_norm": 0.7435489296913147,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 6419
+    },
+    {
+      "epoch": 0.0642,
+      "grad_norm": 0.604588508605957,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 6420
+    },
+    {
+      "epoch": 0.06421,
+      "grad_norm": 0.532421350479126,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 6421
+    },
+    {
+      "epoch": 0.06422,
+      "grad_norm": 0.5848860144615173,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 6422
+    },
+    {
+      "epoch": 0.06423,
+      "grad_norm": 0.6077694296836853,
+      "learning_rate": 0.003,
+      "loss": 4.0665,
+      "step": 6423
+    },
+    {
+      "epoch": 0.06424,
+      "grad_norm": 0.5964341163635254,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 6424
+    },
+    {
+      "epoch": 0.06425,
+      "grad_norm": 0.5771605968475342,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 6425
+    },
+    {
+      "epoch": 0.06426,
+      "grad_norm": 0.5373570919036865,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 6426
+    },
+    {
+      "epoch": 0.06427,
+      "grad_norm": 0.417835533618927,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 6427
+    },
+    {
+      "epoch": 0.06428,
+      "grad_norm": 0.4400598406791687,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6428
+    },
+    {
+      "epoch": 0.06429,
+      "grad_norm": 0.437053918838501,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 6429
+    },
+    {
+      "epoch": 0.0643,
+      "grad_norm": 0.5374695658683777,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 6430
+    },
+    {
+      "epoch": 0.06431,
+      "grad_norm": 0.7007521390914917,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 6431
+    },
+    {
+      "epoch": 0.06432,
+      "grad_norm": 0.9336084723472595,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 6432
+    },
+    {
+      "epoch": 0.06433,
+      "grad_norm": 1.0574723482131958,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 6433
+    },
+    {
+      "epoch": 0.06434,
+      "grad_norm": 0.8598960041999817,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 6434
+    },
+    {
+      "epoch": 0.06435,
+      "grad_norm": 0.8860291838645935,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 6435
+    },
+    {
+      "epoch": 0.06436,
+      "grad_norm": 0.9345943927764893,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 6436
+    },
+    {
+      "epoch": 0.06437,
+      "grad_norm": 1.1661456823349,
+      "learning_rate": 0.003,
+      "loss": 4.0864,
+      "step": 6437
+    },
+    {
+      "epoch": 0.06438,
+      "grad_norm": 1.0386587381362915,
+      "learning_rate": 0.003,
+      "loss": 4.0769,
+      "step": 6438
+    },
+    {
+      "epoch": 0.06439,
+      "grad_norm": 1.1419264078140259,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 6439
+    },
+    {
+      "epoch": 0.0644,
+      "grad_norm": 1.1448736190795898,
+      "learning_rate": 0.003,
+      "loss": 4.077,
+      "step": 6440
+    },
+    {
+      "epoch": 0.06441,
+      "grad_norm": 1.128818154335022,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 6441
+    },
+    {
+      "epoch": 0.06442,
+      "grad_norm": 0.9655861854553223,
+      "learning_rate": 0.003,
+      "loss": 4.0746,
+      "step": 6442
+    },
+    {
+      "epoch": 0.06443,
+      "grad_norm": 0.9556558132171631,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 6443
+    },
+    {
+      "epoch": 0.06444,
+      "grad_norm": 0.9255332350730896,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 6444
+    },
+    {
+      "epoch": 0.06445,
+      "grad_norm": 1.0121527910232544,
+      "learning_rate": 0.003,
+      "loss": 4.0899,
+      "step": 6445
+    },
+    {
+      "epoch": 0.06446,
+      "grad_norm": 0.8829848170280457,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 6446
+    },
+    {
+      "epoch": 0.06447,
+      "grad_norm": 0.8952497839927673,
+      "learning_rate": 0.003,
+      "loss": 4.0853,
+      "step": 6447
+    },
+    {
+      "epoch": 0.06448,
+      "grad_norm": 0.9387702345848083,
+      "learning_rate": 0.003,
+      "loss": 4.0948,
+      "step": 6448
+    },
+    {
+      "epoch": 0.06449,
+      "grad_norm": 0.8153852224349976,
+      "learning_rate": 0.003,
+      "loss": 4.1156,
+      "step": 6449
+    },
+    {
+      "epoch": 0.0645,
+      "grad_norm": 0.814096212387085,
+      "learning_rate": 0.003,
+      "loss": 4.0645,
+      "step": 6450
+    },
+    {
+      "epoch": 0.06451,
+      "grad_norm": 0.7601085901260376,
+      "learning_rate": 0.003,
+      "loss": 4.1072,
+      "step": 6451
+    },
+    {
+      "epoch": 0.06452,
+      "grad_norm": 0.7431148886680603,
+      "learning_rate": 0.003,
+      "loss": 4.0681,
+      "step": 6452
+    },
+    {
+      "epoch": 0.06453,
+      "grad_norm": 0.7803151607513428,
+      "learning_rate": 0.003,
+      "loss": 4.1088,
+      "step": 6453
+    },
+    {
+      "epoch": 0.06454,
+      "grad_norm": 0.7259302735328674,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 6454
+    },
+    {
+      "epoch": 0.06455,
+      "grad_norm": 0.6496488451957703,
+      "learning_rate": 0.003,
+      "loss": 4.0712,
+      "step": 6455
+    },
+    {
+      "epoch": 0.06456,
+      "grad_norm": 0.719208836555481,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 6456
+    },
+    {
+      "epoch": 0.06457,
+      "grad_norm": 0.8120383620262146,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 6457
+    },
+    {
+      "epoch": 0.06458,
+      "grad_norm": 0.9060261845588684,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 6458
+    },
+    {
+      "epoch": 0.06459,
+      "grad_norm": 0.9373885989189148,
+      "learning_rate": 0.003,
+      "loss": 4.0851,
+      "step": 6459
+    },
+    {
+      "epoch": 0.0646,
+      "grad_norm": 0.9116479158401489,
+      "learning_rate": 0.003,
+      "loss": 4.0641,
+      "step": 6460
+    },
+    {
+      "epoch": 0.06461,
+      "grad_norm": 0.7550241947174072,
+      "learning_rate": 0.003,
+      "loss": 4.1022,
+      "step": 6461
+    },
+    {
+      "epoch": 0.06462,
+      "grad_norm": 0.5618889927864075,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 6462
+    },
+    {
+      "epoch": 0.06463,
+      "grad_norm": 0.5237634181976318,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 6463
+    },
+    {
+      "epoch": 0.06464,
+      "grad_norm": 0.5218420624732971,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 6464
+    },
+    {
+      "epoch": 0.06465,
+      "grad_norm": 0.6775352954864502,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 6465
+    },
+    {
+      "epoch": 0.06466,
+      "grad_norm": 0.7687344551086426,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 6466
+    },
+    {
+      "epoch": 0.06467,
+      "grad_norm": 0.7261245250701904,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 6467
+    },
+    {
+      "epoch": 0.06468,
+      "grad_norm": 0.5774818062782288,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 6468
+    },
+    {
+      "epoch": 0.06469,
+      "grad_norm": 0.49446791410446167,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 6469
+    },
+    {
+      "epoch": 0.0647,
+      "grad_norm": 0.5678610801696777,
+      "learning_rate": 0.003,
+      "loss": 4.0691,
+      "step": 6470
+    },
+    {
+      "epoch": 0.06471,
+      "grad_norm": 0.586567223072052,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 6471
+    },
+    {
+      "epoch": 0.06472,
+      "grad_norm": 0.5532718896865845,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 6472
+    },
+    {
+      "epoch": 0.06473,
+      "grad_norm": 0.5075469613075256,
+      "learning_rate": 0.003,
+      "loss": 4.0599,
+      "step": 6473
+    },
+    {
+      "epoch": 0.06474,
+      "grad_norm": 0.4478394389152527,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 6474
+    },
+    {
+      "epoch": 0.06475,
+      "grad_norm": 0.5158814191818237,
+      "learning_rate": 0.003,
+      "loss": 4.0741,
+      "step": 6475
+    },
+    {
+      "epoch": 0.06476,
+      "grad_norm": 0.5592943429946899,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 6476
+    },
+    {
+      "epoch": 0.06477,
+      "grad_norm": 0.6047767400741577,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 6477
+    },
+    {
+      "epoch": 0.06478,
+      "grad_norm": 0.637370765209198,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 6478
+    },
+    {
+      "epoch": 0.06479,
+      "grad_norm": 0.6284873485565186,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 6479
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.7605395913124084,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 6480
+    },
+    {
+      "epoch": 0.06481,
+      "grad_norm": 0.7899333238601685,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 6481
+    },
+    {
+      "epoch": 0.06482,
+      "grad_norm": 0.8250449299812317,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 6482
+    },
+    {
+      "epoch": 0.06483,
+      "grad_norm": 0.7915768623352051,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 6483
+    },
+    {
+      "epoch": 0.06484,
+      "grad_norm": 0.7572276592254639,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 6484
+    },
+    {
+      "epoch": 0.06485,
+      "grad_norm": 0.75547856092453,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 6485
+    },
+    {
+      "epoch": 0.06486,
+      "grad_norm": 0.7201529145240784,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 6486
+    },
+    {
+      "epoch": 0.06487,
+      "grad_norm": 0.6418141722679138,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 6487
+    },
+    {
+      "epoch": 0.06488,
+      "grad_norm": 0.5657222270965576,
+      "learning_rate": 0.003,
+      "loss": 4.0665,
+      "step": 6488
+    },
+    {
+      "epoch": 0.06489,
+      "grad_norm": 0.5981627106666565,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 6489
+    },
+    {
+      "epoch": 0.0649,
+      "grad_norm": 0.6097392439842224,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 6490
+    },
+    {
+      "epoch": 0.06491,
+      "grad_norm": 0.6370054483413696,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 6491
+    },
+    {
+      "epoch": 0.06492,
+      "grad_norm": 0.6754859685897827,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 6492
+    },
+    {
+      "epoch": 0.06493,
+      "grad_norm": 0.6854249835014343,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 6493
+    },
+    {
+      "epoch": 0.06494,
+      "grad_norm": 0.6282760500907898,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6494
+    },
+    {
+      "epoch": 0.06495,
+      "grad_norm": 0.5374482274055481,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 6495
+    },
+    {
+      "epoch": 0.06496,
+      "grad_norm": 0.5668441653251648,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 6496
+    },
+    {
+      "epoch": 0.06497,
+      "grad_norm": 0.5741870403289795,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 6497
+    },
+    {
+      "epoch": 0.06498,
+      "grad_norm": 0.6695168018341064,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 6498
+    },
+    {
+      "epoch": 0.06499,
+      "grad_norm": 0.7330256104469299,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 6499
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.9226296544075012,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 6500
+    },
+    {
+      "epoch": 0.06501,
+      "grad_norm": 1.1895571947097778,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 6501
+    },
+    {
+      "epoch": 0.06502,
+      "grad_norm": 0.9606831073760986,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 6502
+    },
+    {
+      "epoch": 0.06503,
+      "grad_norm": 0.7914904952049255,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 6503
+    },
+    {
+      "epoch": 0.06504,
+      "grad_norm": 0.8200267553329468,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 6504
+    },
+    {
+      "epoch": 0.06505,
+      "grad_norm": 0.8769012093544006,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 6505
+    },
+    {
+      "epoch": 0.06506,
+      "grad_norm": 1.0376636981964111,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 6506
+    },
+    {
+      "epoch": 0.06507,
+      "grad_norm": 1.039513349533081,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 6507
+    },
+    {
+      "epoch": 0.06508,
+      "grad_norm": 0.9043866395950317,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 6508
+    },
+    {
+      "epoch": 0.06509,
+      "grad_norm": 0.9053654670715332,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 6509
+    },
+    {
+      "epoch": 0.0651,
+      "grad_norm": 0.7699447274208069,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 6510
+    },
+    {
+      "epoch": 0.06511,
+      "grad_norm": 0.7321429252624512,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 6511
+    },
+    {
+      "epoch": 0.06512,
+      "grad_norm": 0.7014932036399841,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 6512
+    },
+    {
+      "epoch": 0.06513,
+      "grad_norm": 0.8323968648910522,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 6513
+    },
+    {
+      "epoch": 0.06514,
+      "grad_norm": 0.9952423572540283,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6514
+    },
+    {
+      "epoch": 0.06515,
+      "grad_norm": 1.039555311203003,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 6515
+    },
+    {
+      "epoch": 0.06516,
+      "grad_norm": 0.7747417688369751,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 6516
+    },
+    {
+      "epoch": 0.06517,
+      "grad_norm": 0.6039211750030518,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 6517
+    },
+    {
+      "epoch": 0.06518,
+      "grad_norm": 0.619925856590271,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 6518
+    },
+    {
+      "epoch": 0.06519,
+      "grad_norm": 0.635144054889679,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 6519
+    },
+    {
+      "epoch": 0.0652,
+      "grad_norm": 0.5885560512542725,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 6520
+    },
+    {
+      "epoch": 0.06521,
+      "grad_norm": 0.5701271891593933,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 6521
+    },
+    {
+      "epoch": 0.06522,
+      "grad_norm": 0.6156445145606995,
+      "learning_rate": 0.003,
+      "loss": 4.0695,
+      "step": 6522
+    },
+    {
+      "epoch": 0.06523,
+      "grad_norm": 0.5962189435958862,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 6523
+    },
+    {
+      "epoch": 0.06524,
+      "grad_norm": 0.5860252380371094,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 6524
+    },
+    {
+      "epoch": 0.06525,
+      "grad_norm": 0.5954103469848633,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 6525
+    },
+    {
+      "epoch": 0.06526,
+      "grad_norm": 0.5705258250236511,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 6526
+    },
+    {
+      "epoch": 0.06527,
+      "grad_norm": 0.6064693927764893,
+      "learning_rate": 0.003,
+      "loss": 4.0633,
+      "step": 6527
+    },
+    {
+      "epoch": 0.06528,
+      "grad_norm": 0.6207301020622253,
+      "learning_rate": 0.003,
+      "loss": 4.0632,
+      "step": 6528
+    },
+    {
+      "epoch": 0.06529,
+      "grad_norm": 0.5855225920677185,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 6529
+    },
+    {
+      "epoch": 0.0653,
+      "grad_norm": 0.6270435452461243,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 6530
+    },
+    {
+      "epoch": 0.06531,
+      "grad_norm": 0.6729288697242737,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 6531
+    },
+    {
+      "epoch": 0.06532,
+      "grad_norm": 0.8055482506752014,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 6532
+    },
+    {
+      "epoch": 0.06533,
+      "grad_norm": 0.8493422865867615,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 6533
+    },
+    {
+      "epoch": 0.06534,
+      "grad_norm": 1.0013922452926636,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 6534
+    },
+    {
+      "epoch": 0.06535,
+      "grad_norm": 1.09293794631958,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 6535
+    },
+    {
+      "epoch": 0.06536,
+      "grad_norm": 0.7350423336029053,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 6536
+    },
+    {
+      "epoch": 0.06537,
+      "grad_norm": 0.589043915271759,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 6537
+    },
+    {
+      "epoch": 0.06538,
+      "grad_norm": 0.6554220914840698,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 6538
+    },
+    {
+      "epoch": 0.06539,
+      "grad_norm": 0.6359267830848694,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 6539
+    },
+    {
+      "epoch": 0.0654,
+      "grad_norm": 0.6657839417457581,
+      "learning_rate": 0.003,
+      "loss": 4.089,
+      "step": 6540
+    },
+    {
+      "epoch": 0.06541,
+      "grad_norm": 0.6628341674804688,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 6541
+    },
+    {
+      "epoch": 0.06542,
+      "grad_norm": 0.6489399671554565,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 6542
+    },
+    {
+      "epoch": 0.06543,
+      "grad_norm": 0.6529117226600647,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 6543
+    },
+    {
+      "epoch": 0.06544,
+      "grad_norm": 0.6327089667320251,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 6544
+    },
+    {
+      "epoch": 0.06545,
+      "grad_norm": 0.6792625784873962,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 6545
+    },
+    {
+      "epoch": 0.06546,
+      "grad_norm": 0.6849966645240784,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 6546
+    },
+    {
+      "epoch": 0.06547,
+      "grad_norm": 0.673355758190155,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 6547
+    },
+    {
+      "epoch": 0.06548,
+      "grad_norm": 0.5450528860092163,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 6548
+    },
+    {
+      "epoch": 0.06549,
+      "grad_norm": 0.519288957118988,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 6549
+    },
+    {
+      "epoch": 0.0655,
+      "grad_norm": 0.5287633538246155,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 6550
+    },
+    {
+      "epoch": 0.06551,
+      "grad_norm": 0.56468665599823,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 6551
+    },
+    {
+      "epoch": 0.06552,
+      "grad_norm": 0.6360222101211548,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 6552
+    },
+    {
+      "epoch": 0.06553,
+      "grad_norm": 0.6669138073921204,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 6553
+    },
+    {
+      "epoch": 0.06554,
+      "grad_norm": 0.7647295594215393,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 6554
+    },
+    {
+      "epoch": 0.06555,
+      "grad_norm": 0.9226347208023071,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 6555
+    },
+    {
+      "epoch": 0.06556,
+      "grad_norm": 0.9914687871932983,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 6556
+    },
+    {
+      "epoch": 0.06557,
+      "grad_norm": 1.0823801755905151,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 6557
+    },
+    {
+      "epoch": 0.06558,
+      "grad_norm": 0.888142466545105,
+      "learning_rate": 0.003,
+      "loss": 4.0835,
+      "step": 6558
+    },
+    {
+      "epoch": 0.06559,
+      "grad_norm": 0.8824571371078491,
+      "learning_rate": 0.003,
+      "loss": 4.0645,
+      "step": 6559
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.9318628907203674,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 6560
+    },
+    {
+      "epoch": 0.06561,
+      "grad_norm": 1.0599497556686401,
+      "learning_rate": 0.003,
+      "loss": 4.0908,
+      "step": 6561
+    },
+    {
+      "epoch": 0.06562,
+      "grad_norm": 0.9844862818717957,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 6562
+    },
+    {
+      "epoch": 0.06563,
+      "grad_norm": 0.9434862732887268,
+      "learning_rate": 0.003,
+      "loss": 4.0717,
+      "step": 6563
+    },
+    {
+      "epoch": 0.06564,
+      "grad_norm": 0.8977039456367493,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 6564
+    },
+    {
+      "epoch": 0.06565,
+      "grad_norm": 1.0686181783676147,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 6565
+    },
+    {
+      "epoch": 0.06566,
+      "grad_norm": 0.949355959892273,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 6566
+    },
+    {
+      "epoch": 0.06567,
+      "grad_norm": 1.2126977443695068,
+      "learning_rate": 0.003,
+      "loss": 4.1051,
+      "step": 6567
+    },
+    {
+      "epoch": 0.06568,
+      "grad_norm": 0.8228438496589661,
+      "learning_rate": 0.003,
+      "loss": 4.101,
+      "step": 6568
+    },
+    {
+      "epoch": 0.06569,
+      "grad_norm": 0.7816138863563538,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 6569
+    },
+    {
+      "epoch": 0.0657,
+      "grad_norm": 0.7952476143836975,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 6570
+    },
+    {
+      "epoch": 0.06571,
+      "grad_norm": 0.8013068437576294,
+      "learning_rate": 0.003,
+      "loss": 4.0839,
+      "step": 6571
+    },
+    {
+      "epoch": 0.06572,
+      "grad_norm": 0.7705895304679871,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 6572
+    },
+    {
+      "epoch": 0.06573,
+      "grad_norm": 0.9079228043556213,
+      "learning_rate": 0.003,
+      "loss": 4.0773,
+      "step": 6573
+    },
+    {
+      "epoch": 0.06574,
+      "grad_norm": 0.9015453457832336,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 6574
+    },
+    {
+      "epoch": 0.06575,
+      "grad_norm": 0.8765079975128174,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 6575
+    },
+    {
+      "epoch": 0.06576,
+      "grad_norm": 0.9798871874809265,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 6576
+    },
+    {
+      "epoch": 0.06577,
+      "grad_norm": 0.9194977283477783,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 6577
+    },
+    {
+      "epoch": 0.06578,
+      "grad_norm": 0.8564810752868652,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 6578
+    },
+    {
+      "epoch": 0.06579,
+      "grad_norm": 0.9468756318092346,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 6579
+    },
+    {
+      "epoch": 0.0658,
+      "grad_norm": 1.0715759992599487,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 6580
+    },
+    {
+      "epoch": 0.06581,
+      "grad_norm": 1.0081336498260498,
+      "learning_rate": 0.003,
+      "loss": 4.0924,
+      "step": 6581
+    },
+    {
+      "epoch": 0.06582,
+      "grad_norm": 1.1618338823318481,
+      "learning_rate": 0.003,
+      "loss": 4.0976,
+      "step": 6582
+    },
+    {
+      "epoch": 0.06583,
+      "grad_norm": 0.7572551369667053,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 6583
+    },
+    {
+      "epoch": 0.06584,
+      "grad_norm": 0.699311375617981,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 6584
+    },
+    {
+      "epoch": 0.06585,
+      "grad_norm": 0.7904641628265381,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 6585
+    },
+    {
+      "epoch": 0.06586,
+      "grad_norm": 0.7364819049835205,
+      "learning_rate": 0.003,
+      "loss": 4.0745,
+      "step": 6586
+    },
+    {
+      "epoch": 0.06587,
+      "grad_norm": 0.6441203355789185,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 6587
+    },
+    {
+      "epoch": 0.06588,
+      "grad_norm": 0.6132922172546387,
+      "learning_rate": 0.003,
+      "loss": 4.0928,
+      "step": 6588
+    },
+    {
+      "epoch": 0.06589,
+      "grad_norm": 0.5162426829338074,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 6589
+    },
+    {
+      "epoch": 0.0659,
+      "grad_norm": 0.4835585057735443,
+      "learning_rate": 0.003,
+      "loss": 4.0835,
+      "step": 6590
+    },
+    {
+      "epoch": 0.06591,
+      "grad_norm": 0.4747855067253113,
+      "learning_rate": 0.003,
+      "loss": 4.1044,
+      "step": 6591
+    },
+    {
+      "epoch": 0.06592,
+      "grad_norm": 0.4445967674255371,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 6592
+    },
+    {
+      "epoch": 0.06593,
+      "grad_norm": 0.419645220041275,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 6593
+    },
+    {
+      "epoch": 0.06594,
+      "grad_norm": 0.4333605468273163,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 6594
+    },
+    {
+      "epoch": 0.06595,
+      "grad_norm": 0.4344804584980011,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 6595
+    },
+    {
+      "epoch": 0.06596,
+      "grad_norm": 0.4777514636516571,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 6596
+    },
+    {
+      "epoch": 0.06597,
+      "grad_norm": 0.4417304992675781,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 6597
+    },
+    {
+      "epoch": 0.06598,
+      "grad_norm": 0.4197903573513031,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 6598
+    },
+    {
+      "epoch": 0.06599,
+      "grad_norm": 0.447392076253891,
+      "learning_rate": 0.003,
+      "loss": 4.056,
+      "step": 6599
+    },
+    {
+      "epoch": 0.066,
+      "grad_norm": 0.43983280658721924,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 6600
+    },
+    {
+      "epoch": 0.06601,
+      "grad_norm": 0.41866225004196167,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 6601
+    },
+    {
+      "epoch": 0.06602,
+      "grad_norm": 0.42563024163246155,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 6602
+    },
+    {
+      "epoch": 0.06603,
+      "grad_norm": 0.4792834520339966,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 6603
+    },
+    {
+      "epoch": 0.06604,
+      "grad_norm": 0.5046424269676208,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 6604
+    },
+    {
+      "epoch": 0.06605,
+      "grad_norm": 0.4766392111778259,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 6605
+    },
+    {
+      "epoch": 0.06606,
+      "grad_norm": 0.4675769507884979,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 6606
+    },
+    {
+      "epoch": 0.06607,
+      "grad_norm": 0.5015403032302856,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 6607
+    },
+    {
+      "epoch": 0.06608,
+      "grad_norm": 0.5450870990753174,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 6608
+    },
+    {
+      "epoch": 0.06609,
+      "grad_norm": 0.5614660978317261,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 6609
+    },
+    {
+      "epoch": 0.0661,
+      "grad_norm": 0.6893346905708313,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 6610
+    },
+    {
+      "epoch": 0.06611,
+      "grad_norm": 1.0388861894607544,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 6611
+    },
+    {
+      "epoch": 0.06612,
+      "grad_norm": 1.5366320610046387,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 6612
+    },
+    {
+      "epoch": 0.06613,
+      "grad_norm": 0.6268520355224609,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 6613
+    },
+    {
+      "epoch": 0.06614,
+      "grad_norm": 0.7668797373771667,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 6614
+    },
+    {
+      "epoch": 0.06615,
+      "grad_norm": 0.9136865735054016,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 6615
+    },
+    {
+      "epoch": 0.06616,
+      "grad_norm": 0.9507898688316345,
+      "learning_rate": 0.003,
+      "loss": 4.0974,
+      "step": 6616
+    },
+    {
+      "epoch": 0.06617,
+      "grad_norm": 0.9568789005279541,
+      "learning_rate": 0.003,
+      "loss": 4.0775,
+      "step": 6617
+    },
+    {
+      "epoch": 0.06618,
+      "grad_norm": 0.8532642126083374,
+      "learning_rate": 0.003,
+      "loss": 4.087,
+      "step": 6618
+    },
+    {
+      "epoch": 0.06619,
+      "grad_norm": 0.8649228811264038,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 6619
+    },
+    {
+      "epoch": 0.0662,
+      "grad_norm": 0.9151912927627563,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 6620
+    },
+    {
+      "epoch": 0.06621,
+      "grad_norm": 0.9457108378410339,
+      "learning_rate": 0.003,
+      "loss": 4.0807,
+      "step": 6621
+    },
+    {
+      "epoch": 0.06622,
+      "grad_norm": 0.9539183378219604,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 6622
+    },
+    {
+      "epoch": 0.06623,
+      "grad_norm": 1.0284817218780518,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 6623
+    },
+    {
+      "epoch": 0.06624,
+      "grad_norm": 0.9398470520973206,
+      "learning_rate": 0.003,
+      "loss": 4.0848,
+      "step": 6624
+    },
+    {
+      "epoch": 0.06625,
+      "grad_norm": 0.9005268812179565,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 6625
+    },
+    {
+      "epoch": 0.06626,
+      "grad_norm": 0.9820789694786072,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 6626
+    },
+    {
+      "epoch": 0.06627,
+      "grad_norm": 1.002232551574707,
+      "learning_rate": 0.003,
+      "loss": 4.0848,
+      "step": 6627
+    },
+    {
+      "epoch": 0.06628,
+      "grad_norm": 1.0087987184524536,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 6628
+    },
+    {
+      "epoch": 0.06629,
+      "grad_norm": 0.9122500419616699,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 6629
+    },
+    {
+      "epoch": 0.0663,
+      "grad_norm": 0.9479754567146301,
+      "learning_rate": 0.003,
+      "loss": 4.1004,
+      "step": 6630
+    },
+    {
+      "epoch": 0.06631,
+      "grad_norm": 0.840352475643158,
+      "learning_rate": 0.003,
+      "loss": 4.0772,
+      "step": 6631
+    },
+    {
+      "epoch": 0.06632,
+      "grad_norm": 0.8523590564727783,
+      "learning_rate": 0.003,
+      "loss": 4.0789,
+      "step": 6632
+    },
+    {
+      "epoch": 0.06633,
+      "grad_norm": 0.9672338366508484,
+      "learning_rate": 0.003,
+      "loss": 4.1008,
+      "step": 6633
+    },
+    {
+      "epoch": 0.06634,
+      "grad_norm": 1.0756926536560059,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 6634
+    },
+    {
+      "epoch": 0.06635,
+      "grad_norm": 0.8541210293769836,
+      "learning_rate": 0.003,
+      "loss": 4.1062,
+      "step": 6635
+    },
+    {
+      "epoch": 0.06636,
+      "grad_norm": 0.7803227305412292,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 6636
+    },
+    {
+      "epoch": 0.06637,
+      "grad_norm": 0.7811536192893982,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 6637
+    },
+    {
+      "epoch": 0.06638,
+      "grad_norm": 0.7923469543457031,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 6638
+    },
+    {
+      "epoch": 0.06639,
+      "grad_norm": 0.7449505925178528,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 6639
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.8193292617797852,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6640
+    },
+    {
+      "epoch": 0.06641,
+      "grad_norm": 0.731978178024292,
+      "learning_rate": 0.003,
+      "loss": 4.076,
+      "step": 6641
+    },
+    {
+      "epoch": 0.06642,
+      "grad_norm": 0.6337401270866394,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 6642
+    },
+    {
+      "epoch": 0.06643,
+      "grad_norm": 0.633114218711853,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 6643
+    },
+    {
+      "epoch": 0.06644,
+      "grad_norm": 0.6180500388145447,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 6644
+    },
+    {
+      "epoch": 0.06645,
+      "grad_norm": 0.6271146535873413,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 6645
+    },
+    {
+      "epoch": 0.06646,
+      "grad_norm": 0.6926038265228271,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 6646
+    },
+    {
+      "epoch": 0.06647,
+      "grad_norm": 0.6170114278793335,
+      "learning_rate": 0.003,
+      "loss": 4.0756,
+      "step": 6647
+    },
+    {
+      "epoch": 0.06648,
+      "grad_norm": 0.6361090540885925,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 6648
+    },
+    {
+      "epoch": 0.06649,
+      "grad_norm": 0.5704471468925476,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 6649
+    },
+    {
+      "epoch": 0.0665,
+      "grad_norm": 0.5242335200309753,
+      "learning_rate": 0.003,
+      "loss": 4.0532,
+      "step": 6650
+    },
+    {
+      "epoch": 0.06651,
+      "grad_norm": 0.5608069896697998,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 6651
+    },
+    {
+      "epoch": 0.06652,
+      "grad_norm": 0.4782668650150299,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 6652
+    },
+    {
+      "epoch": 0.06653,
+      "grad_norm": 0.5027245879173279,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 6653
+    },
+    {
+      "epoch": 0.06654,
+      "grad_norm": 0.5486674308776855,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 6654
+    },
+    {
+      "epoch": 0.06655,
+      "grad_norm": 0.5735544562339783,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 6655
+    },
+    {
+      "epoch": 0.06656,
+      "grad_norm": 0.6207435131072998,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 6656
+    },
+    {
+      "epoch": 0.06657,
+      "grad_norm": 0.676517128944397,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 6657
+    },
+    {
+      "epoch": 0.06658,
+      "grad_norm": 0.7929190993309021,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 6658
+    },
+    {
+      "epoch": 0.06659,
+      "grad_norm": 0.9687618613243103,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 6659
+    },
+    {
+      "epoch": 0.0666,
+      "grad_norm": 1.0018961429595947,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 6660
+    },
+    {
+      "epoch": 0.06661,
+      "grad_norm": 0.6997591257095337,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 6661
+    },
+    {
+      "epoch": 0.06662,
+      "grad_norm": 0.7275426387786865,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 6662
+    },
+    {
+      "epoch": 0.06663,
+      "grad_norm": 0.895223081111908,
+      "learning_rate": 0.003,
+      "loss": 4.1078,
+      "step": 6663
+    },
+    {
+      "epoch": 0.06664,
+      "grad_norm": 0.8222059011459351,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 6664
+    },
+    {
+      "epoch": 0.06665,
+      "grad_norm": 0.6808730363845825,
+      "learning_rate": 0.003,
+      "loss": 4.0902,
+      "step": 6665
+    },
+    {
+      "epoch": 0.06666,
+      "grad_norm": 0.5744432806968689,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 6666
+    },
+    {
+      "epoch": 0.06667,
+      "grad_norm": 0.6136099100112915,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 6667
+    },
+    {
+      "epoch": 0.06668,
+      "grad_norm": 0.620444118976593,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 6668
+    },
+    {
+      "epoch": 0.06669,
+      "grad_norm": 0.5014281272888184,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 6669
+    },
+    {
+      "epoch": 0.0667,
+      "grad_norm": 0.5208943486213684,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 6670
+    },
+    {
+      "epoch": 0.06671,
+      "grad_norm": 0.6109919548034668,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 6671
+    },
+    {
+      "epoch": 0.06672,
+      "grad_norm": 0.604210376739502,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 6672
+    },
+    {
+      "epoch": 0.06673,
+      "grad_norm": 0.5179533958435059,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 6673
+    },
+    {
+      "epoch": 0.06674,
+      "grad_norm": 0.49810075759887695,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 6674
+    },
+    {
+      "epoch": 0.06675,
+      "grad_norm": 0.480003297328949,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 6675
+    },
+    {
+      "epoch": 0.06676,
+      "grad_norm": 0.4583708345890045,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 6676
+    },
+    {
+      "epoch": 0.06677,
+      "grad_norm": 0.5014526844024658,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 6677
+    },
+    {
+      "epoch": 0.06678,
+      "grad_norm": 0.6773420572280884,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 6678
+    },
+    {
+      "epoch": 0.06679,
+      "grad_norm": 0.7450854182243347,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 6679
+    },
+    {
+      "epoch": 0.0668,
+      "grad_norm": 0.7281231880187988,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 6680
+    },
+    {
+      "epoch": 0.06681,
+      "grad_norm": 0.7678727507591248,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 6681
+    },
+    {
+      "epoch": 0.06682,
+      "grad_norm": 0.8374806642532349,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 6682
+    },
+    {
+      "epoch": 0.06683,
+      "grad_norm": 0.6994584798812866,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 6683
+    },
+    {
+      "epoch": 0.06684,
+      "grad_norm": 0.8886795043945312,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 6684
+    },
+    {
+      "epoch": 0.06685,
+      "grad_norm": 0.9739590883255005,
+      "learning_rate": 0.003,
+      "loss": 4.0699,
+      "step": 6685
+    },
+    {
+      "epoch": 0.06686,
+      "grad_norm": 1.0791521072387695,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 6686
+    },
+    {
+      "epoch": 0.06687,
+      "grad_norm": 1.1976380348205566,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 6687
+    },
+    {
+      "epoch": 0.06688,
+      "grad_norm": 0.9513477683067322,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 6688
+    },
+    {
+      "epoch": 0.06689,
+      "grad_norm": 0.8864545822143555,
+      "learning_rate": 0.003,
+      "loss": 4.0971,
+      "step": 6689
+    },
+    {
+      "epoch": 0.0669,
+      "grad_norm": 0.7956053614616394,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 6690
+    },
+    {
+      "epoch": 0.06691,
+      "grad_norm": 0.8103273510932922,
+      "learning_rate": 0.003,
+      "loss": 4.0747,
+      "step": 6691
+    },
+    {
+      "epoch": 0.06692,
+      "grad_norm": 0.8394555449485779,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 6692
+    },
+    {
+      "epoch": 0.06693,
+      "grad_norm": 0.8771018385887146,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 6693
+    },
+    {
+      "epoch": 0.06694,
+      "grad_norm": 0.8685076236724854,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 6694
+    },
+    {
+      "epoch": 0.06695,
+      "grad_norm": 0.8724049925804138,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 6695
+    },
+    {
+      "epoch": 0.06696,
+      "grad_norm": 0.8524837493896484,
+      "learning_rate": 0.003,
+      "loss": 4.0728,
+      "step": 6696
+    },
+    {
+      "epoch": 0.06697,
+      "grad_norm": 0.7606315016746521,
+      "learning_rate": 0.003,
+      "loss": 4.0781,
+      "step": 6697
+    },
+    {
+      "epoch": 0.06698,
+      "grad_norm": 0.7235579490661621,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 6698
+    },
+    {
+      "epoch": 0.06699,
+      "grad_norm": 0.7316949963569641,
+      "learning_rate": 0.003,
+      "loss": 4.0604,
+      "step": 6699
+    },
+    {
+      "epoch": 0.067,
+      "grad_norm": 0.6315829753875732,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 6700
+    },
+    {
+      "epoch": 0.06701,
+      "grad_norm": 0.631214439868927,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 6701
+    },
+    {
+      "epoch": 0.06702,
+      "grad_norm": 0.6383786201477051,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 6702
+    },
+    {
+      "epoch": 0.06703,
+      "grad_norm": 0.6043766736984253,
+      "learning_rate": 0.003,
+      "loss": 4.0714,
+      "step": 6703
+    },
+    {
+      "epoch": 0.06704,
+      "grad_norm": 0.5771951675415039,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 6704
+    },
+    {
+      "epoch": 0.06705,
+      "grad_norm": 0.62209153175354,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 6705
+    },
+    {
+      "epoch": 0.06706,
+      "grad_norm": 0.7116777300834656,
+      "learning_rate": 0.003,
+      "loss": 4.0933,
+      "step": 6706
+    },
+    {
+      "epoch": 0.06707,
+      "grad_norm": 0.8620831370353699,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 6707
+    },
+    {
+      "epoch": 0.06708,
+      "grad_norm": 0.8954139351844788,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 6708
+    },
+    {
+      "epoch": 0.06709,
+      "grad_norm": 0.8915477991104126,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 6709
+    },
+    {
+      "epoch": 0.0671,
+      "grad_norm": 0.8220002055168152,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 6710
+    },
+    {
+      "epoch": 0.06711,
+      "grad_norm": 0.6552330255508423,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 6711
+    },
+    {
+      "epoch": 0.06712,
+      "grad_norm": 0.6547254323959351,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 6712
+    },
+    {
+      "epoch": 0.06713,
+      "grad_norm": 0.7258485555648804,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 6713
+    },
+    {
+      "epoch": 0.06714,
+      "grad_norm": 0.8777948617935181,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 6714
+    },
+    {
+      "epoch": 0.06715,
+      "grad_norm": 0.9278170466423035,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 6715
+    },
+    {
+      "epoch": 0.06716,
+      "grad_norm": 0.7775757312774658,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 6716
+    },
+    {
+      "epoch": 0.06717,
+      "grad_norm": 0.7415366172790527,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 6717
+    },
+    {
+      "epoch": 0.06718,
+      "grad_norm": 0.7367812991142273,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 6718
+    },
+    {
+      "epoch": 0.06719,
+      "grad_norm": 0.7651417255401611,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 6719
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.7453299760818481,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 6720
+    },
+    {
+      "epoch": 0.06721,
+      "grad_norm": 0.6883461475372314,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 6721
+    },
+    {
+      "epoch": 0.06722,
+      "grad_norm": 0.7742420434951782,
+      "learning_rate": 0.003,
+      "loss": 4.0801,
+      "step": 6722
+    },
+    {
+      "epoch": 0.06723,
+      "grad_norm": 0.7198421359062195,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 6723
+    },
+    {
+      "epoch": 0.06724,
+      "grad_norm": 0.6653247475624084,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 6724
+    },
+    {
+      "epoch": 0.06725,
+      "grad_norm": 0.6466822028160095,
+      "learning_rate": 0.003,
+      "loss": 4.0629,
+      "step": 6725
+    },
+    {
+      "epoch": 0.06726,
+      "grad_norm": 0.5745895504951477,
+      "learning_rate": 0.003,
+      "loss": 4.0876,
+      "step": 6726
+    },
+    {
+      "epoch": 0.06727,
+      "grad_norm": 0.530097246170044,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 6727
+    },
+    {
+      "epoch": 0.06728,
+      "grad_norm": 0.5702248811721802,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 6728
+    },
+    {
+      "epoch": 0.06729,
+      "grad_norm": 0.7315114140510559,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 6729
+    },
+    {
+      "epoch": 0.0673,
+      "grad_norm": 0.9155875444412231,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 6730
+    },
+    {
+      "epoch": 0.06731,
+      "grad_norm": 1.1372816562652588,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 6731
+    },
+    {
+      "epoch": 0.06732,
+      "grad_norm": 0.925939679145813,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 6732
+    },
+    {
+      "epoch": 0.06733,
+      "grad_norm": 0.6862448453903198,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 6733
+    },
+    {
+      "epoch": 0.06734,
+      "grad_norm": 0.622930109500885,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 6734
+    },
+    {
+      "epoch": 0.06735,
+      "grad_norm": 0.6733621954917908,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 6735
+    },
+    {
+      "epoch": 0.06736,
+      "grad_norm": 0.683295488357544,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 6736
+    },
+    {
+      "epoch": 0.06737,
+      "grad_norm": 0.5437440872192383,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 6737
+    },
+    {
+      "epoch": 0.06738,
+      "grad_norm": 0.5390477180480957,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 6738
+    },
+    {
+      "epoch": 0.06739,
+      "grad_norm": 0.4995149075984955,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 6739
+    },
+    {
+      "epoch": 0.0674,
+      "grad_norm": 0.5221118927001953,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 6740
+    },
+    {
+      "epoch": 0.06741,
+      "grad_norm": 0.6219124794006348,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 6741
+    },
+    {
+      "epoch": 0.06742,
+      "grad_norm": 0.7121551632881165,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 6742
+    },
+    {
+      "epoch": 0.06743,
+      "grad_norm": 0.8330824971199036,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 6743
+    },
+    {
+      "epoch": 0.06744,
+      "grad_norm": 0.7893618941307068,
+      "learning_rate": 0.003,
+      "loss": 4.0641,
+      "step": 6744
+    },
+    {
+      "epoch": 0.06745,
+      "grad_norm": 0.6968402862548828,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 6745
+    },
+    {
+      "epoch": 0.06746,
+      "grad_norm": 0.7016124725341797,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 6746
+    },
+    {
+      "epoch": 0.06747,
+      "grad_norm": 0.8388985395431519,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 6747
+    },
+    {
+      "epoch": 0.06748,
+      "grad_norm": 0.8102107644081116,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 6748
+    },
+    {
+      "epoch": 0.06749,
+      "grad_norm": 0.6827043890953064,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 6749
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.7439062595367432,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 6750
+    },
+    {
+      "epoch": 0.06751,
+      "grad_norm": 0.7768245935440063,
+      "learning_rate": 0.003,
+      "loss": 4.0791,
+      "step": 6751
+    },
+    {
+      "epoch": 0.06752,
+      "grad_norm": 0.8553226590156555,
+      "learning_rate": 0.003,
+      "loss": 4.0794,
+      "step": 6752
+    },
+    {
+      "epoch": 0.06753,
+      "grad_norm": 0.9639877080917358,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 6753
+    },
+    {
+      "epoch": 0.06754,
+      "grad_norm": 0.9903431534767151,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 6754
+    },
+    {
+      "epoch": 0.06755,
+      "grad_norm": 0.9414642453193665,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 6755
+    },
+    {
+      "epoch": 0.06756,
+      "grad_norm": 1.0175652503967285,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 6756
+    },
+    {
+      "epoch": 0.06757,
+      "grad_norm": 1.0221658945083618,
+      "learning_rate": 0.003,
+      "loss": 4.0815,
+      "step": 6757
+    },
+    {
+      "epoch": 0.06758,
+      "grad_norm": 0.9344669580459595,
+      "learning_rate": 0.003,
+      "loss": 4.0695,
+      "step": 6758
+    },
+    {
+      "epoch": 0.06759,
+      "grad_norm": 0.824642539024353,
+      "learning_rate": 0.003,
+      "loss": 4.0971,
+      "step": 6759
+    },
+    {
+      "epoch": 0.0676,
+      "grad_norm": 0.8851695656776428,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 6760
+    },
+    {
+      "epoch": 0.06761,
+      "grad_norm": 0.8390560150146484,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 6761
+    },
+    {
+      "epoch": 0.06762,
+      "grad_norm": 0.8119285702705383,
+      "learning_rate": 0.003,
+      "loss": 4.0671,
+      "step": 6762
+    },
+    {
+      "epoch": 0.06763,
+      "grad_norm": 0.8722089529037476,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 6763
+    },
+    {
+      "epoch": 0.06764,
+      "grad_norm": 0.8680243492126465,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 6764
+    },
+    {
+      "epoch": 0.06765,
+      "grad_norm": 0.7568630576133728,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 6765
+    },
+    {
+      "epoch": 0.06766,
+      "grad_norm": 0.6181029677391052,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 6766
+    },
+    {
+      "epoch": 0.06767,
+      "grad_norm": 0.6267876029014587,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 6767
+    },
+    {
+      "epoch": 0.06768,
+      "grad_norm": 0.6331607103347778,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 6768
+    },
+    {
+      "epoch": 0.06769,
+      "grad_norm": 0.5892623662948608,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 6769
+    },
+    {
+      "epoch": 0.0677,
+      "grad_norm": 0.6032274961471558,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 6770
+    },
+    {
+      "epoch": 0.06771,
+      "grad_norm": 0.6521531343460083,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 6771
+    },
+    {
+      "epoch": 0.06772,
+      "grad_norm": 0.5568339824676514,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 6772
+    },
+    {
+      "epoch": 0.06773,
+      "grad_norm": 0.4971171021461487,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 6773
+    },
+    {
+      "epoch": 0.06774,
+      "grad_norm": 0.47084400057792664,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 6774
+    },
+    {
+      "epoch": 0.06775,
+      "grad_norm": 0.48115456104278564,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 6775
+    },
+    {
+      "epoch": 0.06776,
+      "grad_norm": 0.4709925055503845,
+      "learning_rate": 0.003,
+      "loss": 4.0776,
+      "step": 6776
+    },
+    {
+      "epoch": 0.06777,
+      "grad_norm": 0.5252442359924316,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 6777
+    },
+    {
+      "epoch": 0.06778,
+      "grad_norm": 0.503268837928772,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 6778
+    },
+    {
+      "epoch": 0.06779,
+      "grad_norm": 0.5064627528190613,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 6779
+    },
+    {
+      "epoch": 0.0678,
+      "grad_norm": 0.45777052640914917,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 6780
+    },
+    {
+      "epoch": 0.06781,
+      "grad_norm": 0.5485308766365051,
+      "learning_rate": 0.003,
+      "loss": 4.0768,
+      "step": 6781
+    },
+    {
+      "epoch": 0.06782,
+      "grad_norm": 0.7215536236763,
+      "learning_rate": 0.003,
+      "loss": 4.0566,
+      "step": 6782
+    },
+    {
+      "epoch": 0.06783,
+      "grad_norm": 0.9196093082427979,
+      "learning_rate": 0.003,
+      "loss": 4.0691,
+      "step": 6783
+    },
+    {
+      "epoch": 0.06784,
+      "grad_norm": 1.2304813861846924,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 6784
+    },
+    {
+      "epoch": 0.06785,
+      "grad_norm": 0.6971254348754883,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 6785
+    },
+    {
+      "epoch": 0.06786,
+      "grad_norm": 0.6344663500785828,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 6786
+    },
+    {
+      "epoch": 0.06787,
+      "grad_norm": 0.7827377915382385,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 6787
+    },
+    {
+      "epoch": 0.06788,
+      "grad_norm": 0.8472016453742981,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 6788
+    },
+    {
+      "epoch": 0.06789,
+      "grad_norm": 0.8765668869018555,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 6789
+    },
+    {
+      "epoch": 0.0679,
+      "grad_norm": 0.8723348379135132,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 6790
+    },
+    {
+      "epoch": 0.06791,
+      "grad_norm": 0.7648828625679016,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 6791
+    },
+    {
+      "epoch": 0.06792,
+      "grad_norm": 0.7784323692321777,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 6792
+    },
+    {
+      "epoch": 0.06793,
+      "grad_norm": 0.7629876732826233,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 6793
+    },
+    {
+      "epoch": 0.06794,
+      "grad_norm": 0.7403256297111511,
+      "learning_rate": 0.003,
+      "loss": 4.078,
+      "step": 6794
+    },
+    {
+      "epoch": 0.06795,
+      "grad_norm": 0.7046683430671692,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 6795
+    },
+    {
+      "epoch": 0.06796,
+      "grad_norm": 0.7108621597290039,
+      "learning_rate": 0.003,
+      "loss": 4.0666,
+      "step": 6796
+    },
+    {
+      "epoch": 0.06797,
+      "grad_norm": 0.7049769759178162,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 6797
+    },
+    {
+      "epoch": 0.06798,
+      "grad_norm": 0.7718124389648438,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 6798
+    },
+    {
+      "epoch": 0.06799,
+      "grad_norm": 0.8259626030921936,
+      "learning_rate": 0.003,
+      "loss": 4.0891,
+      "step": 6799
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.7455199956893921,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 6800
+    },
+    {
+      "epoch": 0.06801,
+      "grad_norm": 0.6687690615653992,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 6801
+    },
+    {
+      "epoch": 0.06802,
+      "grad_norm": 0.663851797580719,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 6802
+    },
+    {
+      "epoch": 0.06803,
+      "grad_norm": 0.726151168346405,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 6803
+    },
+    {
+      "epoch": 0.06804,
+      "grad_norm": 0.7056896686553955,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 6804
+    },
+    {
+      "epoch": 0.06805,
+      "grad_norm": 0.7536913156509399,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 6805
+    },
+    {
+      "epoch": 0.06806,
+      "grad_norm": 1.025210976600647,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 6806
+    },
+    {
+      "epoch": 0.06807,
+      "grad_norm": 1.3468793630599976,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 6807
+    },
+    {
+      "epoch": 0.06808,
+      "grad_norm": 0.5080417990684509,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 6808
+    },
+    {
+      "epoch": 0.06809,
+      "grad_norm": 0.7991033792495728,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 6809
+    },
+    {
+      "epoch": 0.0681,
+      "grad_norm": 0.9151610136032104,
+      "learning_rate": 0.003,
+      "loss": 4.0588,
+      "step": 6810
+    },
+    {
+      "epoch": 0.06811,
+      "grad_norm": 0.8046905994415283,
+      "learning_rate": 0.003,
+      "loss": 4.0852,
+      "step": 6811
+    },
+    {
+      "epoch": 0.06812,
+      "grad_norm": 0.7898740172386169,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 6812
+    },
+    {
+      "epoch": 0.06813,
+      "grad_norm": 0.7531576752662659,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 6813
+    },
+    {
+      "epoch": 0.06814,
+      "grad_norm": 0.7203744649887085,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 6814
+    },
+    {
+      "epoch": 0.06815,
+      "grad_norm": 0.793602705001831,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 6815
+    },
+    {
+      "epoch": 0.06816,
+      "grad_norm": 0.7131784558296204,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 6816
+    },
+    {
+      "epoch": 0.06817,
+      "grad_norm": 0.6014184951782227,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 6817
+    },
+    {
+      "epoch": 0.06818,
+      "grad_norm": 0.5902239084243774,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 6818
+    },
+    {
+      "epoch": 0.06819,
+      "grad_norm": 0.6026824712753296,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 6819
+    },
+    {
+      "epoch": 0.0682,
+      "grad_norm": 0.6438929438591003,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 6820
+    },
+    {
+      "epoch": 0.06821,
+      "grad_norm": 0.6496386528015137,
+      "learning_rate": 0.003,
+      "loss": 4.0739,
+      "step": 6821
+    },
+    {
+      "epoch": 0.06822,
+      "grad_norm": 0.6901900768280029,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 6822
+    },
+    {
+      "epoch": 0.06823,
+      "grad_norm": 0.7220962047576904,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 6823
+    },
+    {
+      "epoch": 0.06824,
+      "grad_norm": 0.7161133289337158,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 6824
+    },
+    {
+      "epoch": 0.06825,
+      "grad_norm": 0.7654991149902344,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 6825
+    },
+    {
+      "epoch": 0.06826,
+      "grad_norm": 0.7452932000160217,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 6826
+    },
+    {
+      "epoch": 0.06827,
+      "grad_norm": 0.7094281911849976,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 6827
+    },
+    {
+      "epoch": 0.06828,
+      "grad_norm": 0.8043527603149414,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 6828
+    },
+    {
+      "epoch": 0.06829,
+      "grad_norm": 0.7790755033493042,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 6829
+    },
+    {
+      "epoch": 0.0683,
+      "grad_norm": 0.9166526794433594,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 6830
+    },
+    {
+      "epoch": 0.06831,
+      "grad_norm": 1.1532281637191772,
+      "learning_rate": 0.003,
+      "loss": 4.091,
+      "step": 6831
+    },
+    {
+      "epoch": 0.06832,
+      "grad_norm": 0.878596842288971,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 6832
+    },
+    {
+      "epoch": 0.06833,
+      "grad_norm": 0.6871094107627869,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 6833
+    },
+    {
+      "epoch": 0.06834,
+      "grad_norm": 0.5933970212936401,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 6834
+    },
+    {
+      "epoch": 0.06835,
+      "grad_norm": 0.635981023311615,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 6835
+    },
+    {
+      "epoch": 0.06836,
+      "grad_norm": 0.6953107714653015,
+      "learning_rate": 0.003,
+      "loss": 4.0889,
+      "step": 6836
+    },
+    {
+      "epoch": 0.06837,
+      "grad_norm": 0.7208017706871033,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 6837
+    },
+    {
+      "epoch": 0.06838,
+      "grad_norm": 0.6934213638305664,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 6838
+    },
+    {
+      "epoch": 0.06839,
+      "grad_norm": 0.6325958371162415,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 6839
+    },
+    {
+      "epoch": 0.0684,
+      "grad_norm": 0.592176616191864,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 6840
+    },
+    {
+      "epoch": 0.06841,
+      "grad_norm": 0.5508002042770386,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 6841
+    },
+    {
+      "epoch": 0.06842,
+      "grad_norm": 0.5809416770935059,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 6842
+    },
+    {
+      "epoch": 0.06843,
+      "grad_norm": 0.5343518257141113,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 6843
+    },
+    {
+      "epoch": 0.06844,
+      "grad_norm": 0.6117638349533081,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 6844
+    },
+    {
+      "epoch": 0.06845,
+      "grad_norm": 0.6104193329811096,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 6845
+    },
+    {
+      "epoch": 0.06846,
+      "grad_norm": 0.6597003936767578,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 6846
+    },
+    {
+      "epoch": 0.06847,
+      "grad_norm": 0.907615065574646,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 6847
+    },
+    {
+      "epoch": 0.06848,
+      "grad_norm": 1.0650358200073242,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 6848
+    },
+    {
+      "epoch": 0.06849,
+      "grad_norm": 0.9667098522186279,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 6849
+    },
+    {
+      "epoch": 0.0685,
+      "grad_norm": 0.7904394865036011,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 6850
+    },
+    {
+      "epoch": 0.06851,
+      "grad_norm": 0.6749697327613831,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 6851
+    },
+    {
+      "epoch": 0.06852,
+      "grad_norm": 0.7591043710708618,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 6852
+    },
+    {
+      "epoch": 0.06853,
+      "grad_norm": 0.723334550857544,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 6853
+    },
+    {
+      "epoch": 0.06854,
+      "grad_norm": 0.6678939461708069,
+      "learning_rate": 0.003,
+      "loss": 4.0669,
+      "step": 6854
+    },
+    {
+      "epoch": 0.06855,
+      "grad_norm": 0.735528290271759,
+      "learning_rate": 0.003,
+      "loss": 4.0798,
+      "step": 6855
+    },
+    {
+      "epoch": 0.06856,
+      "grad_norm": 0.802253246307373,
+      "learning_rate": 0.003,
+      "loss": 4.0791,
+      "step": 6856
+    },
+    {
+      "epoch": 0.06857,
+      "grad_norm": 0.872907817363739,
+      "learning_rate": 0.003,
+      "loss": 4.0666,
+      "step": 6857
+    },
+    {
+      "epoch": 0.06858,
+      "grad_norm": 0.8424869775772095,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 6858
+    },
+    {
+      "epoch": 0.06859,
+      "grad_norm": 0.8607826232910156,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 6859
+    },
+    {
+      "epoch": 0.0686,
+      "grad_norm": 0.8124536871910095,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 6860
+    },
+    {
+      "epoch": 0.06861,
+      "grad_norm": 0.7242681384086609,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 6861
+    },
+    {
+      "epoch": 0.06862,
+      "grad_norm": 0.6859086155891418,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 6862
+    },
+    {
+      "epoch": 0.06863,
+      "grad_norm": 0.6288987994194031,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 6863
+    },
+    {
+      "epoch": 0.06864,
+      "grad_norm": 0.589547336101532,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 6864
+    },
+    {
+      "epoch": 0.06865,
+      "grad_norm": 0.5942288637161255,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 6865
+    },
+    {
+      "epoch": 0.06866,
+      "grad_norm": 0.6688255071640015,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6866
+    },
+    {
+      "epoch": 0.06867,
+      "grad_norm": 0.7595317363739014,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 6867
+    },
+    {
+      "epoch": 0.06868,
+      "grad_norm": 1.0023117065429688,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 6868
+    },
+    {
+      "epoch": 0.06869,
+      "grad_norm": 1.171606421470642,
+      "learning_rate": 0.003,
+      "loss": 4.0746,
+      "step": 6869
+    },
+    {
+      "epoch": 0.0687,
+      "grad_norm": 0.6550358533859253,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 6870
+    },
+    {
+      "epoch": 0.06871,
+      "grad_norm": 0.5838268995285034,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 6871
+    },
+    {
+      "epoch": 0.06872,
+      "grad_norm": 0.6378373503684998,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 6872
+    },
+    {
+      "epoch": 0.06873,
+      "grad_norm": 0.7861857414245605,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 6873
+    },
+    {
+      "epoch": 0.06874,
+      "grad_norm": 0.7961738109588623,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6874
+    },
+    {
+      "epoch": 0.06875,
+      "grad_norm": 0.7601486444473267,
+      "learning_rate": 0.003,
+      "loss": 4.0645,
+      "step": 6875
+    },
+    {
+      "epoch": 0.06876,
+      "grad_norm": 0.7805059552192688,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 6876
+    },
+    {
+      "epoch": 0.06877,
+      "grad_norm": 0.75244140625,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 6877
+    },
+    {
+      "epoch": 0.06878,
+      "grad_norm": 0.6880764365196228,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 6878
+    },
+    {
+      "epoch": 0.06879,
+      "grad_norm": 0.6558811664581299,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 6879
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.6576889157295227,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 6880
+    },
+    {
+      "epoch": 0.06881,
+      "grad_norm": 0.6587197780609131,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 6881
+    },
+    {
+      "epoch": 0.06882,
+      "grad_norm": 0.6243919730186462,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 6882
+    },
+    {
+      "epoch": 0.06883,
+      "grad_norm": 0.6536480188369751,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 6883
+    },
+    {
+      "epoch": 0.06884,
+      "grad_norm": 0.6172704696655273,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 6884
+    },
+    {
+      "epoch": 0.06885,
+      "grad_norm": 0.6676458716392517,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 6885
+    },
+    {
+      "epoch": 0.06886,
+      "grad_norm": 0.761619508266449,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 6886
+    },
+    {
+      "epoch": 0.06887,
+      "grad_norm": 0.8133268356323242,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6887
+    },
+    {
+      "epoch": 0.06888,
+      "grad_norm": 0.9232909679412842,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 6888
+    },
+    {
+      "epoch": 0.06889,
+      "grad_norm": 0.880495011806488,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 6889
+    },
+    {
+      "epoch": 0.0689,
+      "grad_norm": 0.8015482425689697,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 6890
+    },
+    {
+      "epoch": 0.06891,
+      "grad_norm": 0.7700675129890442,
+      "learning_rate": 0.003,
+      "loss": 4.0652,
+      "step": 6891
+    },
+    {
+      "epoch": 0.06892,
+      "grad_norm": 0.8113595843315125,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 6892
+    },
+    {
+      "epoch": 0.06893,
+      "grad_norm": 0.7549485564231873,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 6893
+    },
+    {
+      "epoch": 0.06894,
+      "grad_norm": 0.8220582604408264,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 6894
+    },
+    {
+      "epoch": 0.06895,
+      "grad_norm": 0.8129627704620361,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 6895
+    },
+    {
+      "epoch": 0.06896,
+      "grad_norm": 0.6896864771842957,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 6896
+    },
+    {
+      "epoch": 0.06897,
+      "grad_norm": 0.6018128991127014,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 6897
+    },
+    {
+      "epoch": 0.06898,
+      "grad_norm": 0.6657570004463196,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 6898
+    },
+    {
+      "epoch": 0.06899,
+      "grad_norm": 0.7586705088615417,
+      "learning_rate": 0.003,
+      "loss": 4.0814,
+      "step": 6899
+    },
+    {
+      "epoch": 0.069,
+      "grad_norm": 0.6607414484024048,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 6900
+    },
+    {
+      "epoch": 0.06901,
+      "grad_norm": 0.7915401458740234,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 6901
+    },
+    {
+      "epoch": 0.06902,
+      "grad_norm": 0.8700487017631531,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 6902
+    },
+    {
+      "epoch": 0.06903,
+      "grad_norm": 1.0735187530517578,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 6903
+    },
+    {
+      "epoch": 0.06904,
+      "grad_norm": 0.9046341776847839,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 6904
+    },
+    {
+      "epoch": 0.06905,
+      "grad_norm": 0.7316793203353882,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 6905
+    },
+    {
+      "epoch": 0.06906,
+      "grad_norm": 0.5604428648948669,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 6906
+    },
+    {
+      "epoch": 0.06907,
+      "grad_norm": 0.564428448677063,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 6907
+    },
+    {
+      "epoch": 0.06908,
+      "grad_norm": 0.6861115097999573,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 6908
+    },
+    {
+      "epoch": 0.06909,
+      "grad_norm": 0.7282766699790955,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 6909
+    },
+    {
+      "epoch": 0.0691,
+      "grad_norm": 0.6792446970939636,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 6910
+    },
+    {
+      "epoch": 0.06911,
+      "grad_norm": 0.6289098858833313,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 6911
+    },
+    {
+      "epoch": 0.06912,
+      "grad_norm": 0.5979729890823364,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 6912
+    },
+    {
+      "epoch": 0.06913,
+      "grad_norm": 0.7078564167022705,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 6913
+    },
+    {
+      "epoch": 0.06914,
+      "grad_norm": 0.8100939393043518,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 6914
+    },
+    {
+      "epoch": 0.06915,
+      "grad_norm": 0.893566906452179,
+      "learning_rate": 0.003,
+      "loss": 4.083,
+      "step": 6915
+    },
+    {
+      "epoch": 0.06916,
+      "grad_norm": 0.9428468942642212,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 6916
+    },
+    {
+      "epoch": 0.06917,
+      "grad_norm": 0.8902822732925415,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 6917
+    },
+    {
+      "epoch": 0.06918,
+      "grad_norm": 0.9391528367996216,
+      "learning_rate": 0.003,
+      "loss": 4.0802,
+      "step": 6918
+    },
+    {
+      "epoch": 0.06919,
+      "grad_norm": 0.8996408581733704,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 6919
+    },
+    {
+      "epoch": 0.0692,
+      "grad_norm": 0.8599734902381897,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6920
+    },
+    {
+      "epoch": 0.06921,
+      "grad_norm": 0.8355299234390259,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 6921
+    },
+    {
+      "epoch": 0.06922,
+      "grad_norm": 0.812288224697113,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 6922
+    },
+    {
+      "epoch": 0.06923,
+      "grad_norm": 0.9053574204444885,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 6923
+    },
+    {
+      "epoch": 0.06924,
+      "grad_norm": 1.0341438055038452,
+      "learning_rate": 0.003,
+      "loss": 4.0953,
+      "step": 6924
+    },
+    {
+      "epoch": 0.06925,
+      "grad_norm": 1.2150890827178955,
+      "learning_rate": 0.003,
+      "loss": 4.095,
+      "step": 6925
+    },
+    {
+      "epoch": 0.06926,
+      "grad_norm": 0.722688615322113,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 6926
+    },
+    {
+      "epoch": 0.06927,
+      "grad_norm": 0.6441181302070618,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 6927
+    },
+    {
+      "epoch": 0.06928,
+      "grad_norm": 0.6667382121086121,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 6928
+    },
+    {
+      "epoch": 0.06929,
+      "grad_norm": 0.6884969472885132,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 6929
+    },
+    {
+      "epoch": 0.0693,
+      "grad_norm": 0.7797659039497375,
+      "learning_rate": 0.003,
+      "loss": 4.1051,
+      "step": 6930
+    },
+    {
+      "epoch": 0.06931,
+      "grad_norm": 0.7904036641120911,
+      "learning_rate": 0.003,
+      "loss": 4.0705,
+      "step": 6931
+    },
+    {
+      "epoch": 0.06932,
+      "grad_norm": 0.7277095317840576,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 6932
+    },
+    {
+      "epoch": 0.06933,
+      "grad_norm": 0.6969214677810669,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 6933
+    },
+    {
+      "epoch": 0.06934,
+      "grad_norm": 0.7735503911972046,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 6934
+    },
+    {
+      "epoch": 0.06935,
+      "grad_norm": 0.8755772113800049,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 6935
+    },
+    {
+      "epoch": 0.06936,
+      "grad_norm": 0.9457864761352539,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 6936
+    },
+    {
+      "epoch": 0.06937,
+      "grad_norm": 1.0101808309555054,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 6937
+    },
+    {
+      "epoch": 0.06938,
+      "grad_norm": 0.7283269762992859,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 6938
+    },
+    {
+      "epoch": 0.06939,
+      "grad_norm": 0.5417292714118958,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 6939
+    },
+    {
+      "epoch": 0.0694,
+      "grad_norm": 0.5455449819564819,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 6940
+    },
+    {
+      "epoch": 0.06941,
+      "grad_norm": 0.603982150554657,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 6941
+    },
+    {
+      "epoch": 0.06942,
+      "grad_norm": 0.6116390228271484,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 6942
+    },
+    {
+      "epoch": 0.06943,
+      "grad_norm": 0.5719929337501526,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 6943
+    },
+    {
+      "epoch": 0.06944,
+      "grad_norm": 0.554226279258728,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 6944
+    },
+    {
+      "epoch": 0.06945,
+      "grad_norm": 0.6480233669281006,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 6945
+    },
+    {
+      "epoch": 0.06946,
+      "grad_norm": 0.6147804856300354,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 6946
+    },
+    {
+      "epoch": 0.06947,
+      "grad_norm": 0.6045358777046204,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 6947
+    },
+    {
+      "epoch": 0.06948,
+      "grad_norm": 0.671883225440979,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 6948
+    },
+    {
+      "epoch": 0.06949,
+      "grad_norm": 0.7604190707206726,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 6949
+    },
+    {
+      "epoch": 0.0695,
+      "grad_norm": 0.8634862303733826,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 6950
+    },
+    {
+      "epoch": 0.06951,
+      "grad_norm": 0.8246872425079346,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 6951
+    },
+    {
+      "epoch": 0.06952,
+      "grad_norm": 0.7395027875900269,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 6952
+    },
+    {
+      "epoch": 0.06953,
+      "grad_norm": 0.7343699336051941,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 6953
+    },
+    {
+      "epoch": 0.06954,
+      "grad_norm": 0.7108721733093262,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 6954
+    },
+    {
+      "epoch": 0.06955,
+      "grad_norm": 0.6825199723243713,
+      "learning_rate": 0.003,
+      "loss": 4.0611,
+      "step": 6955
+    },
+    {
+      "epoch": 0.06956,
+      "grad_norm": 0.7615572810173035,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 6956
+    },
+    {
+      "epoch": 0.06957,
+      "grad_norm": 0.8274344801902771,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 6957
+    },
+    {
+      "epoch": 0.06958,
+      "grad_norm": 0.8784217238426208,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 6958
+    },
+    {
+      "epoch": 0.06959,
+      "grad_norm": 0.7803829908370972,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 6959
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.8395748138427734,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 6960
+    },
+    {
+      "epoch": 0.06961,
+      "grad_norm": 0.7431996464729309,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 6961
+    },
+    {
+      "epoch": 0.06962,
+      "grad_norm": 0.7766883373260498,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 6962
+    },
+    {
+      "epoch": 0.06963,
+      "grad_norm": 0.8304595947265625,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 6963
+    },
+    {
+      "epoch": 0.06964,
+      "grad_norm": 0.8011763095855713,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 6964
+    },
+    {
+      "epoch": 0.06965,
+      "grad_norm": 0.6499873995780945,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 6965
+    },
+    {
+      "epoch": 0.06966,
+      "grad_norm": 0.5533931851387024,
+      "learning_rate": 0.003,
+      "loss": 4.0613,
+      "step": 6966
+    },
+    {
+      "epoch": 0.06967,
+      "grad_norm": 0.6004241704940796,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 6967
+    },
+    {
+      "epoch": 0.06968,
+      "grad_norm": 0.6045736074447632,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 6968
+    },
+    {
+      "epoch": 0.06969,
+      "grad_norm": 0.6887038946151733,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 6969
+    },
+    {
+      "epoch": 0.0697,
+      "grad_norm": 0.5907984972000122,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 6970
+    },
+    {
+      "epoch": 0.06971,
+      "grad_norm": 0.5247988700866699,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 6971
+    },
+    {
+      "epoch": 0.06972,
+      "grad_norm": 0.5534252524375916,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 6972
+    },
+    {
+      "epoch": 0.06973,
+      "grad_norm": 0.6565967798233032,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 6973
+    },
+    {
+      "epoch": 0.06974,
+      "grad_norm": 0.7747914791107178,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 6974
+    },
+    {
+      "epoch": 0.06975,
+      "grad_norm": 0.8217228055000305,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 6975
+    },
+    {
+      "epoch": 0.06976,
+      "grad_norm": 0.9226599931716919,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 6976
+    },
+    {
+      "epoch": 0.06977,
+      "grad_norm": 1.0128273963928223,
+      "learning_rate": 0.003,
+      "loss": 4.0969,
+      "step": 6977
+    },
+    {
+      "epoch": 0.06978,
+      "grad_norm": 1.0614304542541504,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 6978
+    },
+    {
+      "epoch": 0.06979,
+      "grad_norm": 0.8185182809829712,
+      "learning_rate": 0.003,
+      "loss": 4.0762,
+      "step": 6979
+    },
+    {
+      "epoch": 0.0698,
+      "grad_norm": 0.7523839473724365,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 6980
+    },
+    {
+      "epoch": 0.06981,
+      "grad_norm": 0.809232234954834,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 6981
+    },
+    {
+      "epoch": 0.06982,
+      "grad_norm": 0.8641067147254944,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 6982
+    },
+    {
+      "epoch": 0.06983,
+      "grad_norm": 0.8018274903297424,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 6983
+    },
+    {
+      "epoch": 0.06984,
+      "grad_norm": 0.8893591165542603,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 6984
+    },
+    {
+      "epoch": 0.06985,
+      "grad_norm": 1.0270733833312988,
+      "learning_rate": 0.003,
+      "loss": 4.0893,
+      "step": 6985
+    },
+    {
+      "epoch": 0.06986,
+      "grad_norm": 0.9915245175361633,
+      "learning_rate": 0.003,
+      "loss": 4.0813,
+      "step": 6986
+    },
+    {
+      "epoch": 0.06987,
+      "grad_norm": 0.9925371408462524,
+      "learning_rate": 0.003,
+      "loss": 4.0807,
+      "step": 6987
+    },
+    {
+      "epoch": 0.06988,
+      "grad_norm": 1.083008885383606,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 6988
+    },
+    {
+      "epoch": 0.06989,
+      "grad_norm": 1.013920783996582,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 6989
+    },
+    {
+      "epoch": 0.0699,
+      "grad_norm": 0.8849502205848694,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 6990
+    },
+    {
+      "epoch": 0.06991,
+      "grad_norm": 0.7566224932670593,
+      "learning_rate": 0.003,
+      "loss": 4.0758,
+      "step": 6991
+    },
+    {
+      "epoch": 0.06992,
+      "grad_norm": 0.6404983997344971,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 6992
+    },
+    {
+      "epoch": 0.06993,
+      "grad_norm": 0.5881397128105164,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 6993
+    },
+    {
+      "epoch": 0.06994,
+      "grad_norm": 0.6447122097015381,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 6994
+    },
+    {
+      "epoch": 0.06995,
+      "grad_norm": 0.5685395002365112,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 6995
+    },
+    {
+      "epoch": 0.06996,
+      "grad_norm": 0.5681016445159912,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 6996
+    },
+    {
+      "epoch": 0.06997,
+      "grad_norm": 0.6152894496917725,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 6997
+    },
+    {
+      "epoch": 0.06998,
+      "grad_norm": 0.6342592835426331,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 6998
+    },
+    {
+      "epoch": 0.06999,
+      "grad_norm": 0.6017648577690125,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 6999
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.6009829640388489,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 7000
+    },
+    {
+      "epoch": 0.07001,
+      "grad_norm": 0.5869970321655273,
+      "learning_rate": 0.003,
+      "loss": 4.0919,
+      "step": 7001
+    },
+    {
+      "epoch": 0.07002,
+      "grad_norm": 0.5607997179031372,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 7002
+    },
+    {
+      "epoch": 0.07003,
+      "grad_norm": 0.5857222080230713,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 7003
+    },
+    {
+      "epoch": 0.07004,
+      "grad_norm": 0.6829317212104797,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 7004
+    },
+    {
+      "epoch": 0.07005,
+      "grad_norm": 0.7422044277191162,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 7005
+    },
+    {
+      "epoch": 0.07006,
+      "grad_norm": 0.7209708094596863,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 7006
+    },
+    {
+      "epoch": 0.07007,
+      "grad_norm": 0.6910662055015564,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 7007
+    },
+    {
+      "epoch": 0.07008,
+      "grad_norm": 0.724881649017334,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 7008
+    },
+    {
+      "epoch": 0.07009,
+      "grad_norm": 0.7626492977142334,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 7009
+    },
+    {
+      "epoch": 0.0701,
+      "grad_norm": 0.7107502222061157,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 7010
+    },
+    {
+      "epoch": 0.07011,
+      "grad_norm": 0.6228822469711304,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 7011
+    },
+    {
+      "epoch": 0.07012,
+      "grad_norm": 0.6056353449821472,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 7012
+    },
+    {
+      "epoch": 0.07013,
+      "grad_norm": 0.5881388783454895,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 7013
+    },
+    {
+      "epoch": 0.07014,
+      "grad_norm": 0.5778508186340332,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 7014
+    },
+    {
+      "epoch": 0.07015,
+      "grad_norm": 0.6298239231109619,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 7015
+    },
+    {
+      "epoch": 0.07016,
+      "grad_norm": 0.6628670692443848,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 7016
+    },
+    {
+      "epoch": 0.07017,
+      "grad_norm": 0.7037574052810669,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 7017
+    },
+    {
+      "epoch": 0.07018,
+      "grad_norm": 0.7001218199729919,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 7018
+    },
+    {
+      "epoch": 0.07019,
+      "grad_norm": 0.7447029948234558,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 7019
+    },
+    {
+      "epoch": 0.0702,
+      "grad_norm": 0.7939612865447998,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 7020
+    },
+    {
+      "epoch": 0.07021,
+      "grad_norm": 1.0586154460906982,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 7021
+    },
+    {
+      "epoch": 0.07022,
+      "grad_norm": 0.9932177662849426,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 7022
+    },
+    {
+      "epoch": 0.07023,
+      "grad_norm": 0.7585009932518005,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 7023
+    },
+    {
+      "epoch": 0.07024,
+      "grad_norm": 0.583694338798523,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 7024
+    },
+    {
+      "epoch": 0.07025,
+      "grad_norm": 0.5445935130119324,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 7025
+    },
+    {
+      "epoch": 0.07026,
+      "grad_norm": 0.5929006934165955,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 7026
+    },
+    {
+      "epoch": 0.07027,
+      "grad_norm": 0.6477934122085571,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 7027
+    },
+    {
+      "epoch": 0.07028,
+      "grad_norm": 0.8614700436592102,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 7028
+    },
+    {
+      "epoch": 0.07029,
+      "grad_norm": 1.0163159370422363,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 7029
+    },
+    {
+      "epoch": 0.0703,
+      "grad_norm": 0.9567646980285645,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 7030
+    },
+    {
+      "epoch": 0.07031,
+      "grad_norm": 0.8252813816070557,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 7031
+    },
+    {
+      "epoch": 0.07032,
+      "grad_norm": 0.8402538895606995,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 7032
+    },
+    {
+      "epoch": 0.07033,
+      "grad_norm": 0.9886388182640076,
+      "learning_rate": 0.003,
+      "loss": 4.0633,
+      "step": 7033
+    },
+    {
+      "epoch": 0.07034,
+      "grad_norm": 0.9284382462501526,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 7034
+    },
+    {
+      "epoch": 0.07035,
+      "grad_norm": 0.8074457049369812,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 7035
+    },
+    {
+      "epoch": 0.07036,
+      "grad_norm": 0.8691509962081909,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 7036
+    },
+    {
+      "epoch": 0.07037,
+      "grad_norm": 0.9332064986228943,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 7037
+    },
+    {
+      "epoch": 0.07038,
+      "grad_norm": 0.9712073802947998,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 7038
+    },
+    {
+      "epoch": 0.07039,
+      "grad_norm": 0.8838890194892883,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 7039
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.9608893394470215,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 7040
+    },
+    {
+      "epoch": 0.07041,
+      "grad_norm": 0.7875691056251526,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 7041
+    },
+    {
+      "epoch": 0.07042,
+      "grad_norm": 0.6502549648284912,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 7042
+    },
+    {
+      "epoch": 0.07043,
+      "grad_norm": 0.6589664816856384,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 7043
+    },
+    {
+      "epoch": 0.07044,
+      "grad_norm": 0.7496007084846497,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 7044
+    },
+    {
+      "epoch": 0.07045,
+      "grad_norm": 0.6947444677352905,
+      "learning_rate": 0.003,
+      "loss": 4.0588,
+      "step": 7045
+    },
+    {
+      "epoch": 0.07046,
+      "grad_norm": 0.619331955909729,
+      "learning_rate": 0.003,
+      "loss": 4.0632,
+      "step": 7046
+    },
+    {
+      "epoch": 0.07047,
+      "grad_norm": 0.6577057838439941,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 7047
+    },
+    {
+      "epoch": 0.07048,
+      "grad_norm": 0.727730393409729,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 7048
+    },
+    {
+      "epoch": 0.07049,
+      "grad_norm": 0.8019500374794006,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 7049
+    },
+    {
+      "epoch": 0.0705,
+      "grad_norm": 0.8129701614379883,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 7050
+    },
+    {
+      "epoch": 0.07051,
+      "grad_norm": 0.9163272976875305,
+      "learning_rate": 0.003,
+      "loss": 4.0795,
+      "step": 7051
+    },
+    {
+      "epoch": 0.07052,
+      "grad_norm": 1.0532561540603638,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 7052
+    },
+    {
+      "epoch": 0.07053,
+      "grad_norm": 0.8734548687934875,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 7053
+    },
+    {
+      "epoch": 0.07054,
+      "grad_norm": 0.8844286799430847,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 7054
+    },
+    {
+      "epoch": 0.07055,
+      "grad_norm": 0.8699377179145813,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 7055
+    },
+    {
+      "epoch": 0.07056,
+      "grad_norm": 0.8266685605049133,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 7056
+    },
+    {
+      "epoch": 0.07057,
+      "grad_norm": 0.7403209805488586,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 7057
+    },
+    {
+      "epoch": 0.07058,
+      "grad_norm": 0.7143754959106445,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 7058
+    },
+    {
+      "epoch": 0.07059,
+      "grad_norm": 0.7726312875747681,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 7059
+    },
+    {
+      "epoch": 0.0706,
+      "grad_norm": 0.7227030396461487,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 7060
+    },
+    {
+      "epoch": 0.07061,
+      "grad_norm": 0.7146661281585693,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 7061
+    },
+    {
+      "epoch": 0.07062,
+      "grad_norm": 0.7332003712654114,
+      "learning_rate": 0.003,
+      "loss": 4.0834,
+      "step": 7062
+    },
+    {
+      "epoch": 0.07063,
+      "grad_norm": 0.7866206765174866,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 7063
+    },
+    {
+      "epoch": 0.07064,
+      "grad_norm": 0.8742010593414307,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 7064
+    },
+    {
+      "epoch": 0.07065,
+      "grad_norm": 0.7553660869598389,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 7065
+    },
+    {
+      "epoch": 0.07066,
+      "grad_norm": 0.6427456140518188,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 7066
+    },
+    {
+      "epoch": 0.07067,
+      "grad_norm": 0.561661422252655,
+      "learning_rate": 0.003,
+      "loss": 4.0905,
+      "step": 7067
+    },
+    {
+      "epoch": 0.07068,
+      "grad_norm": 0.49639126658439636,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 7068
+    },
+    {
+      "epoch": 0.07069,
+      "grad_norm": 0.5630697011947632,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 7069
+    },
+    {
+      "epoch": 0.0707,
+      "grad_norm": 0.6418008208274841,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 7070
+    },
+    {
+      "epoch": 0.07071,
+      "grad_norm": 0.5743558406829834,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 7071
+    },
+    {
+      "epoch": 0.07072,
+      "grad_norm": 0.6298477053642273,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 7072
+    },
+    {
+      "epoch": 0.07073,
+      "grad_norm": 0.6862906813621521,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 7073
+    },
+    {
+      "epoch": 0.07074,
+      "grad_norm": 0.7200958728790283,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 7074
+    },
+    {
+      "epoch": 0.07075,
+      "grad_norm": 0.8121984601020813,
+      "learning_rate": 0.003,
+      "loss": 4.0793,
+      "step": 7075
+    },
+    {
+      "epoch": 0.07076,
+      "grad_norm": 1.0043666362762451,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 7076
+    },
+    {
+      "epoch": 0.07077,
+      "grad_norm": 1.0342673063278198,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 7077
+    },
+    {
+      "epoch": 0.07078,
+      "grad_norm": 0.8044958710670471,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 7078
+    },
+    {
+      "epoch": 0.07079,
+      "grad_norm": 0.5679816007614136,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 7079
+    },
+    {
+      "epoch": 0.0708,
+      "grad_norm": 0.6305522918701172,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 7080
+    },
+    {
+      "epoch": 0.07081,
+      "grad_norm": 0.6851279139518738,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 7081
+    },
+    {
+      "epoch": 0.07082,
+      "grad_norm": 0.7641652822494507,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 7082
+    },
+    {
+      "epoch": 0.07083,
+      "grad_norm": 0.7183335423469543,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 7083
+    },
+    {
+      "epoch": 0.07084,
+      "grad_norm": 0.6829813718795776,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7084
+    },
+    {
+      "epoch": 0.07085,
+      "grad_norm": 0.8056393265724182,
+      "learning_rate": 0.003,
+      "loss": 4.0684,
+      "step": 7085
+    },
+    {
+      "epoch": 0.07086,
+      "grad_norm": 0.8371883034706116,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 7086
+    },
+    {
+      "epoch": 0.07087,
+      "grad_norm": 0.8727524876594543,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 7087
+    },
+    {
+      "epoch": 0.07088,
+      "grad_norm": 0.9062744379043579,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 7088
+    },
+    {
+      "epoch": 0.07089,
+      "grad_norm": 0.7283234000205994,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 7089
+    },
+    {
+      "epoch": 0.0709,
+      "grad_norm": 0.6992966532707214,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 7090
+    },
+    {
+      "epoch": 0.07091,
+      "grad_norm": 0.6640942692756653,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 7091
+    },
+    {
+      "epoch": 0.07092,
+      "grad_norm": 0.5844820737838745,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 7092
+    },
+    {
+      "epoch": 0.07093,
+      "grad_norm": 0.5870398879051208,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 7093
+    },
+    {
+      "epoch": 0.07094,
+      "grad_norm": 0.635111391544342,
+      "learning_rate": 0.003,
+      "loss": 4.0804,
+      "step": 7094
+    },
+    {
+      "epoch": 0.07095,
+      "grad_norm": 0.7052547335624695,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 7095
+    },
+    {
+      "epoch": 0.07096,
+      "grad_norm": 0.8631133437156677,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 7096
+    },
+    {
+      "epoch": 0.07097,
+      "grad_norm": 0.8540601134300232,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 7097
+    },
+    {
+      "epoch": 0.07098,
+      "grad_norm": 0.6860929727554321,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 7098
+    },
+    {
+      "epoch": 0.07099,
+      "grad_norm": 0.6758686304092407,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 7099
+    },
+    {
+      "epoch": 0.071,
+      "grad_norm": 0.6959379315376282,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 7100
+    },
+    {
+      "epoch": 0.07101,
+      "grad_norm": 0.6961118578910828,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 7101
+    },
+    {
+      "epoch": 0.07102,
+      "grad_norm": 0.7694392204284668,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 7102
+    },
+    {
+      "epoch": 0.07103,
+      "grad_norm": 0.8063876032829285,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 7103
+    },
+    {
+      "epoch": 0.07104,
+      "grad_norm": 0.7437668442726135,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 7104
+    },
+    {
+      "epoch": 0.07105,
+      "grad_norm": 0.7525627613067627,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 7105
+    },
+    {
+      "epoch": 0.07106,
+      "grad_norm": 0.6717851758003235,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 7106
+    },
+    {
+      "epoch": 0.07107,
+      "grad_norm": 0.6691828966140747,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 7107
+    },
+    {
+      "epoch": 0.07108,
+      "grad_norm": 0.5766210556030273,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 7108
+    },
+    {
+      "epoch": 0.07109,
+      "grad_norm": 0.503217875957489,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 7109
+    },
+    {
+      "epoch": 0.0711,
+      "grad_norm": 0.5127413272857666,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 7110
+    },
+    {
+      "epoch": 0.07111,
+      "grad_norm": 0.6104854941368103,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 7111
+    },
+    {
+      "epoch": 0.07112,
+      "grad_norm": 0.7156305313110352,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 7112
+    },
+    {
+      "epoch": 0.07113,
+      "grad_norm": 0.9921921491622925,
+      "learning_rate": 0.003,
+      "loss": 4.082,
+      "step": 7113
+    },
+    {
+      "epoch": 0.07114,
+      "grad_norm": 1.3377934694290161,
+      "learning_rate": 0.003,
+      "loss": 4.0714,
+      "step": 7114
+    },
+    {
+      "epoch": 0.07115,
+      "grad_norm": 0.8231159448623657,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 7115
+    },
+    {
+      "epoch": 0.07116,
+      "grad_norm": 0.8249427676200867,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 7116
+    },
+    {
+      "epoch": 0.07117,
+      "grad_norm": 0.7995317578315735,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 7117
+    },
+    {
+      "epoch": 0.07118,
+      "grad_norm": 0.8792988657951355,
+      "learning_rate": 0.003,
+      "loss": 4.0777,
+      "step": 7118
+    },
+    {
+      "epoch": 0.07119,
+      "grad_norm": 0.870874285697937,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 7119
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.8077058792114258,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 7120
+    },
+    {
+      "epoch": 0.07121,
+      "grad_norm": 0.7849383354187012,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 7121
+    },
+    {
+      "epoch": 0.07122,
+      "grad_norm": 0.6466601490974426,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 7122
+    },
+    {
+      "epoch": 0.07123,
+      "grad_norm": 0.6387749910354614,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 7123
+    },
+    {
+      "epoch": 0.07124,
+      "grad_norm": 0.6785484552383423,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 7124
+    },
+    {
+      "epoch": 0.07125,
+      "grad_norm": 0.7016876935958862,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 7125
+    },
+    {
+      "epoch": 0.07126,
+      "grad_norm": 0.6375389695167542,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 7126
+    },
+    {
+      "epoch": 0.07127,
+      "grad_norm": 0.6511231660842896,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 7127
+    },
+    {
+      "epoch": 0.07128,
+      "grad_norm": 0.6708038449287415,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 7128
+    },
+    {
+      "epoch": 0.07129,
+      "grad_norm": 0.81505286693573,
+      "learning_rate": 0.003,
+      "loss": 4.0632,
+      "step": 7129
+    },
+    {
+      "epoch": 0.0713,
+      "grad_norm": 0.9627731442451477,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 7130
+    },
+    {
+      "epoch": 0.07131,
+      "grad_norm": 1.073629379272461,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 7131
+    },
+    {
+      "epoch": 0.07132,
+      "grad_norm": 0.7830027341842651,
+      "learning_rate": 0.003,
+      "loss": 4.0771,
+      "step": 7132
+    },
+    {
+      "epoch": 0.07133,
+      "grad_norm": 0.6530002951622009,
+      "learning_rate": 0.003,
+      "loss": 4.08,
+      "step": 7133
+    },
+    {
+      "epoch": 0.07134,
+      "grad_norm": 0.6852246522903442,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 7134
+    },
+    {
+      "epoch": 0.07135,
+      "grad_norm": 0.646599292755127,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 7135
+    },
+    {
+      "epoch": 0.07136,
+      "grad_norm": 0.6449059247970581,
+      "learning_rate": 0.003,
+      "loss": 4.0843,
+      "step": 7136
+    },
+    {
+      "epoch": 0.07137,
+      "grad_norm": 0.76864093542099,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 7137
+    },
+    {
+      "epoch": 0.07138,
+      "grad_norm": 0.853398859500885,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 7138
+    },
+    {
+      "epoch": 0.07139,
+      "grad_norm": 0.786717414855957,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 7139
+    },
+    {
+      "epoch": 0.0714,
+      "grad_norm": 0.6665356755256653,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 7140
+    },
+    {
+      "epoch": 0.07141,
+      "grad_norm": 0.5583412051200867,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 7141
+    },
+    {
+      "epoch": 0.07142,
+      "grad_norm": 0.5142695307731628,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 7142
+    },
+    {
+      "epoch": 0.07143,
+      "grad_norm": 0.5608883500099182,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 7143
+    },
+    {
+      "epoch": 0.07144,
+      "grad_norm": 0.5874366760253906,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 7144
+    },
+    {
+      "epoch": 0.07145,
+      "grad_norm": 0.7359901070594788,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 7145
+    },
+    {
+      "epoch": 0.07146,
+      "grad_norm": 0.8212249875068665,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 7146
+    },
+    {
+      "epoch": 0.07147,
+      "grad_norm": 0.8431135416030884,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 7147
+    },
+    {
+      "epoch": 0.07148,
+      "grad_norm": 0.7992448806762695,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 7148
+    },
+    {
+      "epoch": 0.07149,
+      "grad_norm": 0.7589306235313416,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 7149
+    },
+    {
+      "epoch": 0.0715,
+      "grad_norm": 0.9306392669677734,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 7150
+    },
+    {
+      "epoch": 0.07151,
+      "grad_norm": 0.9924577474594116,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 7151
+    },
+    {
+      "epoch": 0.07152,
+      "grad_norm": 0.883642315864563,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 7152
+    },
+    {
+      "epoch": 0.07153,
+      "grad_norm": 0.7332523465156555,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 7153
+    },
+    {
+      "epoch": 0.07154,
+      "grad_norm": 0.7003593444824219,
+      "learning_rate": 0.003,
+      "loss": 4.0909,
+      "step": 7154
+    },
+    {
+      "epoch": 0.07155,
+      "grad_norm": 0.6480372548103333,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 7155
+    },
+    {
+      "epoch": 0.07156,
+      "grad_norm": 0.681715190410614,
+      "learning_rate": 0.003,
+      "loss": 4.0599,
+      "step": 7156
+    },
+    {
+      "epoch": 0.07157,
+      "grad_norm": 0.7040699124336243,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 7157
+    },
+    {
+      "epoch": 0.07158,
+      "grad_norm": 0.6674701571464539,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 7158
+    },
+    {
+      "epoch": 0.07159,
+      "grad_norm": 0.7054368257522583,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 7159
+    },
+    {
+      "epoch": 0.0716,
+      "grad_norm": 0.8435150384902954,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 7160
+    },
+    {
+      "epoch": 0.07161,
+      "grad_norm": 0.7803497314453125,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 7161
+    },
+    {
+      "epoch": 0.07162,
+      "grad_norm": 0.9659714698791504,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 7162
+    },
+    {
+      "epoch": 0.07163,
+      "grad_norm": 1.1324249505996704,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 7163
+    },
+    {
+      "epoch": 0.07164,
+      "grad_norm": 0.8073561191558838,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 7164
+    },
+    {
+      "epoch": 0.07165,
+      "grad_norm": 0.6374465823173523,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 7165
+    },
+    {
+      "epoch": 0.07166,
+      "grad_norm": 0.6178154945373535,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 7166
+    },
+    {
+      "epoch": 0.07167,
+      "grad_norm": 0.683618426322937,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 7167
+    },
+    {
+      "epoch": 0.07168,
+      "grad_norm": 0.8149320483207703,
+      "learning_rate": 0.003,
+      "loss": 4.0782,
+      "step": 7168
+    },
+    {
+      "epoch": 0.07169,
+      "grad_norm": 1.0499589443206787,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 7169
+    },
+    {
+      "epoch": 0.0717,
+      "grad_norm": 0.8370349407196045,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 7170
+    },
+    {
+      "epoch": 0.07171,
+      "grad_norm": 0.7322642803192139,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 7171
+    },
+    {
+      "epoch": 0.07172,
+      "grad_norm": 0.7073912620544434,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 7172
+    },
+    {
+      "epoch": 0.07173,
+      "grad_norm": 0.7349017858505249,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 7173
+    },
+    {
+      "epoch": 0.07174,
+      "grad_norm": 0.7687479853630066,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 7174
+    },
+    {
+      "epoch": 0.07175,
+      "grad_norm": 0.778110682964325,
+      "learning_rate": 0.003,
+      "loss": 4.0827,
+      "step": 7175
+    },
+    {
+      "epoch": 0.07176,
+      "grad_norm": 0.7889383435249329,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 7176
+    },
+    {
+      "epoch": 0.07177,
+      "grad_norm": 0.7536366581916809,
+      "learning_rate": 0.003,
+      "loss": 4.0604,
+      "step": 7177
+    },
+    {
+      "epoch": 0.07178,
+      "grad_norm": 0.5942474603652954,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 7178
+    },
+    {
+      "epoch": 0.07179,
+      "grad_norm": 0.607769787311554,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 7179
+    },
+    {
+      "epoch": 0.0718,
+      "grad_norm": 0.6560423970222473,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 7180
+    },
+    {
+      "epoch": 0.07181,
+      "grad_norm": 0.7127732038497925,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 7181
+    },
+    {
+      "epoch": 0.07182,
+      "grad_norm": 0.9144400954246521,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 7182
+    },
+    {
+      "epoch": 0.07183,
+      "grad_norm": 1.0960463285446167,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 7183
+    },
+    {
+      "epoch": 0.07184,
+      "grad_norm": 0.7976519465446472,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 7184
+    },
+    {
+      "epoch": 0.07185,
+      "grad_norm": 0.6410822868347168,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 7185
+    },
+    {
+      "epoch": 0.07186,
+      "grad_norm": 0.5536119937896729,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 7186
+    },
+    {
+      "epoch": 0.07187,
+      "grad_norm": 0.562323808670044,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 7187
+    },
+    {
+      "epoch": 0.07188,
+      "grad_norm": 0.5706769227981567,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 7188
+    },
+    {
+      "epoch": 0.07189,
+      "grad_norm": 0.6065142154693604,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 7189
+    },
+    {
+      "epoch": 0.0719,
+      "grad_norm": 0.6539909839630127,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 7190
+    },
+    {
+      "epoch": 0.07191,
+      "grad_norm": 0.716647207736969,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 7191
+    },
+    {
+      "epoch": 0.07192,
+      "grad_norm": 0.7328898310661316,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 7192
+    },
+    {
+      "epoch": 0.07193,
+      "grad_norm": 0.7779203653335571,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 7193
+    },
+    {
+      "epoch": 0.07194,
+      "grad_norm": 0.7531735897064209,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 7194
+    },
+    {
+      "epoch": 0.07195,
+      "grad_norm": 0.7268919944763184,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 7195
+    },
+    {
+      "epoch": 0.07196,
+      "grad_norm": 0.7242192625999451,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 7196
+    },
+    {
+      "epoch": 0.07197,
+      "grad_norm": 0.7115669846534729,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 7197
+    },
+    {
+      "epoch": 0.07198,
+      "grad_norm": 0.6028633117675781,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 7198
+    },
+    {
+      "epoch": 0.07199,
+      "grad_norm": 0.6346673965454102,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 7199
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.6983155012130737,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 7200
+    },
+    {
+      "epoch": 0.07201,
+      "grad_norm": 0.6601306796073914,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 7201
+    },
+    {
+      "epoch": 0.07202,
+      "grad_norm": 0.7019174098968506,
+      "learning_rate": 0.003,
+      "loss": 4.0612,
+      "step": 7202
+    },
+    {
+      "epoch": 0.07203,
+      "grad_norm": 0.7765396237373352,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 7203
+    },
+    {
+      "epoch": 0.07204,
+      "grad_norm": 0.8035591244697571,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 7204
+    },
+    {
+      "epoch": 0.07205,
+      "grad_norm": 0.8030667901039124,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 7205
+    },
+    {
+      "epoch": 0.07206,
+      "grad_norm": 0.7670336365699768,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 7206
+    },
+    {
+      "epoch": 0.07207,
+      "grad_norm": 0.8267470598220825,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 7207
+    },
+    {
+      "epoch": 0.07208,
+      "grad_norm": 0.9432637691497803,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 7208
+    },
+    {
+      "epoch": 0.07209,
+      "grad_norm": 1.0053582191467285,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 7209
+    },
+    {
+      "epoch": 0.0721,
+      "grad_norm": 0.8599228262901306,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 7210
+    },
+    {
+      "epoch": 0.07211,
+      "grad_norm": 0.814187228679657,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 7211
+    },
+    {
+      "epoch": 0.07212,
+      "grad_norm": 0.7470682859420776,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 7212
+    },
+    {
+      "epoch": 0.07213,
+      "grad_norm": 0.6890302300453186,
+      "learning_rate": 0.003,
+      "loss": 4.0779,
+      "step": 7213
+    },
+    {
+      "epoch": 0.07214,
+      "grad_norm": 0.6987749934196472,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 7214
+    },
+    {
+      "epoch": 0.07215,
+      "grad_norm": 0.6948664784431458,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 7215
+    },
+    {
+      "epoch": 0.07216,
+      "grad_norm": 0.739776074886322,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 7216
+    },
+    {
+      "epoch": 0.07217,
+      "grad_norm": 0.7527135014533997,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 7217
+    },
+    {
+      "epoch": 0.07218,
+      "grad_norm": 0.9124861359596252,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 7218
+    },
+    {
+      "epoch": 0.07219,
+      "grad_norm": 1.1901919841766357,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 7219
+    },
+    {
+      "epoch": 0.0722,
+      "grad_norm": 0.7887942790985107,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 7220
+    },
+    {
+      "epoch": 0.07221,
+      "grad_norm": 0.667094886302948,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 7221
+    },
+    {
+      "epoch": 0.07222,
+      "grad_norm": 0.6238069534301758,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 7222
+    },
+    {
+      "epoch": 0.07223,
+      "grad_norm": 0.6278685331344604,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 7223
+    },
+    {
+      "epoch": 0.07224,
+      "grad_norm": 0.5980105400085449,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 7224
+    },
+    {
+      "epoch": 0.07225,
+      "grad_norm": 0.5822548270225525,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 7225
+    },
+    {
+      "epoch": 0.07226,
+      "grad_norm": 0.705129086971283,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 7226
+    },
+    {
+      "epoch": 0.07227,
+      "grad_norm": 0.8607189655303955,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 7227
+    },
+    {
+      "epoch": 0.07228,
+      "grad_norm": 1.0843205451965332,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 7228
+    },
+    {
+      "epoch": 0.07229,
+      "grad_norm": 0.9284544587135315,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 7229
+    },
+    {
+      "epoch": 0.0723,
+      "grad_norm": 0.7482180595397949,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 7230
+    },
+    {
+      "epoch": 0.07231,
+      "grad_norm": 0.7119454741477966,
+      "learning_rate": 0.003,
+      "loss": 4.0604,
+      "step": 7231
+    },
+    {
+      "epoch": 0.07232,
+      "grad_norm": 0.8325626850128174,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 7232
+    },
+    {
+      "epoch": 0.07233,
+      "grad_norm": 0.847053050994873,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 7233
+    },
+    {
+      "epoch": 0.07234,
+      "grad_norm": 0.8324984908103943,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 7234
+    },
+    {
+      "epoch": 0.07235,
+      "grad_norm": 0.9077214598655701,
+      "learning_rate": 0.003,
+      "loss": 4.0884,
+      "step": 7235
+    },
+    {
+      "epoch": 0.07236,
+      "grad_norm": 0.7818212509155273,
+      "learning_rate": 0.003,
+      "loss": 4.0832,
+      "step": 7236
+    },
+    {
+      "epoch": 0.07237,
+      "grad_norm": 0.7284092307090759,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 7237
+    },
+    {
+      "epoch": 0.07238,
+      "grad_norm": 0.706311047077179,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 7238
+    },
+    {
+      "epoch": 0.07239,
+      "grad_norm": 0.8178088068962097,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 7239
+    },
+    {
+      "epoch": 0.0724,
+      "grad_norm": 0.8817880749702454,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 7240
+    },
+    {
+      "epoch": 0.07241,
+      "grad_norm": 0.8495557308197021,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 7241
+    },
+    {
+      "epoch": 0.07242,
+      "grad_norm": 0.766780436038971,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 7242
+    },
+    {
+      "epoch": 0.07243,
+      "grad_norm": 0.7193701863288879,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 7243
+    },
+    {
+      "epoch": 0.07244,
+      "grad_norm": 0.6461489200592041,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 7244
+    },
+    {
+      "epoch": 0.07245,
+      "grad_norm": 0.5884445905685425,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 7245
+    },
+    {
+      "epoch": 0.07246,
+      "grad_norm": 0.5750178694725037,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 7246
+    },
+    {
+      "epoch": 0.07247,
+      "grad_norm": 0.6441517472267151,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 7247
+    },
+    {
+      "epoch": 0.07248,
+      "grad_norm": 0.6365780830383301,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 7248
+    },
+    {
+      "epoch": 0.07249,
+      "grad_norm": 0.6930205821990967,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 7249
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.7784088850021362,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 7250
+    },
+    {
+      "epoch": 0.07251,
+      "grad_norm": 0.9219940900802612,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 7251
+    },
+    {
+      "epoch": 0.07252,
+      "grad_norm": 1.1105650663375854,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 7252
+    },
+    {
+      "epoch": 0.07253,
+      "grad_norm": 0.7862104177474976,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 7253
+    },
+    {
+      "epoch": 0.07254,
+      "grad_norm": 0.6501257419586182,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 7254
+    },
+    {
+      "epoch": 0.07255,
+      "grad_norm": 0.7006444334983826,
+      "learning_rate": 0.003,
+      "loss": 4.0645,
+      "step": 7255
+    },
+    {
+      "epoch": 0.07256,
+      "grad_norm": 0.7784441113471985,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 7256
+    },
+    {
+      "epoch": 0.07257,
+      "grad_norm": 0.7117348313331604,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 7257
+    },
+    {
+      "epoch": 0.07258,
+      "grad_norm": 0.6730650663375854,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 7258
+    },
+    {
+      "epoch": 0.07259,
+      "grad_norm": 0.6756656765937805,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 7259
+    },
+    {
+      "epoch": 0.0726,
+      "grad_norm": 0.7697281837463379,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 7260
+    },
+    {
+      "epoch": 0.07261,
+      "grad_norm": 0.7414706945419312,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 7261
+    },
+    {
+      "epoch": 0.07262,
+      "grad_norm": 0.8930975794792175,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 7262
+    },
+    {
+      "epoch": 0.07263,
+      "grad_norm": 0.9588114023208618,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 7263
+    },
+    {
+      "epoch": 0.07264,
+      "grad_norm": 0.765650749206543,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 7264
+    },
+    {
+      "epoch": 0.07265,
+      "grad_norm": 0.6993110775947571,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 7265
+    },
+    {
+      "epoch": 0.07266,
+      "grad_norm": 0.6595311164855957,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 7266
+    },
+    {
+      "epoch": 0.07267,
+      "grad_norm": 0.6637097001075745,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 7267
+    },
+    {
+      "epoch": 0.07268,
+      "grad_norm": 0.6798438429832458,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 7268
+    },
+    {
+      "epoch": 0.07269,
+      "grad_norm": 0.7596190571784973,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 7269
+    },
+    {
+      "epoch": 0.0727,
+      "grad_norm": 0.7761021852493286,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 7270
+    },
+    {
+      "epoch": 0.07271,
+      "grad_norm": 0.8009915947914124,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 7271
+    },
+    {
+      "epoch": 0.07272,
+      "grad_norm": 1.0279133319854736,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 7272
+    },
+    {
+      "epoch": 0.07273,
+      "grad_norm": 1.1666834354400635,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 7273
+    },
+    {
+      "epoch": 0.07274,
+      "grad_norm": 0.6210851073265076,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 7274
+    },
+    {
+      "epoch": 0.07275,
+      "grad_norm": 0.6968746781349182,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 7275
+    },
+    {
+      "epoch": 0.07276,
+      "grad_norm": 0.9396815299987793,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 7276
+    },
+    {
+      "epoch": 0.07277,
+      "grad_norm": 0.9510892033576965,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 7277
+    },
+    {
+      "epoch": 0.07278,
+      "grad_norm": 0.8797145485877991,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 7278
+    },
+    {
+      "epoch": 0.07279,
+      "grad_norm": 0.7459223866462708,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 7279
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.7744358777999878,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 7280
+    },
+    {
+      "epoch": 0.07281,
+      "grad_norm": 0.7015716433525085,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 7281
+    },
+    {
+      "epoch": 0.07282,
+      "grad_norm": 0.7247535586357117,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 7282
+    },
+    {
+      "epoch": 0.07283,
+      "grad_norm": 0.726428210735321,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 7283
+    },
+    {
+      "epoch": 0.07284,
+      "grad_norm": 0.7073158621788025,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 7284
+    },
+    {
+      "epoch": 0.07285,
+      "grad_norm": 0.682987630367279,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 7285
+    },
+    {
+      "epoch": 0.07286,
+      "grad_norm": 0.6815469264984131,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 7286
+    },
+    {
+      "epoch": 0.07287,
+      "grad_norm": 0.7032678127288818,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 7287
+    },
+    {
+      "epoch": 0.07288,
+      "grad_norm": 0.6728482246398926,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 7288
+    },
+    {
+      "epoch": 0.07289,
+      "grad_norm": 0.6951834559440613,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 7289
+    },
+    {
+      "epoch": 0.0729,
+      "grad_norm": 0.638177752494812,
+      "learning_rate": 0.003,
+      "loss": 4.0737,
+      "step": 7290
+    },
+    {
+      "epoch": 0.07291,
+      "grad_norm": 0.6703484654426575,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 7291
+    },
+    {
+      "epoch": 0.07292,
+      "grad_norm": 0.6980002522468567,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 7292
+    },
+    {
+      "epoch": 0.07293,
+      "grad_norm": 0.6970522403717041,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 7293
+    },
+    {
+      "epoch": 0.07294,
+      "grad_norm": 0.6544967293739319,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 7294
+    },
+    {
+      "epoch": 0.07295,
+      "grad_norm": 0.6716231107711792,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 7295
+    },
+    {
+      "epoch": 0.07296,
+      "grad_norm": 0.8019461631774902,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 7296
+    },
+    {
+      "epoch": 0.07297,
+      "grad_norm": 0.9077995419502258,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 7297
+    },
+    {
+      "epoch": 0.07298,
+      "grad_norm": 0.8515841364860535,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 7298
+    },
+    {
+      "epoch": 0.07299,
+      "grad_norm": 0.6916643977165222,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 7299
+    },
+    {
+      "epoch": 0.073,
+      "grad_norm": 0.6934566497802734,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 7300
+    },
+    {
+      "epoch": 0.07301,
+      "grad_norm": 0.6328970193862915,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 7301
+    },
+    {
+      "epoch": 0.07302,
+      "grad_norm": 0.6315169930458069,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 7302
+    },
+    {
+      "epoch": 0.07303,
+      "grad_norm": 0.731637179851532,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 7303
+    },
+    {
+      "epoch": 0.07304,
+      "grad_norm": 0.7277827858924866,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 7304
+    },
+    {
+      "epoch": 0.07305,
+      "grad_norm": 0.7023566961288452,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 7305
+    },
+    {
+      "epoch": 0.07306,
+      "grad_norm": 0.6430983543395996,
+      "learning_rate": 0.003,
+      "loss": 4.0633,
+      "step": 7306
+    },
+    {
+      "epoch": 0.07307,
+      "grad_norm": 0.708091676235199,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 7307
+    },
+    {
+      "epoch": 0.07308,
+      "grad_norm": 0.9044761657714844,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 7308
+    },
+    {
+      "epoch": 0.07309,
+      "grad_norm": 1.0247414112091064,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 7309
+    },
+    {
+      "epoch": 0.0731,
+      "grad_norm": 1.082109808921814,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 7310
+    },
+    {
+      "epoch": 0.07311,
+      "grad_norm": 0.8486955165863037,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 7311
+    },
+    {
+      "epoch": 0.07312,
+      "grad_norm": 0.9376227259635925,
+      "learning_rate": 0.003,
+      "loss": 4.0752,
+      "step": 7312
+    },
+    {
+      "epoch": 0.07313,
+      "grad_norm": 0.7661151885986328,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 7313
+    },
+    {
+      "epoch": 0.07314,
+      "grad_norm": 0.7679696679115295,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 7314
+    },
+    {
+      "epoch": 0.07315,
+      "grad_norm": 0.7783799767494202,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 7315
+    },
+    {
+      "epoch": 0.07316,
+      "grad_norm": 0.7211359739303589,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 7316
+    },
+    {
+      "epoch": 0.07317,
+      "grad_norm": 0.6663383841514587,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 7317
+    },
+    {
+      "epoch": 0.07318,
+      "grad_norm": 0.5428328514099121,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 7318
+    },
+    {
+      "epoch": 0.07319,
+      "grad_norm": 0.5540145039558411,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 7319
+    },
+    {
+      "epoch": 0.0732,
+      "grad_norm": 0.6541301608085632,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 7320
+    },
+    {
+      "epoch": 0.07321,
+      "grad_norm": 0.8049659729003906,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 7321
+    },
+    {
+      "epoch": 0.07322,
+      "grad_norm": 0.926133394241333,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 7322
+    },
+    {
+      "epoch": 0.07323,
+      "grad_norm": 0.8449392318725586,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 7323
+    },
+    {
+      "epoch": 0.07324,
+      "grad_norm": 0.7204037308692932,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 7324
+    },
+    {
+      "epoch": 0.07325,
+      "grad_norm": 0.6747505068778992,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 7325
+    },
+    {
+      "epoch": 0.07326,
+      "grad_norm": 0.6664218306541443,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 7326
+    },
+    {
+      "epoch": 0.07327,
+      "grad_norm": 0.781737208366394,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 7327
+    },
+    {
+      "epoch": 0.07328,
+      "grad_norm": 0.8926900625228882,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 7328
+    },
+    {
+      "epoch": 0.07329,
+      "grad_norm": 0.911156177520752,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 7329
+    },
+    {
+      "epoch": 0.0733,
+      "grad_norm": 0.7268263101577759,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 7330
+    },
+    {
+      "epoch": 0.07331,
+      "grad_norm": 0.6712400317192078,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 7331
+    },
+    {
+      "epoch": 0.07332,
+      "grad_norm": 0.5922292470932007,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 7332
+    },
+    {
+      "epoch": 0.07333,
+      "grad_norm": 0.6073193550109863,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 7333
+    },
+    {
+      "epoch": 0.07334,
+      "grad_norm": 0.5941932201385498,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 7334
+    },
+    {
+      "epoch": 0.07335,
+      "grad_norm": 0.6539658904075623,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 7335
+    },
+    {
+      "epoch": 0.07336,
+      "grad_norm": 0.6258936524391174,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 7336
+    },
+    {
+      "epoch": 0.07337,
+      "grad_norm": 0.6982399225234985,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 7337
+    },
+    {
+      "epoch": 0.07338,
+      "grad_norm": 0.7497730851173401,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 7338
+    },
+    {
+      "epoch": 0.07339,
+      "grad_norm": 0.8154767751693726,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 7339
+    },
+    {
+      "epoch": 0.0734,
+      "grad_norm": 0.8396859765052795,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 7340
+    },
+    {
+      "epoch": 0.07341,
+      "grad_norm": 0.8484392762184143,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 7341
+    },
+    {
+      "epoch": 0.07342,
+      "grad_norm": 0.769058108329773,
+      "learning_rate": 0.003,
+      "loss": 4.0684,
+      "step": 7342
+    },
+    {
+      "epoch": 0.07343,
+      "grad_norm": 0.6709862351417542,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 7343
+    },
+    {
+      "epoch": 0.07344,
+      "grad_norm": 0.6540923714637756,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 7344
+    },
+    {
+      "epoch": 0.07345,
+      "grad_norm": 0.6276072859764099,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 7345
+    },
+    {
+      "epoch": 0.07346,
+      "grad_norm": 0.6179201006889343,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 7346
+    },
+    {
+      "epoch": 0.07347,
+      "grad_norm": 0.5960707664489746,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 7347
+    },
+    {
+      "epoch": 0.07348,
+      "grad_norm": 0.5133612751960754,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 7348
+    },
+    {
+      "epoch": 0.07349,
+      "grad_norm": 0.47574296593666077,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 7349
+    },
+    {
+      "epoch": 0.0735,
+      "grad_norm": 0.5420452952384949,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 7350
+    },
+    {
+      "epoch": 0.07351,
+      "grad_norm": 0.6298367977142334,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 7351
+    },
+    {
+      "epoch": 0.07352,
+      "grad_norm": 0.6977446675300598,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 7352
+    },
+    {
+      "epoch": 0.07353,
+      "grad_norm": 0.7407834529876709,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 7353
+    },
+    {
+      "epoch": 0.07354,
+      "grad_norm": 0.8189613819122314,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 7354
+    },
+    {
+      "epoch": 0.07355,
+      "grad_norm": 0.9231334924697876,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 7355
+    },
+    {
+      "epoch": 0.07356,
+      "grad_norm": 0.90891432762146,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 7356
+    },
+    {
+      "epoch": 0.07357,
+      "grad_norm": 0.8406992554664612,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 7357
+    },
+    {
+      "epoch": 0.07358,
+      "grad_norm": 0.8127254843711853,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 7358
+    },
+    {
+      "epoch": 0.07359,
+      "grad_norm": 0.8756438493728638,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 7359
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.9333860278129578,
+      "learning_rate": 0.003,
+      "loss": 4.0714,
+      "step": 7360
+    },
+    {
+      "epoch": 0.07361,
+      "grad_norm": 0.9368488788604736,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 7361
+    },
+    {
+      "epoch": 0.07362,
+      "grad_norm": 0.827850878238678,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 7362
+    },
+    {
+      "epoch": 0.07363,
+      "grad_norm": 0.7587769031524658,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 7363
+    },
+    {
+      "epoch": 0.07364,
+      "grad_norm": 0.881401777267456,
+      "learning_rate": 0.003,
+      "loss": 4.0718,
+      "step": 7364
+    },
+    {
+      "epoch": 0.07365,
+      "grad_norm": 1.1441792249679565,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 7365
+    },
+    {
+      "epoch": 0.07366,
+      "grad_norm": 1.050272822380066,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 7366
+    },
+    {
+      "epoch": 0.07367,
+      "grad_norm": 0.9028184413909912,
+      "learning_rate": 0.003,
+      "loss": 4.0675,
+      "step": 7367
+    },
+    {
+      "epoch": 0.07368,
+      "grad_norm": 0.7709707617759705,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 7368
+    },
+    {
+      "epoch": 0.07369,
+      "grad_norm": 0.8372074365615845,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 7369
+    },
+    {
+      "epoch": 0.0737,
+      "grad_norm": 0.8988859057426453,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 7370
+    },
+    {
+      "epoch": 0.07371,
+      "grad_norm": 0.9398235082626343,
+      "learning_rate": 0.003,
+      "loss": 4.09,
+      "step": 7371
+    },
+    {
+      "epoch": 0.07372,
+      "grad_norm": 0.979543924331665,
+      "learning_rate": 0.003,
+      "loss": 4.0717,
+      "step": 7372
+    },
+    {
+      "epoch": 0.07373,
+      "grad_norm": 0.940352201461792,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 7373
+    },
+    {
+      "epoch": 0.07374,
+      "grad_norm": 0.956783652305603,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 7374
+    },
+    {
+      "epoch": 0.07375,
+      "grad_norm": 0.842356264591217,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 7375
+    },
+    {
+      "epoch": 0.07376,
+      "grad_norm": 0.9043733477592468,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 7376
+    },
+    {
+      "epoch": 0.07377,
+      "grad_norm": 0.9078689217567444,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 7377
+    },
+    {
+      "epoch": 0.07378,
+      "grad_norm": 1.0029642581939697,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 7378
+    },
+    {
+      "epoch": 0.07379,
+      "grad_norm": 1.116869568824768,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 7379
+    },
+    {
+      "epoch": 0.0738,
+      "grad_norm": 0.9037948250770569,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 7380
+    },
+    {
+      "epoch": 0.07381,
+      "grad_norm": 0.8272243738174438,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 7381
+    },
+    {
+      "epoch": 0.07382,
+      "grad_norm": 0.7809496521949768,
+      "learning_rate": 0.003,
+      "loss": 4.1005,
+      "step": 7382
+    },
+    {
+      "epoch": 0.07383,
+      "grad_norm": 0.915755033493042,
+      "learning_rate": 0.003,
+      "loss": 4.0759,
+      "step": 7383
+    },
+    {
+      "epoch": 0.07384,
+      "grad_norm": 0.9646238684654236,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 7384
+    },
+    {
+      "epoch": 0.07385,
+      "grad_norm": 0.7966357469558716,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 7385
+    },
+    {
+      "epoch": 0.07386,
+      "grad_norm": 0.7612378597259521,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 7386
+    },
+    {
+      "epoch": 0.07387,
+      "grad_norm": 0.7063902616500854,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7387
+    },
+    {
+      "epoch": 0.07388,
+      "grad_norm": 0.7505108118057251,
+      "learning_rate": 0.003,
+      "loss": 4.0665,
+      "step": 7388
+    },
+    {
+      "epoch": 0.07389,
+      "grad_norm": 0.6540176868438721,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 7389
+    },
+    {
+      "epoch": 0.0739,
+      "grad_norm": 0.5906816720962524,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 7390
+    },
+    {
+      "epoch": 0.07391,
+      "grad_norm": 0.6899387240409851,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 7391
+    },
+    {
+      "epoch": 0.07392,
+      "grad_norm": 0.7748689651489258,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 7392
+    },
+    {
+      "epoch": 0.07393,
+      "grad_norm": 0.7996538877487183,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 7393
+    },
+    {
+      "epoch": 0.07394,
+      "grad_norm": 0.7616129517555237,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 7394
+    },
+    {
+      "epoch": 0.07395,
+      "grad_norm": 0.6533021330833435,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 7395
+    },
+    {
+      "epoch": 0.07396,
+      "grad_norm": 0.4827071726322174,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 7396
+    },
+    {
+      "epoch": 0.07397,
+      "grad_norm": 0.530000627040863,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 7397
+    },
+    {
+      "epoch": 0.07398,
+      "grad_norm": 0.5404955148696899,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 7398
+    },
+    {
+      "epoch": 0.07399,
+      "grad_norm": 0.5249134302139282,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 7399
+    },
+    {
+      "epoch": 0.074,
+      "grad_norm": 0.6226985454559326,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 7400
+    },
+    {
+      "epoch": 0.07401,
+      "grad_norm": 0.6949280500411987,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 7401
+    },
+    {
+      "epoch": 0.07402,
+      "grad_norm": 0.6506314873695374,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 7402
+    },
+    {
+      "epoch": 0.07403,
+      "grad_norm": 0.5710362792015076,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 7403
+    },
+    {
+      "epoch": 0.07404,
+      "grad_norm": 0.5003657341003418,
+      "learning_rate": 0.003,
+      "loss": 4.0804,
+      "step": 7404
+    },
+    {
+      "epoch": 0.07405,
+      "grad_norm": 0.4554838538169861,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 7405
+    },
+    {
+      "epoch": 0.07406,
+      "grad_norm": 0.5003705024719238,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 7406
+    },
+    {
+      "epoch": 0.07407,
+      "grad_norm": 0.5069988369941711,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 7407
+    },
+    {
+      "epoch": 0.07408,
+      "grad_norm": 0.4921700656414032,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 7408
+    },
+    {
+      "epoch": 0.07409,
+      "grad_norm": 0.5235347151756287,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 7409
+    },
+    {
+      "epoch": 0.0741,
+      "grad_norm": 0.532010555267334,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 7410
+    },
+    {
+      "epoch": 0.07411,
+      "grad_norm": 0.5434505343437195,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 7411
+    },
+    {
+      "epoch": 0.07412,
+      "grad_norm": 0.6439001560211182,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 7412
+    },
+    {
+      "epoch": 0.07413,
+      "grad_norm": 0.7864586710929871,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 7413
+    },
+    {
+      "epoch": 0.07414,
+      "grad_norm": 1.0050026178359985,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 7414
+    },
+    {
+      "epoch": 0.07415,
+      "grad_norm": 1.035174012184143,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 7415
+    },
+    {
+      "epoch": 0.07416,
+      "grad_norm": 0.7707844376564026,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 7416
+    },
+    {
+      "epoch": 0.07417,
+      "grad_norm": 0.7662503719329834,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 7417
+    },
+    {
+      "epoch": 0.07418,
+      "grad_norm": 0.8928897380828857,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 7418
+    },
+    {
+      "epoch": 0.07419,
+      "grad_norm": 0.8406341671943665,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 7419
+    },
+    {
+      "epoch": 0.0742,
+      "grad_norm": 0.8542524576187134,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 7420
+    },
+    {
+      "epoch": 0.07421,
+      "grad_norm": 0.8937854766845703,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 7421
+    },
+    {
+      "epoch": 0.07422,
+      "grad_norm": 0.9040619730949402,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 7422
+    },
+    {
+      "epoch": 0.07423,
+      "grad_norm": 0.7693426012992859,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 7423
+    },
+    {
+      "epoch": 0.07424,
+      "grad_norm": 0.7264552116394043,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 7424
+    },
+    {
+      "epoch": 0.07425,
+      "grad_norm": 0.7355172038078308,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 7425
+    },
+    {
+      "epoch": 0.07426,
+      "grad_norm": 0.7334502339363098,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 7426
+    },
+    {
+      "epoch": 0.07427,
+      "grad_norm": 0.8754525780677795,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 7427
+    },
+    {
+      "epoch": 0.07428,
+      "grad_norm": 0.9548163414001465,
+      "learning_rate": 0.003,
+      "loss": 4.0955,
+      "step": 7428
+    },
+    {
+      "epoch": 0.07429,
+      "grad_norm": 0.9186201095581055,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 7429
+    },
+    {
+      "epoch": 0.0743,
+      "grad_norm": 1.0932062864303589,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 7430
+    },
+    {
+      "epoch": 0.07431,
+      "grad_norm": 0.7931951284408569,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7431
+    },
+    {
+      "epoch": 0.07432,
+      "grad_norm": 0.6674275994300842,
+      "learning_rate": 0.003,
+      "loss": 4.0637,
+      "step": 7432
+    },
+    {
+      "epoch": 0.07433,
+      "grad_norm": 0.5529371500015259,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 7433
+    },
+    {
+      "epoch": 0.07434,
+      "grad_norm": 0.5146396160125732,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 7434
+    },
+    {
+      "epoch": 0.07435,
+      "grad_norm": 0.5399970412254333,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 7435
+    },
+    {
+      "epoch": 0.07436,
+      "grad_norm": 0.6538156867027283,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 7436
+    },
+    {
+      "epoch": 0.07437,
+      "grad_norm": 0.8563846349716187,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 7437
+    },
+    {
+      "epoch": 0.07438,
+      "grad_norm": 1.1433502435684204,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 7438
+    },
+    {
+      "epoch": 0.07439,
+      "grad_norm": 0.8334239721298218,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 7439
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.6448179483413696,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 7440
+    },
+    {
+      "epoch": 0.07441,
+      "grad_norm": 0.7158012986183167,
+      "learning_rate": 0.003,
+      "loss": 4.0885,
+      "step": 7441
+    },
+    {
+      "epoch": 0.07442,
+      "grad_norm": 0.8154329061508179,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 7442
+    },
+    {
+      "epoch": 0.07443,
+      "grad_norm": 0.8508999943733215,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 7443
+    },
+    {
+      "epoch": 0.07444,
+      "grad_norm": 0.8207703828811646,
+      "learning_rate": 0.003,
+      "loss": 4.0654,
+      "step": 7444
+    },
+    {
+      "epoch": 0.07445,
+      "grad_norm": 0.7776240110397339,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 7445
+    },
+    {
+      "epoch": 0.07446,
+      "grad_norm": 0.7464262247085571,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 7446
+    },
+    {
+      "epoch": 0.07447,
+      "grad_norm": 0.6316878795623779,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 7447
+    },
+    {
+      "epoch": 0.07448,
+      "grad_norm": 0.6105993390083313,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 7448
+    },
+    {
+      "epoch": 0.07449,
+      "grad_norm": 0.6523858308792114,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 7449
+    },
+    {
+      "epoch": 0.0745,
+      "grad_norm": 0.7932419180870056,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 7450
+    },
+    {
+      "epoch": 0.07451,
+      "grad_norm": 0.736053466796875,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 7451
+    },
+    {
+      "epoch": 0.07452,
+      "grad_norm": 0.7587615251541138,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 7452
+    },
+    {
+      "epoch": 0.07453,
+      "grad_norm": 0.7929425239562988,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 7453
+    },
+    {
+      "epoch": 0.07454,
+      "grad_norm": 0.8557668924331665,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 7454
+    },
+    {
+      "epoch": 0.07455,
+      "grad_norm": 0.8376034498214722,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 7455
+    },
+    {
+      "epoch": 0.07456,
+      "grad_norm": 0.6638903021812439,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 7456
+    },
+    {
+      "epoch": 0.07457,
+      "grad_norm": 0.5936045050621033,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 7457
+    },
+    {
+      "epoch": 0.07458,
+      "grad_norm": 0.6461426615715027,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 7458
+    },
+    {
+      "epoch": 0.07459,
+      "grad_norm": 0.7057914137840271,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 7459
+    },
+    {
+      "epoch": 0.0746,
+      "grad_norm": 0.7673705220222473,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 7460
+    },
+    {
+      "epoch": 0.07461,
+      "grad_norm": 0.7809221744537354,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 7461
+    },
+    {
+      "epoch": 0.07462,
+      "grad_norm": 0.7208760976791382,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 7462
+    },
+    {
+      "epoch": 0.07463,
+      "grad_norm": 0.5994004011154175,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 7463
+    },
+    {
+      "epoch": 0.07464,
+      "grad_norm": 0.6108066439628601,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 7464
+    },
+    {
+      "epoch": 0.07465,
+      "grad_norm": 0.7504144906997681,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 7465
+    },
+    {
+      "epoch": 0.07466,
+      "grad_norm": 0.8534754514694214,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 7466
+    },
+    {
+      "epoch": 0.07467,
+      "grad_norm": 0.8829362988471985,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 7467
+    },
+    {
+      "epoch": 0.07468,
+      "grad_norm": 0.8858988881111145,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 7468
+    },
+    {
+      "epoch": 0.07469,
+      "grad_norm": 0.8821552395820618,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 7469
+    },
+    {
+      "epoch": 0.0747,
+      "grad_norm": 0.8428568840026855,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 7470
+    },
+    {
+      "epoch": 0.07471,
+      "grad_norm": 0.7221834659576416,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 7471
+    },
+    {
+      "epoch": 0.07472,
+      "grad_norm": 0.6946365833282471,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 7472
+    },
+    {
+      "epoch": 0.07473,
+      "grad_norm": 0.6493435502052307,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 7473
+    },
+    {
+      "epoch": 0.07474,
+      "grad_norm": 0.7220835089683533,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 7474
+    },
+    {
+      "epoch": 0.07475,
+      "grad_norm": 0.7649142146110535,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 7475
+    },
+    {
+      "epoch": 0.07476,
+      "grad_norm": 0.914003312587738,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 7476
+    },
+    {
+      "epoch": 0.07477,
+      "grad_norm": 1.0200934410095215,
+      "learning_rate": 0.003,
+      "loss": 4.0652,
+      "step": 7477
+    },
+    {
+      "epoch": 0.07478,
+      "grad_norm": 1.0416991710662842,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 7478
+    },
+    {
+      "epoch": 0.07479,
+      "grad_norm": 0.8721848130226135,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 7479
+    },
+    {
+      "epoch": 0.0748,
+      "grad_norm": 0.7051395177841187,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 7480
+    },
+    {
+      "epoch": 0.07481,
+      "grad_norm": 0.6780185103416443,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 7481
+    },
+    {
+      "epoch": 0.07482,
+      "grad_norm": 0.6697372198104858,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 7482
+    },
+    {
+      "epoch": 0.07483,
+      "grad_norm": 0.6301858425140381,
+      "learning_rate": 0.003,
+      "loss": 4.07,
+      "step": 7483
+    },
+    {
+      "epoch": 0.07484,
+      "grad_norm": 0.6258836984634399,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 7484
+    },
+    {
+      "epoch": 0.07485,
+      "grad_norm": 0.667647123336792,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 7485
+    },
+    {
+      "epoch": 0.07486,
+      "grad_norm": 0.8375819325447083,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 7486
+    },
+    {
+      "epoch": 0.07487,
+      "grad_norm": 1.0791035890579224,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 7487
+    },
+    {
+      "epoch": 0.07488,
+      "grad_norm": 0.9067569375038147,
+      "learning_rate": 0.003,
+      "loss": 4.0612,
+      "step": 7488
+    },
+    {
+      "epoch": 0.07489,
+      "grad_norm": 0.6088890433311462,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 7489
+    },
+    {
+      "epoch": 0.0749,
+      "grad_norm": 0.5461151599884033,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 7490
+    },
+    {
+      "epoch": 0.07491,
+      "grad_norm": 0.5697436928749084,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 7491
+    },
+    {
+      "epoch": 0.07492,
+      "grad_norm": 0.5373200178146362,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 7492
+    },
+    {
+      "epoch": 0.07493,
+      "grad_norm": 0.45845916867256165,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 7493
+    },
+    {
+      "epoch": 0.07494,
+      "grad_norm": 0.4908093512058258,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 7494
+    },
+    {
+      "epoch": 0.07495,
+      "grad_norm": 0.4465610682964325,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 7495
+    },
+    {
+      "epoch": 0.07496,
+      "grad_norm": 0.44271135330200195,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 7496
+    },
+    {
+      "epoch": 0.07497,
+      "grad_norm": 0.4760777950286865,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 7497
+    },
+    {
+      "epoch": 0.07498,
+      "grad_norm": 0.5491810441017151,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 7498
+    },
+    {
+      "epoch": 0.07499,
+      "grad_norm": 0.5653870105743408,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 7499
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.5969352722167969,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 7500
+    },
+    {
+      "epoch": 0.07501,
+      "grad_norm": 0.6566980481147766,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 7501
+    },
+    {
+      "epoch": 0.07502,
+      "grad_norm": 0.7576538920402527,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 7502
+    },
+    {
+      "epoch": 0.07503,
+      "grad_norm": 0.9570623636245728,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 7503
+    },
+    {
+      "epoch": 0.07504,
+      "grad_norm": 1.1501446962356567,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 7504
+    },
+    {
+      "epoch": 0.07505,
+      "grad_norm": 0.821244478225708,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 7505
+    },
+    {
+      "epoch": 0.07506,
+      "grad_norm": 0.9706292152404785,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7506
+    },
+    {
+      "epoch": 0.07507,
+      "grad_norm": 1.2102551460266113,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 7507
+    },
+    {
+      "epoch": 0.07508,
+      "grad_norm": 0.8915047645568848,
+      "learning_rate": 0.003,
+      "loss": 4.0751,
+      "step": 7508
+    },
+    {
+      "epoch": 0.07509,
+      "grad_norm": 0.795073390007019,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 7509
+    },
+    {
+      "epoch": 0.0751,
+      "grad_norm": 0.7662889957427979,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 7510
+    },
+    {
+      "epoch": 0.07511,
+      "grad_norm": 0.7474156022071838,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 7511
+    },
+    {
+      "epoch": 0.07512,
+      "grad_norm": 0.8176290392875671,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 7512
+    },
+    {
+      "epoch": 0.07513,
+      "grad_norm": 0.9439889192581177,
+      "learning_rate": 0.003,
+      "loss": 4.0672,
+      "step": 7513
+    },
+    {
+      "epoch": 0.07514,
+      "grad_norm": 1.1200593709945679,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 7514
+    },
+    {
+      "epoch": 0.07515,
+      "grad_norm": 0.9754398465156555,
+      "learning_rate": 0.003,
+      "loss": 4.0911,
+      "step": 7515
+    },
+    {
+      "epoch": 0.07516,
+      "grad_norm": 0.8390057682991028,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 7516
+    },
+    {
+      "epoch": 0.07517,
+      "grad_norm": 0.7585924863815308,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 7517
+    },
+    {
+      "epoch": 0.07518,
+      "grad_norm": 0.8066619038581848,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 7518
+    },
+    {
+      "epoch": 0.07519,
+      "grad_norm": 0.8263963460922241,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 7519
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.7621636390686035,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 7520
+    },
+    {
+      "epoch": 0.07521,
+      "grad_norm": 0.8395116329193115,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 7521
+    },
+    {
+      "epoch": 0.07522,
+      "grad_norm": 0.855266273021698,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 7522
+    },
+    {
+      "epoch": 0.07523,
+      "grad_norm": 0.9209402203559875,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 7523
+    },
+    {
+      "epoch": 0.07524,
+      "grad_norm": 0.8674631714820862,
+      "learning_rate": 0.003,
+      "loss": 4.0717,
+      "step": 7524
+    },
+    {
+      "epoch": 0.07525,
+      "grad_norm": 0.9512166380882263,
+      "learning_rate": 0.003,
+      "loss": 4.0816,
+      "step": 7525
+    },
+    {
+      "epoch": 0.07526,
+      "grad_norm": 1.0687352418899536,
+      "learning_rate": 0.003,
+      "loss": 4.0833,
+      "step": 7526
+    },
+    {
+      "epoch": 0.07527,
+      "grad_norm": 0.8810518383979797,
+      "learning_rate": 0.003,
+      "loss": 4.0776,
+      "step": 7527
+    },
+    {
+      "epoch": 0.07528,
+      "grad_norm": 0.8249692320823669,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 7528
+    },
+    {
+      "epoch": 0.07529,
+      "grad_norm": 0.779848039150238,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 7529
+    },
+    {
+      "epoch": 0.0753,
+      "grad_norm": 0.5802583694458008,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 7530
+    },
+    {
+      "epoch": 0.07531,
+      "grad_norm": 0.5488690137863159,
+      "learning_rate": 0.003,
+      "loss": 4.0704,
+      "step": 7531
+    },
+    {
+      "epoch": 0.07532,
+      "grad_norm": 0.6669039130210876,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 7532
+    },
+    {
+      "epoch": 0.07533,
+      "grad_norm": 0.7888681292533875,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 7533
+    },
+    {
+      "epoch": 0.07534,
+      "grad_norm": 0.8699842691421509,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 7534
+    },
+    {
+      "epoch": 0.07535,
+      "grad_norm": 0.8488181829452515,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 7535
+    },
+    {
+      "epoch": 0.07536,
+      "grad_norm": 0.7273427844047546,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 7536
+    },
+    {
+      "epoch": 0.07537,
+      "grad_norm": 0.6958652138710022,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 7537
+    },
+    {
+      "epoch": 0.07538,
+      "grad_norm": 0.7339373826980591,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 7538
+    },
+    {
+      "epoch": 0.07539,
+      "grad_norm": 0.5761485695838928,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 7539
+    },
+    {
+      "epoch": 0.0754,
+      "grad_norm": 0.6199266314506531,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 7540
+    },
+    {
+      "epoch": 0.07541,
+      "grad_norm": 0.5513209104537964,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 7541
+    },
+    {
+      "epoch": 0.07542,
+      "grad_norm": 0.6161065697669983,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 7542
+    },
+    {
+      "epoch": 0.07543,
+      "grad_norm": 0.7098742723464966,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 7543
+    },
+    {
+      "epoch": 0.07544,
+      "grad_norm": 0.7922773957252502,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 7544
+    },
+    {
+      "epoch": 0.07545,
+      "grad_norm": 0.7381566166877747,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 7545
+    },
+    {
+      "epoch": 0.07546,
+      "grad_norm": 0.6755424737930298,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 7546
+    },
+    {
+      "epoch": 0.07547,
+      "grad_norm": 0.8436152338981628,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 7547
+    },
+    {
+      "epoch": 0.07548,
+      "grad_norm": 1.0440216064453125,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 7548
+    },
+    {
+      "epoch": 0.07549,
+      "grad_norm": 1.2041999101638794,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 7549
+    },
+    {
+      "epoch": 0.0755,
+      "grad_norm": 0.8359081745147705,
+      "learning_rate": 0.003,
+      "loss": 4.0712,
+      "step": 7550
+    },
+    {
+      "epoch": 0.07551,
+      "grad_norm": 0.7312293648719788,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 7551
+    },
+    {
+      "epoch": 0.07552,
+      "grad_norm": 0.6296018362045288,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 7552
+    },
+    {
+      "epoch": 0.07553,
+      "grad_norm": 0.6481240391731262,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 7553
+    },
+    {
+      "epoch": 0.07554,
+      "grad_norm": 0.6704967021942139,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 7554
+    },
+    {
+      "epoch": 0.07555,
+      "grad_norm": 0.7643706202507019,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 7555
+    },
+    {
+      "epoch": 0.07556,
+      "grad_norm": 0.7920114994049072,
+      "learning_rate": 0.003,
+      "loss": 4.0662,
+      "step": 7556
+    },
+    {
+      "epoch": 0.07557,
+      "grad_norm": 0.813525915145874,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 7557
+    },
+    {
+      "epoch": 0.07558,
+      "grad_norm": 0.8534443378448486,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 7558
+    },
+    {
+      "epoch": 0.07559,
+      "grad_norm": 0.8572565317153931,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 7559
+    },
+    {
+      "epoch": 0.0756,
+      "grad_norm": 0.8627861142158508,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 7560
+    },
+    {
+      "epoch": 0.07561,
+      "grad_norm": 0.7403004765510559,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 7561
+    },
+    {
+      "epoch": 0.07562,
+      "grad_norm": 0.695635974407196,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 7562
+    },
+    {
+      "epoch": 0.07563,
+      "grad_norm": 0.6318756937980652,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 7563
+    },
+    {
+      "epoch": 0.07564,
+      "grad_norm": 0.5857333540916443,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 7564
+    },
+    {
+      "epoch": 0.07565,
+      "grad_norm": 0.5988854169845581,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 7565
+    },
+    {
+      "epoch": 0.07566,
+      "grad_norm": 0.6155543923377991,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 7566
+    },
+    {
+      "epoch": 0.07567,
+      "grad_norm": 0.625502347946167,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 7567
+    },
+    {
+      "epoch": 0.07568,
+      "grad_norm": 0.6525031328201294,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 7568
+    },
+    {
+      "epoch": 0.07569,
+      "grad_norm": 0.705432116985321,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 7569
+    },
+    {
+      "epoch": 0.0757,
+      "grad_norm": 0.7517141103744507,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 7570
+    },
+    {
+      "epoch": 0.07571,
+      "grad_norm": 0.7912932634353638,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 7571
+    },
+    {
+      "epoch": 0.07572,
+      "grad_norm": 0.6804235577583313,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 7572
+    },
+    {
+      "epoch": 0.07573,
+      "grad_norm": 0.5986480712890625,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 7573
+    },
+    {
+      "epoch": 0.07574,
+      "grad_norm": 0.6231833696365356,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 7574
+    },
+    {
+      "epoch": 0.07575,
+      "grad_norm": 0.6790160536766052,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 7575
+    },
+    {
+      "epoch": 0.07576,
+      "grad_norm": 0.657383918762207,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 7576
+    },
+    {
+      "epoch": 0.07577,
+      "grad_norm": 0.5640320777893066,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 7577
+    },
+    {
+      "epoch": 0.07578,
+      "grad_norm": 0.5741853713989258,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 7578
+    },
+    {
+      "epoch": 0.07579,
+      "grad_norm": 0.5155695080757141,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 7579
+    },
+    {
+      "epoch": 0.0758,
+      "grad_norm": 0.489566445350647,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 7580
+    },
+    {
+      "epoch": 0.07581,
+      "grad_norm": 0.4907761812210083,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 7581
+    },
+    {
+      "epoch": 0.07582,
+      "grad_norm": 0.6237406134605408,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 7582
+    },
+    {
+      "epoch": 0.07583,
+      "grad_norm": 0.6927093267440796,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 7583
+    },
+    {
+      "epoch": 0.07584,
+      "grad_norm": 0.7526013851165771,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 7584
+    },
+    {
+      "epoch": 0.07585,
+      "grad_norm": 0.7571908235549927,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 7585
+    },
+    {
+      "epoch": 0.07586,
+      "grad_norm": 0.794185996055603,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 7586
+    },
+    {
+      "epoch": 0.07587,
+      "grad_norm": 1.0438753366470337,
+      "learning_rate": 0.003,
+      "loss": 4.0746,
+      "step": 7587
+    },
+    {
+      "epoch": 0.07588,
+      "grad_norm": 1.1041924953460693,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 7588
+    },
+    {
+      "epoch": 0.07589,
+      "grad_norm": 0.8445767164230347,
+      "learning_rate": 0.003,
+      "loss": 4.0566,
+      "step": 7589
+    },
+    {
+      "epoch": 0.0759,
+      "grad_norm": 0.6652212738990784,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 7590
+    },
+    {
+      "epoch": 0.07591,
+      "grad_norm": 0.5634509325027466,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 7591
+    },
+    {
+      "epoch": 0.07592,
+      "grad_norm": 0.7388189435005188,
+      "learning_rate": 0.003,
+      "loss": 4.056,
+      "step": 7592
+    },
+    {
+      "epoch": 0.07593,
+      "grad_norm": 0.841446042060852,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 7593
+    },
+    {
+      "epoch": 0.07594,
+      "grad_norm": 0.9196954965591431,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 7594
+    },
+    {
+      "epoch": 0.07595,
+      "grad_norm": 0.9264878630638123,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 7595
+    },
+    {
+      "epoch": 0.07596,
+      "grad_norm": 0.7635065913200378,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 7596
+    },
+    {
+      "epoch": 0.07597,
+      "grad_norm": 0.6822502613067627,
+      "learning_rate": 0.003,
+      "loss": 4.0664,
+      "step": 7597
+    },
+    {
+      "epoch": 0.07598,
+      "grad_norm": 0.7266221642494202,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 7598
+    },
+    {
+      "epoch": 0.07599,
+      "grad_norm": 0.6876087784767151,
+      "learning_rate": 0.003,
+      "loss": 4.0612,
+      "step": 7599
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.6935887932777405,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 7600
+    },
+    {
+      "epoch": 0.07601,
+      "grad_norm": 0.7845306992530823,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 7601
+    },
+    {
+      "epoch": 0.07602,
+      "grad_norm": 0.7442597150802612,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 7602
+    },
+    {
+      "epoch": 0.07603,
+      "grad_norm": 0.7260087728500366,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 7603
+    },
+    {
+      "epoch": 0.07604,
+      "grad_norm": 0.6882612705230713,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 7604
+    },
+    {
+      "epoch": 0.07605,
+      "grad_norm": 0.690847635269165,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 7605
+    },
+    {
+      "epoch": 0.07606,
+      "grad_norm": 0.8980867862701416,
+      "learning_rate": 0.003,
+      "loss": 4.1073,
+      "step": 7606
+    },
+    {
+      "epoch": 0.07607,
+      "grad_norm": 1.1527798175811768,
+      "learning_rate": 0.003,
+      "loss": 4.0709,
+      "step": 7607
+    },
+    {
+      "epoch": 0.07608,
+      "grad_norm": 1.0187749862670898,
+      "learning_rate": 0.003,
+      "loss": 4.0819,
+      "step": 7608
+    },
+    {
+      "epoch": 0.07609,
+      "grad_norm": 0.8751260042190552,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 7609
+    },
+    {
+      "epoch": 0.0761,
+      "grad_norm": 0.7832580804824829,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 7610
+    },
+    {
+      "epoch": 0.07611,
+      "grad_norm": 0.8499090671539307,
+      "learning_rate": 0.003,
+      "loss": 4.0791,
+      "step": 7611
+    },
+    {
+      "epoch": 0.07612,
+      "grad_norm": 0.9208655953407288,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 7612
+    },
+    {
+      "epoch": 0.07613,
+      "grad_norm": 1.014868140220642,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 7613
+    },
+    {
+      "epoch": 0.07614,
+      "grad_norm": 0.9037597179412842,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 7614
+    },
+    {
+      "epoch": 0.07615,
+      "grad_norm": 0.675439715385437,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 7615
+    },
+    {
+      "epoch": 0.07616,
+      "grad_norm": 0.6229849457740784,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 7616
+    },
+    {
+      "epoch": 0.07617,
+      "grad_norm": 0.6365832090377808,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 7617
+    },
+    {
+      "epoch": 0.07618,
+      "grad_norm": 0.5526401400566101,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 7618
+    },
+    {
+      "epoch": 0.07619,
+      "grad_norm": 0.4938875734806061,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 7619
+    },
+    {
+      "epoch": 0.0762,
+      "grad_norm": 0.5198700428009033,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 7620
+    },
+    {
+      "epoch": 0.07621,
+      "grad_norm": 0.587841272354126,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 7621
+    },
+    {
+      "epoch": 0.07622,
+      "grad_norm": 0.6705599427223206,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 7622
+    },
+    {
+      "epoch": 0.07623,
+      "grad_norm": 0.7022711634635925,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 7623
+    },
+    {
+      "epoch": 0.07624,
+      "grad_norm": 0.7389097809791565,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 7624
+    },
+    {
+      "epoch": 0.07625,
+      "grad_norm": 0.7409002184867859,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 7625
+    },
+    {
+      "epoch": 0.07626,
+      "grad_norm": 0.6842824220657349,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 7626
+    },
+    {
+      "epoch": 0.07627,
+      "grad_norm": 0.635125458240509,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7627
+    },
+    {
+      "epoch": 0.07628,
+      "grad_norm": 0.49942052364349365,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 7628
+    },
+    {
+      "epoch": 0.07629,
+      "grad_norm": 0.5460994839668274,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 7629
+    },
+    {
+      "epoch": 0.0763,
+      "grad_norm": 0.6246569752693176,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 7630
+    },
+    {
+      "epoch": 0.07631,
+      "grad_norm": 0.7679206728935242,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 7631
+    },
+    {
+      "epoch": 0.07632,
+      "grad_norm": 1.076688528060913,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 7632
+    },
+    {
+      "epoch": 0.07633,
+      "grad_norm": 0.9796765446662903,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 7633
+    },
+    {
+      "epoch": 0.07634,
+      "grad_norm": 0.9849674105644226,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 7634
+    },
+    {
+      "epoch": 0.07635,
+      "grad_norm": 0.8030697107315063,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 7635
+    },
+    {
+      "epoch": 0.07636,
+      "grad_norm": 0.7698909640312195,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 7636
+    },
+    {
+      "epoch": 0.07637,
+      "grad_norm": 1.0407158136367798,
+      "learning_rate": 0.003,
+      "loss": 4.0804,
+      "step": 7637
+    },
+    {
+      "epoch": 0.07638,
+      "grad_norm": 0.8812909722328186,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 7638
+    },
+    {
+      "epoch": 0.07639,
+      "grad_norm": 0.6991906762123108,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 7639
+    },
+    {
+      "epoch": 0.0764,
+      "grad_norm": 0.5978763699531555,
+      "learning_rate": 0.003,
+      "loss": 4.0783,
+      "step": 7640
+    },
+    {
+      "epoch": 0.07641,
+      "grad_norm": 0.631737470626831,
+      "learning_rate": 0.003,
+      "loss": 4.0709,
+      "step": 7641
+    },
+    {
+      "epoch": 0.07642,
+      "grad_norm": 0.6913520097732544,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 7642
+    },
+    {
+      "epoch": 0.07643,
+      "grad_norm": 0.7022256851196289,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 7643
+    },
+    {
+      "epoch": 0.07644,
+      "grad_norm": 0.6650403738021851,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 7644
+    },
+    {
+      "epoch": 0.07645,
+      "grad_norm": 0.7082775235176086,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 7645
+    },
+    {
+      "epoch": 0.07646,
+      "grad_norm": 0.7065228819847107,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 7646
+    },
+    {
+      "epoch": 0.07647,
+      "grad_norm": 0.6930513381958008,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 7647
+    },
+    {
+      "epoch": 0.07648,
+      "grad_norm": 0.7207313776016235,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 7648
+    },
+    {
+      "epoch": 0.07649,
+      "grad_norm": 0.6997010111808777,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 7649
+    },
+    {
+      "epoch": 0.0765,
+      "grad_norm": 0.7499129176139832,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 7650
+    },
+    {
+      "epoch": 0.07651,
+      "grad_norm": 0.9787033796310425,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 7651
+    },
+    {
+      "epoch": 0.07652,
+      "grad_norm": 1.251487374305725,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 7652
+    },
+    {
+      "epoch": 0.07653,
+      "grad_norm": 0.899669349193573,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 7653
+    },
+    {
+      "epoch": 0.07654,
+      "grad_norm": 0.9999637007713318,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 7654
+    },
+    {
+      "epoch": 0.07655,
+      "grad_norm": 0.9081646800041199,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 7655
+    },
+    {
+      "epoch": 0.07656,
+      "grad_norm": 0.783086359500885,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 7656
+    },
+    {
+      "epoch": 0.07657,
+      "grad_norm": 0.7844822406768799,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 7657
+    },
+    {
+      "epoch": 0.07658,
+      "grad_norm": 0.7419793009757996,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 7658
+    },
+    {
+      "epoch": 0.07659,
+      "grad_norm": 0.715597927570343,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 7659
+    },
+    {
+      "epoch": 0.0766,
+      "grad_norm": 0.6549267768859863,
+      "learning_rate": 0.003,
+      "loss": 4.0831,
+      "step": 7660
+    },
+    {
+      "epoch": 0.07661,
+      "grad_norm": 0.785686731338501,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 7661
+    },
+    {
+      "epoch": 0.07662,
+      "grad_norm": 1.0206282138824463,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 7662
+    },
+    {
+      "epoch": 0.07663,
+      "grad_norm": 1.1866017580032349,
+      "learning_rate": 0.003,
+      "loss": 4.0771,
+      "step": 7663
+    },
+    {
+      "epoch": 0.07664,
+      "grad_norm": 0.7022275328636169,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 7664
+    },
+    {
+      "epoch": 0.07665,
+      "grad_norm": 0.6321223974227905,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 7665
+    },
+    {
+      "epoch": 0.07666,
+      "grad_norm": 0.917163074016571,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 7666
+    },
+    {
+      "epoch": 0.07667,
+      "grad_norm": 1.0060826539993286,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 7667
+    },
+    {
+      "epoch": 0.07668,
+      "grad_norm": 0.7803816795349121,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 7668
+    },
+    {
+      "epoch": 0.07669,
+      "grad_norm": 0.5439775586128235,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 7669
+    },
+    {
+      "epoch": 0.0767,
+      "grad_norm": 0.5985819697380066,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 7670
+    },
+    {
+      "epoch": 0.07671,
+      "grad_norm": 0.6868377923965454,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 7671
+    },
+    {
+      "epoch": 0.07672,
+      "grad_norm": 0.7403073906898499,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 7672
+    },
+    {
+      "epoch": 0.07673,
+      "grad_norm": 0.754479169845581,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 7673
+    },
+    {
+      "epoch": 0.07674,
+      "grad_norm": 0.6656348705291748,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 7674
+    },
+    {
+      "epoch": 0.07675,
+      "grad_norm": 0.582493007183075,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 7675
+    },
+    {
+      "epoch": 0.07676,
+      "grad_norm": 0.5362693071365356,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 7676
+    },
+    {
+      "epoch": 0.07677,
+      "grad_norm": 0.5358790755271912,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 7677
+    },
+    {
+      "epoch": 0.07678,
+      "grad_norm": 0.5923407673835754,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 7678
+    },
+    {
+      "epoch": 0.07679,
+      "grad_norm": 0.6382786631584167,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 7679
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.6146613955497742,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 7680
+    },
+    {
+      "epoch": 0.07681,
+      "grad_norm": 0.5740124583244324,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 7681
+    },
+    {
+      "epoch": 0.07682,
+      "grad_norm": 0.5345171093940735,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 7682
+    },
+    {
+      "epoch": 0.07683,
+      "grad_norm": 0.5641381144523621,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 7683
+    },
+    {
+      "epoch": 0.07684,
+      "grad_norm": 0.6472615599632263,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 7684
+    },
+    {
+      "epoch": 0.07685,
+      "grad_norm": 0.7183139324188232,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 7685
+    },
+    {
+      "epoch": 0.07686,
+      "grad_norm": 0.8126670122146606,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 7686
+    },
+    {
+      "epoch": 0.07687,
+      "grad_norm": 0.8759085536003113,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 7687
+    },
+    {
+      "epoch": 0.07688,
+      "grad_norm": 0.8345987796783447,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 7688
+    },
+    {
+      "epoch": 0.07689,
+      "grad_norm": 0.7326918244361877,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 7689
+    },
+    {
+      "epoch": 0.0769,
+      "grad_norm": 0.7978694438934326,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 7690
+    },
+    {
+      "epoch": 0.07691,
+      "grad_norm": 0.9254701733589172,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 7691
+    },
+    {
+      "epoch": 0.07692,
+      "grad_norm": 1.1151405572891235,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 7692
+    },
+    {
+      "epoch": 0.07693,
+      "grad_norm": 0.7501732707023621,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 7693
+    },
+    {
+      "epoch": 0.07694,
+      "grad_norm": 0.6858186721801758,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 7694
+    },
+    {
+      "epoch": 0.07695,
+      "grad_norm": 0.7969062924385071,
+      "learning_rate": 0.003,
+      "loss": 4.0635,
+      "step": 7695
+    },
+    {
+      "epoch": 0.07696,
+      "grad_norm": 0.782137393951416,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 7696
+    },
+    {
+      "epoch": 0.07697,
+      "grad_norm": 0.9599317312240601,
+      "learning_rate": 0.003,
+      "loss": 4.0826,
+      "step": 7697
+    },
+    {
+      "epoch": 0.07698,
+      "grad_norm": 1.1077126264572144,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 7698
+    },
+    {
+      "epoch": 0.07699,
+      "grad_norm": 0.8662140965461731,
+      "learning_rate": 0.003,
+      "loss": 4.0864,
+      "step": 7699
+    },
+    {
+      "epoch": 0.077,
+      "grad_norm": 0.7337355017662048,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 7700
+    },
+    {
+      "epoch": 0.07701,
+      "grad_norm": 0.7257059216499329,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 7701
+    },
+    {
+      "epoch": 0.07702,
+      "grad_norm": 0.6639032959938049,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 7702
+    },
+    {
+      "epoch": 0.07703,
+      "grad_norm": 0.7701454162597656,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 7703
+    },
+    {
+      "epoch": 0.07704,
+      "grad_norm": 0.8333292603492737,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 7704
+    },
+    {
+      "epoch": 0.07705,
+      "grad_norm": 0.71268230676651,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 7705
+    },
+    {
+      "epoch": 0.07706,
+      "grad_norm": 0.7142350077629089,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 7706
+    },
+    {
+      "epoch": 0.07707,
+      "grad_norm": 0.7584391832351685,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 7707
+    },
+    {
+      "epoch": 0.07708,
+      "grad_norm": 0.6686346530914307,
+      "learning_rate": 0.003,
+      "loss": 4.0656,
+      "step": 7708
+    },
+    {
+      "epoch": 0.07709,
+      "grad_norm": 0.6045640707015991,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 7709
+    },
+    {
+      "epoch": 0.0771,
+      "grad_norm": 0.5481950640678406,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 7710
+    },
+    {
+      "epoch": 0.07711,
+      "grad_norm": 0.5819096565246582,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 7711
+    },
+    {
+      "epoch": 0.07712,
+      "grad_norm": 0.703336238861084,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 7712
+    },
+    {
+      "epoch": 0.07713,
+      "grad_norm": 0.6952189803123474,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 7713
+    },
+    {
+      "epoch": 0.07714,
+      "grad_norm": 0.7354077100753784,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 7714
+    },
+    {
+      "epoch": 0.07715,
+      "grad_norm": 0.8077874779701233,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 7715
+    },
+    {
+      "epoch": 0.07716,
+      "grad_norm": 1.056091070175171,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 7716
+    },
+    {
+      "epoch": 0.07717,
+      "grad_norm": 1.122626543045044,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 7717
+    },
+    {
+      "epoch": 0.07718,
+      "grad_norm": 0.7326860427856445,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 7718
+    },
+    {
+      "epoch": 0.07719,
+      "grad_norm": 0.7065683007240295,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 7719
+    },
+    {
+      "epoch": 0.0772,
+      "grad_norm": 0.825425922870636,
+      "learning_rate": 0.003,
+      "loss": 4.0757,
+      "step": 7720
+    },
+    {
+      "epoch": 0.07721,
+      "grad_norm": 0.8575567007064819,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 7721
+    },
+    {
+      "epoch": 0.07722,
+      "grad_norm": 0.8774976134300232,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 7722
+    },
+    {
+      "epoch": 0.07723,
+      "grad_norm": 0.8449294567108154,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 7723
+    },
+    {
+      "epoch": 0.07724,
+      "grad_norm": 0.9954960942268372,
+      "learning_rate": 0.003,
+      "loss": 4.0562,
+      "step": 7724
+    },
+    {
+      "epoch": 0.07725,
+      "grad_norm": 1.0463624000549316,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 7725
+    },
+    {
+      "epoch": 0.07726,
+      "grad_norm": 0.8136222958564758,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 7726
+    },
+    {
+      "epoch": 0.07727,
+      "grad_norm": 0.9302545189857483,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 7727
+    },
+    {
+      "epoch": 0.07728,
+      "grad_norm": 0.8933360576629639,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 7728
+    },
+    {
+      "epoch": 0.07729,
+      "grad_norm": 0.7947314977645874,
+      "learning_rate": 0.003,
+      "loss": 4.0696,
+      "step": 7729
+    },
+    {
+      "epoch": 0.0773,
+      "grad_norm": 0.8221551179885864,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 7730
+    },
+    {
+      "epoch": 0.07731,
+      "grad_norm": 0.7646525502204895,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 7731
+    },
+    {
+      "epoch": 0.07732,
+      "grad_norm": 0.5452698469161987,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 7732
+    },
+    {
+      "epoch": 0.07733,
+      "grad_norm": 0.5334752202033997,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 7733
+    },
+    {
+      "epoch": 0.07734,
+      "grad_norm": 0.5438298583030701,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 7734
+    },
+    {
+      "epoch": 0.07735,
+      "grad_norm": 0.6537331938743591,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 7735
+    },
+    {
+      "epoch": 0.07736,
+      "grad_norm": 0.762072741985321,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 7736
+    },
+    {
+      "epoch": 0.07737,
+      "grad_norm": 0.9448016285896301,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 7737
+    },
+    {
+      "epoch": 0.07738,
+      "grad_norm": 0.943324863910675,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7738
+    },
+    {
+      "epoch": 0.07739,
+      "grad_norm": 0.761298418045044,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 7739
+    },
+    {
+      "epoch": 0.0774,
+      "grad_norm": 0.6667705178260803,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 7740
+    },
+    {
+      "epoch": 0.07741,
+      "grad_norm": 0.7198627591133118,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 7741
+    },
+    {
+      "epoch": 0.07742,
+      "grad_norm": 0.685585081577301,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 7742
+    },
+    {
+      "epoch": 0.07743,
+      "grad_norm": 0.5970986485481262,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 7743
+    },
+    {
+      "epoch": 0.07744,
+      "grad_norm": 0.5659307837486267,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 7744
+    },
+    {
+      "epoch": 0.07745,
+      "grad_norm": 0.5722436308860779,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 7745
+    },
+    {
+      "epoch": 0.07746,
+      "grad_norm": 0.6215358972549438,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7746
+    },
+    {
+      "epoch": 0.07747,
+      "grad_norm": 0.638965904712677,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 7747
+    },
+    {
+      "epoch": 0.07748,
+      "grad_norm": 0.5998613834381104,
+      "learning_rate": 0.003,
+      "loss": 4.0702,
+      "step": 7748
+    },
+    {
+      "epoch": 0.07749,
+      "grad_norm": 0.716853141784668,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 7749
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.8205824494361877,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 7750
+    },
+    {
+      "epoch": 0.07751,
+      "grad_norm": 0.8468402028083801,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 7751
+    },
+    {
+      "epoch": 0.07752,
+      "grad_norm": 0.8478083610534668,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 7752
+    },
+    {
+      "epoch": 0.07753,
+      "grad_norm": 0.8648585677146912,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 7753
+    },
+    {
+      "epoch": 0.07754,
+      "grad_norm": 0.7602769732475281,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 7754
+    },
+    {
+      "epoch": 0.07755,
+      "grad_norm": 0.7095845341682434,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 7755
+    },
+    {
+      "epoch": 0.07756,
+      "grad_norm": 0.7669501304626465,
+      "learning_rate": 0.003,
+      "loss": 4.066,
+      "step": 7756
+    },
+    {
+      "epoch": 0.07757,
+      "grad_norm": 0.7360534071922302,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 7757
+    },
+    {
+      "epoch": 0.07758,
+      "grad_norm": 0.967415988445282,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 7758
+    },
+    {
+      "epoch": 0.07759,
+      "grad_norm": 1.213719129562378,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 7759
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.8443424701690674,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 7760
+    },
+    {
+      "epoch": 0.07761,
+      "grad_norm": 0.7246764302253723,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 7761
+    },
+    {
+      "epoch": 0.07762,
+      "grad_norm": 0.8051997423171997,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 7762
+    },
+    {
+      "epoch": 0.07763,
+      "grad_norm": 0.8146281242370605,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 7763
+    },
+    {
+      "epoch": 0.07764,
+      "grad_norm": 0.9019950032234192,
+      "learning_rate": 0.003,
+      "loss": 4.0705,
+      "step": 7764
+    },
+    {
+      "epoch": 0.07765,
+      "grad_norm": 1.0602766275405884,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 7765
+    },
+    {
+      "epoch": 0.07766,
+      "grad_norm": 0.9736347794532776,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 7766
+    },
+    {
+      "epoch": 0.07767,
+      "grad_norm": 0.7766427993774414,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 7767
+    },
+    {
+      "epoch": 0.07768,
+      "grad_norm": 0.5827065706253052,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 7768
+    },
+    {
+      "epoch": 0.07769,
+      "grad_norm": 0.5819374322891235,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 7769
+    },
+    {
+      "epoch": 0.0777,
+      "grad_norm": 0.6189792156219482,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 7770
+    },
+    {
+      "epoch": 0.07771,
+      "grad_norm": 0.6131955981254578,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 7771
+    },
+    {
+      "epoch": 0.07772,
+      "grad_norm": 0.731395959854126,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 7772
+    },
+    {
+      "epoch": 0.07773,
+      "grad_norm": 0.8028513789176941,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 7773
+    },
+    {
+      "epoch": 0.07774,
+      "grad_norm": 0.8190731406211853,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 7774
+    },
+    {
+      "epoch": 0.07775,
+      "grad_norm": 0.8094491362571716,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 7775
+    },
+    {
+      "epoch": 0.07776,
+      "grad_norm": 0.6882286071777344,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 7776
+    },
+    {
+      "epoch": 0.07777,
+      "grad_norm": 0.6559415459632874,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 7777
+    },
+    {
+      "epoch": 0.07778,
+      "grad_norm": 0.7245689630508423,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 7778
+    },
+    {
+      "epoch": 0.07779,
+      "grad_norm": 0.7859965562820435,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 7779
+    },
+    {
+      "epoch": 0.0778,
+      "grad_norm": 0.7404747605323792,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 7780
+    },
+    {
+      "epoch": 0.07781,
+      "grad_norm": 0.6684058308601379,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 7781
+    },
+    {
+      "epoch": 0.07782,
+      "grad_norm": 0.5081612467765808,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 7782
+    },
+    {
+      "epoch": 0.07783,
+      "grad_norm": 0.5472040176391602,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 7783
+    },
+    {
+      "epoch": 0.07784,
+      "grad_norm": 0.6857728958129883,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 7784
+    },
+    {
+      "epoch": 0.07785,
+      "grad_norm": 0.8567017316818237,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 7785
+    },
+    {
+      "epoch": 0.07786,
+      "grad_norm": 0.9442057609558105,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 7786
+    },
+    {
+      "epoch": 0.07787,
+      "grad_norm": 0.9159606099128723,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 7787
+    },
+    {
+      "epoch": 0.07788,
+      "grad_norm": 0.9067151546478271,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 7788
+    },
+    {
+      "epoch": 0.07789,
+      "grad_norm": 1.0266743898391724,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 7789
+    },
+    {
+      "epoch": 0.0779,
+      "grad_norm": 1.1035289764404297,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 7790
+    },
+    {
+      "epoch": 0.07791,
+      "grad_norm": 0.7223553657531738,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 7791
+    },
+    {
+      "epoch": 0.07792,
+      "grad_norm": 0.5318799018859863,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 7792
+    },
+    {
+      "epoch": 0.07793,
+      "grad_norm": 0.6023153066635132,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 7793
+    },
+    {
+      "epoch": 0.07794,
+      "grad_norm": 0.6616175770759583,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 7794
+    },
+    {
+      "epoch": 0.07795,
+      "grad_norm": 0.674656093120575,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 7795
+    },
+    {
+      "epoch": 0.07796,
+      "grad_norm": 0.6360393166542053,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 7796
+    },
+    {
+      "epoch": 0.07797,
+      "grad_norm": 0.6176856160163879,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 7797
+    },
+    {
+      "epoch": 0.07798,
+      "grad_norm": 0.7055824995040894,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 7798
+    },
+    {
+      "epoch": 0.07799,
+      "grad_norm": 0.7401881217956543,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 7799
+    },
+    {
+      "epoch": 0.078,
+      "grad_norm": 0.7410922646522522,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 7800
+    },
+    {
+      "epoch": 0.07801,
+      "grad_norm": 0.8164018392562866,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 7801
+    },
+    {
+      "epoch": 0.07802,
+      "grad_norm": 1.1281871795654297,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 7802
+    },
+    {
+      "epoch": 0.07803,
+      "grad_norm": 0.9571782946586609,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 7803
+    },
+    {
+      "epoch": 0.07804,
+      "grad_norm": 0.8569245338439941,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 7804
+    },
+    {
+      "epoch": 0.07805,
+      "grad_norm": 0.8247978687286377,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 7805
+    },
+    {
+      "epoch": 0.07806,
+      "grad_norm": 0.881071925163269,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 7806
+    },
+    {
+      "epoch": 0.07807,
+      "grad_norm": 0.7734600305557251,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 7807
+    },
+    {
+      "epoch": 0.07808,
+      "grad_norm": 0.6630396246910095,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 7808
+    },
+    {
+      "epoch": 0.07809,
+      "grad_norm": 0.6479990482330322,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 7809
+    },
+    {
+      "epoch": 0.0781,
+      "grad_norm": 0.646216630935669,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 7810
+    },
+    {
+      "epoch": 0.07811,
+      "grad_norm": 0.6451820731163025,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 7811
+    },
+    {
+      "epoch": 0.07812,
+      "grad_norm": 0.7043893933296204,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 7812
+    },
+    {
+      "epoch": 0.07813,
+      "grad_norm": 0.7477477192878723,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 7813
+    },
+    {
+      "epoch": 0.07814,
+      "grad_norm": 0.6975932121276855,
+      "learning_rate": 0.003,
+      "loss": 4.0841,
+      "step": 7814
+    },
+    {
+      "epoch": 0.07815,
+      "grad_norm": 0.6276878714561462,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 7815
+    },
+    {
+      "epoch": 0.07816,
+      "grad_norm": 0.6076654195785522,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 7816
+    },
+    {
+      "epoch": 0.07817,
+      "grad_norm": 0.7753585577011108,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 7817
+    },
+    {
+      "epoch": 0.07818,
+      "grad_norm": 0.9763312935829163,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 7818
+    },
+    {
+      "epoch": 0.07819,
+      "grad_norm": 1.1070810556411743,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 7819
+    },
+    {
+      "epoch": 0.0782,
+      "grad_norm": 0.7738063335418701,
+      "learning_rate": 0.003,
+      "loss": 4.0632,
+      "step": 7820
+    },
+    {
+      "epoch": 0.07821,
+      "grad_norm": 0.6099690198898315,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 7821
+    },
+    {
+      "epoch": 0.07822,
+      "grad_norm": 0.6168242692947388,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 7822
+    },
+    {
+      "epoch": 0.07823,
+      "grad_norm": 0.5779638290405273,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 7823
+    },
+    {
+      "epoch": 0.07824,
+      "grad_norm": 0.5537580847740173,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 7824
+    },
+    {
+      "epoch": 0.07825,
+      "grad_norm": 0.5701668858528137,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 7825
+    },
+    {
+      "epoch": 0.07826,
+      "grad_norm": 0.5944071412086487,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 7826
+    },
+    {
+      "epoch": 0.07827,
+      "grad_norm": 0.5485585927963257,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 7827
+    },
+    {
+      "epoch": 0.07828,
+      "grad_norm": 0.6365675330162048,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 7828
+    },
+    {
+      "epoch": 0.07829,
+      "grad_norm": 0.7511796951293945,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 7829
+    },
+    {
+      "epoch": 0.0783,
+      "grad_norm": 0.8600240349769592,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 7830
+    },
+    {
+      "epoch": 0.07831,
+      "grad_norm": 0.8572528958320618,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 7831
+    },
+    {
+      "epoch": 0.07832,
+      "grad_norm": 0.7038732171058655,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 7832
+    },
+    {
+      "epoch": 0.07833,
+      "grad_norm": 0.7847913503646851,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 7833
+    },
+    {
+      "epoch": 0.07834,
+      "grad_norm": 0.860440731048584,
+      "learning_rate": 0.003,
+      "loss": 4.0936,
+      "step": 7834
+    },
+    {
+      "epoch": 0.07835,
+      "grad_norm": 0.8758241534233093,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 7835
+    },
+    {
+      "epoch": 0.07836,
+      "grad_norm": 0.8424054980278015,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 7836
+    },
+    {
+      "epoch": 0.07837,
+      "grad_norm": 0.7555618286132812,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 7837
+    },
+    {
+      "epoch": 0.07838,
+      "grad_norm": 0.7149111032485962,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 7838
+    },
+    {
+      "epoch": 0.07839,
+      "grad_norm": 0.6796376705169678,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 7839
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.750508725643158,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 7840
+    },
+    {
+      "epoch": 0.07841,
+      "grad_norm": 0.7040848731994629,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 7841
+    },
+    {
+      "epoch": 0.07842,
+      "grad_norm": 0.7227228283882141,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 7842
+    },
+    {
+      "epoch": 0.07843,
+      "grad_norm": 0.8472539782524109,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 7843
+    },
+    {
+      "epoch": 0.07844,
+      "grad_norm": 0.7912211418151855,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 7844
+    },
+    {
+      "epoch": 0.07845,
+      "grad_norm": 0.7454449534416199,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 7845
+    },
+    {
+      "epoch": 0.07846,
+      "grad_norm": 0.7517043352127075,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 7846
+    },
+    {
+      "epoch": 0.07847,
+      "grad_norm": 0.7840257287025452,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 7847
+    },
+    {
+      "epoch": 0.07848,
+      "grad_norm": 0.9238049983978271,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 7848
+    },
+    {
+      "epoch": 0.07849,
+      "grad_norm": 1.0469474792480469,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 7849
+    },
+    {
+      "epoch": 0.0785,
+      "grad_norm": 1.0180537700653076,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 7850
+    },
+    {
+      "epoch": 0.07851,
+      "grad_norm": 0.8952250480651855,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 7851
+    },
+    {
+      "epoch": 0.07852,
+      "grad_norm": 0.8214434385299683,
+      "learning_rate": 0.003,
+      "loss": 4.0796,
+      "step": 7852
+    },
+    {
+      "epoch": 0.07853,
+      "grad_norm": 0.8445104956626892,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 7853
+    },
+    {
+      "epoch": 0.07854,
+      "grad_norm": 0.9933289885520935,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 7854
+    },
+    {
+      "epoch": 0.07855,
+      "grad_norm": 1.1062895059585571,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 7855
+    },
+    {
+      "epoch": 0.07856,
+      "grad_norm": 0.9354727864265442,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 7856
+    },
+    {
+      "epoch": 0.07857,
+      "grad_norm": 0.9971434473991394,
+      "learning_rate": 0.003,
+      "loss": 4.0804,
+      "step": 7857
+    },
+    {
+      "epoch": 0.07858,
+      "grad_norm": 1.0467644929885864,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 7858
+    },
+    {
+      "epoch": 0.07859,
+      "grad_norm": 0.8451358675956726,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 7859
+    },
+    {
+      "epoch": 0.0786,
+      "grad_norm": 0.8124087452888489,
+      "learning_rate": 0.003,
+      "loss": 4.0532,
+      "step": 7860
+    },
+    {
+      "epoch": 0.07861,
+      "grad_norm": 0.8194620013237,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 7861
+    },
+    {
+      "epoch": 0.07862,
+      "grad_norm": 0.7896692156791687,
+      "learning_rate": 0.003,
+      "loss": 4.0919,
+      "step": 7862
+    },
+    {
+      "epoch": 0.07863,
+      "grad_norm": 0.7973822951316833,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 7863
+    },
+    {
+      "epoch": 0.07864,
+      "grad_norm": 0.745921790599823,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 7864
+    },
+    {
+      "epoch": 0.07865,
+      "grad_norm": 0.6143619418144226,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 7865
+    },
+    {
+      "epoch": 0.07866,
+      "grad_norm": 0.5146561861038208,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 7866
+    },
+    {
+      "epoch": 0.07867,
+      "grad_norm": 0.5350759029388428,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 7867
+    },
+    {
+      "epoch": 0.07868,
+      "grad_norm": 0.5663687586784363,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 7868
+    },
+    {
+      "epoch": 0.07869,
+      "grad_norm": 0.6298394799232483,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 7869
+    },
+    {
+      "epoch": 0.0787,
+      "grad_norm": 0.6779914498329163,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 7870
+    },
+    {
+      "epoch": 0.07871,
+      "grad_norm": 0.6579896211624146,
+      "learning_rate": 0.003,
+      "loss": 4.0737,
+      "step": 7871
+    },
+    {
+      "epoch": 0.07872,
+      "grad_norm": 0.5511500239372253,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 7872
+    },
+    {
+      "epoch": 0.07873,
+      "grad_norm": 0.5043216943740845,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 7873
+    },
+    {
+      "epoch": 0.07874,
+      "grad_norm": 0.46951040625572205,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 7874
+    },
+    {
+      "epoch": 0.07875,
+      "grad_norm": 0.490416556596756,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 7875
+    },
+    {
+      "epoch": 0.07876,
+      "grad_norm": 0.5800589323043823,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 7876
+    },
+    {
+      "epoch": 0.07877,
+      "grad_norm": 0.6350085139274597,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 7877
+    },
+    {
+      "epoch": 0.07878,
+      "grad_norm": 0.792594313621521,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 7878
+    },
+    {
+      "epoch": 0.07879,
+      "grad_norm": 1.0079691410064697,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 7879
+    },
+    {
+      "epoch": 0.0788,
+      "grad_norm": 1.099412441253662,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 7880
+    },
+    {
+      "epoch": 0.07881,
+      "grad_norm": 0.741230845451355,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 7881
+    },
+    {
+      "epoch": 0.07882,
+      "grad_norm": 0.6595082879066467,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 7882
+    },
+    {
+      "epoch": 0.07883,
+      "grad_norm": 0.7658863067626953,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 7883
+    },
+    {
+      "epoch": 0.07884,
+      "grad_norm": 0.826673686504364,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 7884
+    },
+    {
+      "epoch": 0.07885,
+      "grad_norm": 0.7689464092254639,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 7885
+    },
+    {
+      "epoch": 0.07886,
+      "grad_norm": 0.5779816508293152,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 7886
+    },
+    {
+      "epoch": 0.07887,
+      "grad_norm": 0.5393301844596863,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 7887
+    },
+    {
+      "epoch": 0.07888,
+      "grad_norm": 0.6773719787597656,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 7888
+    },
+    {
+      "epoch": 0.07889,
+      "grad_norm": 0.7781038284301758,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 7889
+    },
+    {
+      "epoch": 0.0789,
+      "grad_norm": 0.8151108026504517,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 7890
+    },
+    {
+      "epoch": 0.07891,
+      "grad_norm": 0.9264851212501526,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 7891
+    },
+    {
+      "epoch": 0.07892,
+      "grad_norm": 0.8555210828781128,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 7892
+    },
+    {
+      "epoch": 0.07893,
+      "grad_norm": 0.6806337833404541,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 7893
+    },
+    {
+      "epoch": 0.07894,
+      "grad_norm": 0.6658015251159668,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 7894
+    },
+    {
+      "epoch": 0.07895,
+      "grad_norm": 0.5995755791664124,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 7895
+    },
+    {
+      "epoch": 0.07896,
+      "grad_norm": 0.6623494625091553,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 7896
+    },
+    {
+      "epoch": 0.07897,
+      "grad_norm": 0.6996875405311584,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 7897
+    },
+    {
+      "epoch": 0.07898,
+      "grad_norm": 0.8288904428482056,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 7898
+    },
+    {
+      "epoch": 0.07899,
+      "grad_norm": 0.8660441040992737,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 7899
+    },
+    {
+      "epoch": 0.079,
+      "grad_norm": 0.9470987319946289,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 7900
+    },
+    {
+      "epoch": 0.07901,
+      "grad_norm": 0.9151023030281067,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 7901
+    },
+    {
+      "epoch": 0.07902,
+      "grad_norm": 0.8372184634208679,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 7902
+    },
+    {
+      "epoch": 0.07903,
+      "grad_norm": 0.6515103578567505,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 7903
+    },
+    {
+      "epoch": 0.07904,
+      "grad_norm": 0.7328464388847351,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 7904
+    },
+    {
+      "epoch": 0.07905,
+      "grad_norm": 0.8890835046768188,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 7905
+    },
+    {
+      "epoch": 0.07906,
+      "grad_norm": 1.1531867980957031,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 7906
+    },
+    {
+      "epoch": 0.07907,
+      "grad_norm": 1.1185576915740967,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 7907
+    },
+    {
+      "epoch": 0.07908,
+      "grad_norm": 0.7459601759910583,
+      "learning_rate": 0.003,
+      "loss": 4.0593,
+      "step": 7908
+    },
+    {
+      "epoch": 0.07909,
+      "grad_norm": 0.6596913933753967,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 7909
+    },
+    {
+      "epoch": 0.0791,
+      "grad_norm": 0.7314830422401428,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 7910
+    },
+    {
+      "epoch": 0.07911,
+      "grad_norm": 0.6769105792045593,
+      "learning_rate": 0.003,
+      "loss": 4.0584,
+      "step": 7911
+    },
+    {
+      "epoch": 0.07912,
+      "grad_norm": 0.6970905661582947,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 7912
+    },
+    {
+      "epoch": 0.07913,
+      "grad_norm": 0.6443745493888855,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 7913
+    },
+    {
+      "epoch": 0.07914,
+      "grad_norm": 0.5966974496841431,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 7914
+    },
+    {
+      "epoch": 0.07915,
+      "grad_norm": 0.5250338912010193,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 7915
+    },
+    {
+      "epoch": 0.07916,
+      "grad_norm": 0.5534821152687073,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 7916
+    },
+    {
+      "epoch": 0.07917,
+      "grad_norm": 0.4856780767440796,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 7917
+    },
+    {
+      "epoch": 0.07918,
+      "grad_norm": 0.4974648654460907,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 7918
+    },
+    {
+      "epoch": 0.07919,
+      "grad_norm": 0.5157517194747925,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 7919
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.5172123312950134,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 7920
+    },
+    {
+      "epoch": 0.07921,
+      "grad_norm": 0.4519246816635132,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 7921
+    },
+    {
+      "epoch": 0.07922,
+      "grad_norm": 0.48282045125961304,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 7922
+    },
+    {
+      "epoch": 0.07923,
+      "grad_norm": 0.5496918559074402,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 7923
+    },
+    {
+      "epoch": 0.07924,
+      "grad_norm": 0.6421642899513245,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 7924
+    },
+    {
+      "epoch": 0.07925,
+      "grad_norm": 0.7793181538581848,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 7925
+    },
+    {
+      "epoch": 0.07926,
+      "grad_norm": 0.9132375121116638,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 7926
+    },
+    {
+      "epoch": 0.07927,
+      "grad_norm": 0.9230619668960571,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 7927
+    },
+    {
+      "epoch": 0.07928,
+      "grad_norm": 0.7421871423721313,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 7928
+    },
+    {
+      "epoch": 0.07929,
+      "grad_norm": 0.6340886950492859,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 7929
+    },
+    {
+      "epoch": 0.0793,
+      "grad_norm": 0.6207582950592041,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 7930
+    },
+    {
+      "epoch": 0.07931,
+      "grad_norm": 0.698438286781311,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 7931
+    },
+    {
+      "epoch": 0.07932,
+      "grad_norm": 0.932792067527771,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 7932
+    },
+    {
+      "epoch": 0.07933,
+      "grad_norm": 0.8964979648590088,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 7933
+    },
+    {
+      "epoch": 0.07934,
+      "grad_norm": 0.8300673365592957,
+      "learning_rate": 0.003,
+      "loss": 4.0643,
+      "step": 7934
+    },
+    {
+      "epoch": 0.07935,
+      "grad_norm": 0.7360064387321472,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 7935
+    },
+    {
+      "epoch": 0.07936,
+      "grad_norm": 0.7895861864089966,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 7936
+    },
+    {
+      "epoch": 0.07937,
+      "grad_norm": 0.9133578538894653,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 7937
+    },
+    {
+      "epoch": 0.07938,
+      "grad_norm": 0.9982143044471741,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 7938
+    },
+    {
+      "epoch": 0.07939,
+      "grad_norm": 1.1373426914215088,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 7939
+    },
+    {
+      "epoch": 0.0794,
+      "grad_norm": 0.8364454507827759,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 7940
+    },
+    {
+      "epoch": 0.07941,
+      "grad_norm": 0.8427025675773621,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 7941
+    },
+    {
+      "epoch": 0.07942,
+      "grad_norm": 0.8358206152915955,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 7942
+    },
+    {
+      "epoch": 0.07943,
+      "grad_norm": 0.70419842004776,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 7943
+    },
+    {
+      "epoch": 0.07944,
+      "grad_norm": 0.6298567056655884,
+      "learning_rate": 0.003,
+      "loss": 4.072,
+      "step": 7944
+    },
+    {
+      "epoch": 0.07945,
+      "grad_norm": 0.7173011898994446,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 7945
+    },
+    {
+      "epoch": 0.07946,
+      "grad_norm": 0.8936134576797485,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 7946
+    },
+    {
+      "epoch": 0.07947,
+      "grad_norm": 0.9170552492141724,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 7947
+    },
+    {
+      "epoch": 0.07948,
+      "grad_norm": 0.9906176924705505,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 7948
+    },
+    {
+      "epoch": 0.07949,
+      "grad_norm": 0.9943937659263611,
+      "learning_rate": 0.003,
+      "loss": 4.0731,
+      "step": 7949
+    },
+    {
+      "epoch": 0.0795,
+      "grad_norm": 0.97963547706604,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 7950
+    },
+    {
+      "epoch": 0.07951,
+      "grad_norm": 0.949016273021698,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 7951
+    },
+    {
+      "epoch": 0.07952,
+      "grad_norm": 0.7665191292762756,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 7952
+    },
+    {
+      "epoch": 0.07953,
+      "grad_norm": 0.7401639819145203,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 7953
+    },
+    {
+      "epoch": 0.07954,
+      "grad_norm": 0.8128531575202942,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 7954
+    },
+    {
+      "epoch": 0.07955,
+      "grad_norm": 0.8641282320022583,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 7955
+    },
+    {
+      "epoch": 0.07956,
+      "grad_norm": 0.897680401802063,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 7956
+    },
+    {
+      "epoch": 0.07957,
+      "grad_norm": 0.8444660305976868,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 7957
+    },
+    {
+      "epoch": 0.07958,
+      "grad_norm": 0.890282392501831,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 7958
+    },
+    {
+      "epoch": 0.07959,
+      "grad_norm": 0.8856026530265808,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 7959
+    },
+    {
+      "epoch": 0.0796,
+      "grad_norm": 0.792759358882904,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 7960
+    },
+    {
+      "epoch": 0.07961,
+      "grad_norm": 0.8642300963401794,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 7961
+    },
+    {
+      "epoch": 0.07962,
+      "grad_norm": 0.886371374130249,
+      "learning_rate": 0.003,
+      "loss": 4.0704,
+      "step": 7962
+    },
+    {
+      "epoch": 0.07963,
+      "grad_norm": 0.8674740195274353,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 7963
+    },
+    {
+      "epoch": 0.07964,
+      "grad_norm": 0.9062060713768005,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 7964
+    },
+    {
+      "epoch": 0.07965,
+      "grad_norm": 1.045652151107788,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 7965
+    },
+    {
+      "epoch": 0.07966,
+      "grad_norm": 0.8896709680557251,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 7966
+    },
+    {
+      "epoch": 0.07967,
+      "grad_norm": 0.8535459041595459,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 7967
+    },
+    {
+      "epoch": 0.07968,
+      "grad_norm": 0.7971822023391724,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 7968
+    },
+    {
+      "epoch": 0.07969,
+      "grad_norm": 0.7176892161369324,
+      "learning_rate": 0.003,
+      "loss": 4.0756,
+      "step": 7969
+    },
+    {
+      "epoch": 0.0797,
+      "grad_norm": 0.619379997253418,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 7970
+    },
+    {
+      "epoch": 0.07971,
+      "grad_norm": 0.7199423909187317,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 7971
+    },
+    {
+      "epoch": 0.07972,
+      "grad_norm": 0.7708553671836853,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 7972
+    },
+    {
+      "epoch": 0.07973,
+      "grad_norm": 0.8473473191261292,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 7973
+    },
+    {
+      "epoch": 0.07974,
+      "grad_norm": 0.8833823204040527,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 7974
+    },
+    {
+      "epoch": 0.07975,
+      "grad_norm": 0.8675002455711365,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 7975
+    },
+    {
+      "epoch": 0.07976,
+      "grad_norm": 0.7916547656059265,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 7976
+    },
+    {
+      "epoch": 0.07977,
+      "grad_norm": 0.5385777354240417,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 7977
+    },
+    {
+      "epoch": 0.07978,
+      "grad_norm": 0.5118996500968933,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 7978
+    },
+    {
+      "epoch": 0.07979,
+      "grad_norm": 0.5327662229537964,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 7979
+    },
+    {
+      "epoch": 0.0798,
+      "grad_norm": 0.5415238738059998,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 7980
+    },
+    {
+      "epoch": 0.07981,
+      "grad_norm": 0.5292704105377197,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 7981
+    },
+    {
+      "epoch": 0.07982,
+      "grad_norm": 0.5772751569747925,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 7982
+    },
+    {
+      "epoch": 0.07983,
+      "grad_norm": 0.6106157302856445,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 7983
+    },
+    {
+      "epoch": 0.07984,
+      "grad_norm": 0.7016780376434326,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 7984
+    },
+    {
+      "epoch": 0.07985,
+      "grad_norm": 0.841801643371582,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 7985
+    },
+    {
+      "epoch": 0.07986,
+      "grad_norm": 0.817077100276947,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 7986
+    },
+    {
+      "epoch": 0.07987,
+      "grad_norm": 0.6411316394805908,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 7987
+    },
+    {
+      "epoch": 0.07988,
+      "grad_norm": 0.6220775246620178,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 7988
+    },
+    {
+      "epoch": 0.07989,
+      "grad_norm": 0.5362831354141235,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 7989
+    },
+    {
+      "epoch": 0.0799,
+      "grad_norm": 0.5406444072723389,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 7990
+    },
+    {
+      "epoch": 0.07991,
+      "grad_norm": 0.5161384344100952,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 7991
+    },
+    {
+      "epoch": 0.07992,
+      "grad_norm": 0.45497363805770874,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 7992
+    },
+    {
+      "epoch": 0.07993,
+      "grad_norm": 0.4724503755569458,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 7993
+    },
+    {
+      "epoch": 0.07994,
+      "grad_norm": 0.4843752980232239,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 7994
+    },
+    {
+      "epoch": 0.07995,
+      "grad_norm": 0.5137373805046082,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 7995
+    },
+    {
+      "epoch": 0.07996,
+      "grad_norm": 0.5532665252685547,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 7996
+    },
+    {
+      "epoch": 0.07997,
+      "grad_norm": 0.6154145002365112,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 7997
+    },
+    {
+      "epoch": 0.07998,
+      "grad_norm": 0.7569679021835327,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 7998
+    },
+    {
+      "epoch": 0.07999,
+      "grad_norm": 0.9815395474433899,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 7999
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.1601155996322632,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 8000
+    },
+    {
+      "epoch": 0.08001,
+      "grad_norm": 0.6949255466461182,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 8001
+    },
+    {
+      "epoch": 0.08002,
+      "grad_norm": 0.6404258012771606,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 8002
+    },
+    {
+      "epoch": 0.08003,
+      "grad_norm": 0.6366399526596069,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 8003
+    },
+    {
+      "epoch": 0.08004,
+      "grad_norm": 0.7067066431045532,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 8004
+    },
+    {
+      "epoch": 0.08005,
+      "grad_norm": 0.6715254783630371,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 8005
+    },
+    {
+      "epoch": 0.08006,
+      "grad_norm": 0.6775758266448975,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 8006
+    },
+    {
+      "epoch": 0.08007,
+      "grad_norm": 0.7099198698997498,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 8007
+    },
+    {
+      "epoch": 0.08008,
+      "grad_norm": 0.8098099827766418,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 8008
+    },
+    {
+      "epoch": 0.08009,
+      "grad_norm": 0.9449841976165771,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 8009
+    },
+    {
+      "epoch": 0.0801,
+      "grad_norm": 1.0425149202346802,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 8010
+    },
+    {
+      "epoch": 0.08011,
+      "grad_norm": 0.9982419013977051,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 8011
+    },
+    {
+      "epoch": 0.08012,
+      "grad_norm": 0.8389059901237488,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 8012
+    },
+    {
+      "epoch": 0.08013,
+      "grad_norm": 0.8514482378959656,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 8013
+    },
+    {
+      "epoch": 0.08014,
+      "grad_norm": 0.8291131258010864,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 8014
+    },
+    {
+      "epoch": 0.08015,
+      "grad_norm": 0.8924878835678101,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 8015
+    },
+    {
+      "epoch": 0.08016,
+      "grad_norm": 0.8806622624397278,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 8016
+    },
+    {
+      "epoch": 0.08017,
+      "grad_norm": 0.7676506638526917,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 8017
+    },
+    {
+      "epoch": 0.08018,
+      "grad_norm": 0.6425415873527527,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 8018
+    },
+    {
+      "epoch": 0.08019,
+      "grad_norm": 0.7521240711212158,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 8019
+    },
+    {
+      "epoch": 0.0802,
+      "grad_norm": 1.029622197151184,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 8020
+    },
+    {
+      "epoch": 0.08021,
+      "grad_norm": 1.132253885269165,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 8021
+    },
+    {
+      "epoch": 0.08022,
+      "grad_norm": 0.6701259016990662,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 8022
+    },
+    {
+      "epoch": 0.08023,
+      "grad_norm": 0.6120415925979614,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 8023
+    },
+    {
+      "epoch": 0.08024,
+      "grad_norm": 0.7347866892814636,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 8024
+    },
+    {
+      "epoch": 0.08025,
+      "grad_norm": 0.6548226475715637,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 8025
+    },
+    {
+      "epoch": 0.08026,
+      "grad_norm": 0.5551509857177734,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 8026
+    },
+    {
+      "epoch": 0.08027,
+      "grad_norm": 0.5772050023078918,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 8027
+    },
+    {
+      "epoch": 0.08028,
+      "grad_norm": 0.5994802117347717,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 8028
+    },
+    {
+      "epoch": 0.08029,
+      "grad_norm": 0.654687762260437,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 8029
+    },
+    {
+      "epoch": 0.0803,
+      "grad_norm": 0.8716540932655334,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 8030
+    },
+    {
+      "epoch": 0.08031,
+      "grad_norm": 1.1394952535629272,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 8031
+    },
+    {
+      "epoch": 0.08032,
+      "grad_norm": 0.8767929673194885,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 8032
+    },
+    {
+      "epoch": 0.08033,
+      "grad_norm": 0.6166559457778931,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 8033
+    },
+    {
+      "epoch": 0.08034,
+      "grad_norm": 0.5533866882324219,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 8034
+    },
+    {
+      "epoch": 0.08035,
+      "grad_norm": 0.7748246788978577,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 8035
+    },
+    {
+      "epoch": 0.08036,
+      "grad_norm": 0.8875789642333984,
+      "learning_rate": 0.003,
+      "loss": 4.0789,
+      "step": 8036
+    },
+    {
+      "epoch": 0.08037,
+      "grad_norm": 0.893352210521698,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 8037
+    },
+    {
+      "epoch": 0.08038,
+      "grad_norm": 0.7569796442985535,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 8038
+    },
+    {
+      "epoch": 0.08039,
+      "grad_norm": 0.7512573003768921,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 8039
+    },
+    {
+      "epoch": 0.0804,
+      "grad_norm": 0.7568359971046448,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 8040
+    },
+    {
+      "epoch": 0.08041,
+      "grad_norm": 0.7010595202445984,
+      "learning_rate": 0.003,
+      "loss": 4.0584,
+      "step": 8041
+    },
+    {
+      "epoch": 0.08042,
+      "grad_norm": 0.7021541595458984,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 8042
+    },
+    {
+      "epoch": 0.08043,
+      "grad_norm": 0.7211666703224182,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 8043
+    },
+    {
+      "epoch": 0.08044,
+      "grad_norm": 0.669906497001648,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 8044
+    },
+    {
+      "epoch": 0.08045,
+      "grad_norm": 0.6533012390136719,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 8045
+    },
+    {
+      "epoch": 0.08046,
+      "grad_norm": 0.5999561548233032,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 8046
+    },
+    {
+      "epoch": 0.08047,
+      "grad_norm": 0.7110080122947693,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 8047
+    },
+    {
+      "epoch": 0.08048,
+      "grad_norm": 0.7337324619293213,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 8048
+    },
+    {
+      "epoch": 0.08049,
+      "grad_norm": 0.7428856492042542,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 8049
+    },
+    {
+      "epoch": 0.0805,
+      "grad_norm": 0.683529794216156,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 8050
+    },
+    {
+      "epoch": 0.08051,
+      "grad_norm": 0.5760079622268677,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 8051
+    },
+    {
+      "epoch": 0.08052,
+      "grad_norm": 0.6353437304496765,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 8052
+    },
+    {
+      "epoch": 0.08053,
+      "grad_norm": 0.7857322096824646,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 8053
+    },
+    {
+      "epoch": 0.08054,
+      "grad_norm": 0.9808282852172852,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 8054
+    },
+    {
+      "epoch": 0.08055,
+      "grad_norm": 1.1264392137527466,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 8055
+    },
+    {
+      "epoch": 0.08056,
+      "grad_norm": 0.8243107795715332,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 8056
+    },
+    {
+      "epoch": 0.08057,
+      "grad_norm": 0.7461767196655273,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 8057
+    },
+    {
+      "epoch": 0.08058,
+      "grad_norm": 0.8168344497680664,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 8058
+    },
+    {
+      "epoch": 0.08059,
+      "grad_norm": 0.8754106760025024,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 8059
+    },
+    {
+      "epoch": 0.0806,
+      "grad_norm": 0.9352402687072754,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 8060
+    },
+    {
+      "epoch": 0.08061,
+      "grad_norm": 0.9238023161888123,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 8061
+    },
+    {
+      "epoch": 0.08062,
+      "grad_norm": 0.8781481385231018,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 8062
+    },
+    {
+      "epoch": 0.08063,
+      "grad_norm": 0.9294982552528381,
+      "learning_rate": 0.003,
+      "loss": 4.0841,
+      "step": 8063
+    },
+    {
+      "epoch": 0.08064,
+      "grad_norm": 0.9283519983291626,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 8064
+    },
+    {
+      "epoch": 0.08065,
+      "grad_norm": 0.7961224913597107,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 8065
+    },
+    {
+      "epoch": 0.08066,
+      "grad_norm": 0.7206274271011353,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 8066
+    },
+    {
+      "epoch": 0.08067,
+      "grad_norm": 0.7221958041191101,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 8067
+    },
+    {
+      "epoch": 0.08068,
+      "grad_norm": 0.687168300151825,
+      "learning_rate": 0.003,
+      "loss": 4.0562,
+      "step": 8068
+    },
+    {
+      "epoch": 0.08069,
+      "grad_norm": 0.6589773297309875,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 8069
+    },
+    {
+      "epoch": 0.0807,
+      "grad_norm": 0.7254470586776733,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 8070
+    },
+    {
+      "epoch": 0.08071,
+      "grad_norm": 0.7805096507072449,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 8071
+    },
+    {
+      "epoch": 0.08072,
+      "grad_norm": 0.7921481728553772,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 8072
+    },
+    {
+      "epoch": 0.08073,
+      "grad_norm": 0.7291877865791321,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 8073
+    },
+    {
+      "epoch": 0.08074,
+      "grad_norm": 0.6952309012413025,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 8074
+    },
+    {
+      "epoch": 0.08075,
+      "grad_norm": 0.6951455473899841,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 8075
+    },
+    {
+      "epoch": 0.08076,
+      "grad_norm": 0.7872135043144226,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 8076
+    },
+    {
+      "epoch": 0.08077,
+      "grad_norm": 0.8542959094047546,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 8077
+    },
+    {
+      "epoch": 0.08078,
+      "grad_norm": 1.0374029874801636,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 8078
+    },
+    {
+      "epoch": 0.08079,
+      "grad_norm": 1.025814175605774,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 8079
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.65848308801651,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 8080
+    },
+    {
+      "epoch": 0.08081,
+      "grad_norm": 0.5402346849441528,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 8081
+    },
+    {
+      "epoch": 0.08082,
+      "grad_norm": 0.6986470222473145,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 8082
+    },
+    {
+      "epoch": 0.08083,
+      "grad_norm": 0.8801277875900269,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 8083
+    },
+    {
+      "epoch": 0.08084,
+      "grad_norm": 0.9807546734809875,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 8084
+    },
+    {
+      "epoch": 0.08085,
+      "grad_norm": 0.9804517030715942,
+      "learning_rate": 0.003,
+      "loss": 4.0734,
+      "step": 8085
+    },
+    {
+      "epoch": 0.08086,
+      "grad_norm": 0.8663859963417053,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 8086
+    },
+    {
+      "epoch": 0.08087,
+      "grad_norm": 0.8679655194282532,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 8087
+    },
+    {
+      "epoch": 0.08088,
+      "grad_norm": 0.8380336165428162,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 8088
+    },
+    {
+      "epoch": 0.08089,
+      "grad_norm": 0.8112128973007202,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 8089
+    },
+    {
+      "epoch": 0.0809,
+      "grad_norm": 0.8074503540992737,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 8090
+    },
+    {
+      "epoch": 0.08091,
+      "grad_norm": 0.8186045289039612,
+      "learning_rate": 0.003,
+      "loss": 4.0701,
+      "step": 8091
+    },
+    {
+      "epoch": 0.08092,
+      "grad_norm": 0.777157723903656,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 8092
+    },
+    {
+      "epoch": 0.08093,
+      "grad_norm": 0.6288996338844299,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 8093
+    },
+    {
+      "epoch": 0.08094,
+      "grad_norm": 0.6444592475891113,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 8094
+    },
+    {
+      "epoch": 0.08095,
+      "grad_norm": 0.6421931385993958,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 8095
+    },
+    {
+      "epoch": 0.08096,
+      "grad_norm": 0.6471747756004333,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 8096
+    },
+    {
+      "epoch": 0.08097,
+      "grad_norm": 0.6687425971031189,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 8097
+    },
+    {
+      "epoch": 0.08098,
+      "grad_norm": 0.6552982330322266,
+      "learning_rate": 0.003,
+      "loss": 4.0704,
+      "step": 8098
+    },
+    {
+      "epoch": 0.08099,
+      "grad_norm": 0.6883662939071655,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 8099
+    },
+    {
+      "epoch": 0.081,
+      "grad_norm": 0.7512494921684265,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 8100
+    },
+    {
+      "epoch": 0.08101,
+      "grad_norm": 0.7864393591880798,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 8101
+    },
+    {
+      "epoch": 0.08102,
+      "grad_norm": 0.7945188879966736,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 8102
+    },
+    {
+      "epoch": 0.08103,
+      "grad_norm": 0.8009957671165466,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 8103
+    },
+    {
+      "epoch": 0.08104,
+      "grad_norm": 0.7752313613891602,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 8104
+    },
+    {
+      "epoch": 0.08105,
+      "grad_norm": 0.7074246406555176,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 8105
+    },
+    {
+      "epoch": 0.08106,
+      "grad_norm": 0.6730649471282959,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 8106
+    },
+    {
+      "epoch": 0.08107,
+      "grad_norm": 0.597204327583313,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 8107
+    },
+    {
+      "epoch": 0.08108,
+      "grad_norm": 0.5866988301277161,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 8108
+    },
+    {
+      "epoch": 0.08109,
+      "grad_norm": 0.541022777557373,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 8109
+    },
+    {
+      "epoch": 0.0811,
+      "grad_norm": 0.5541371703147888,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 8110
+    },
+    {
+      "epoch": 0.08111,
+      "grad_norm": 0.607240617275238,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 8111
+    },
+    {
+      "epoch": 0.08112,
+      "grad_norm": 0.6710116267204285,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 8112
+    },
+    {
+      "epoch": 0.08113,
+      "grad_norm": 0.7832820415496826,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 8113
+    },
+    {
+      "epoch": 0.08114,
+      "grad_norm": 0.9054648876190186,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 8114
+    },
+    {
+      "epoch": 0.08115,
+      "grad_norm": 0.9504172205924988,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 8115
+    },
+    {
+      "epoch": 0.08116,
+      "grad_norm": 0.819379448890686,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 8116
+    },
+    {
+      "epoch": 0.08117,
+      "grad_norm": 0.5076575875282288,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 8117
+    },
+    {
+      "epoch": 0.08118,
+      "grad_norm": 0.5695910453796387,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 8118
+    },
+    {
+      "epoch": 0.08119,
+      "grad_norm": 0.6920680999755859,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 8119
+    },
+    {
+      "epoch": 0.0812,
+      "grad_norm": 0.841937780380249,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 8120
+    },
+    {
+      "epoch": 0.08121,
+      "grad_norm": 0.9552656412124634,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 8121
+    },
+    {
+      "epoch": 0.08122,
+      "grad_norm": 0.8833564519882202,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 8122
+    },
+    {
+      "epoch": 0.08123,
+      "grad_norm": 0.8226878046989441,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 8123
+    },
+    {
+      "epoch": 0.08124,
+      "grad_norm": 0.8926153779029846,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 8124
+    },
+    {
+      "epoch": 0.08125,
+      "grad_norm": 0.8978990316390991,
+      "learning_rate": 0.003,
+      "loss": 4.075,
+      "step": 8125
+    },
+    {
+      "epoch": 0.08126,
+      "grad_norm": 0.9611136317253113,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 8126
+    },
+    {
+      "epoch": 0.08127,
+      "grad_norm": 0.7953163981437683,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 8127
+    },
+    {
+      "epoch": 0.08128,
+      "grad_norm": 0.6702011823654175,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 8128
+    },
+    {
+      "epoch": 0.08129,
+      "grad_norm": 0.6014225482940674,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 8129
+    },
+    {
+      "epoch": 0.0813,
+      "grad_norm": 0.6430684328079224,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 8130
+    },
+    {
+      "epoch": 0.08131,
+      "grad_norm": 0.5604881048202515,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 8131
+    },
+    {
+      "epoch": 0.08132,
+      "grad_norm": 0.6205559372901917,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 8132
+    },
+    {
+      "epoch": 0.08133,
+      "grad_norm": 0.5798535943031311,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 8133
+    },
+    {
+      "epoch": 0.08134,
+      "grad_norm": 0.5685412287712097,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 8134
+    },
+    {
+      "epoch": 0.08135,
+      "grad_norm": 0.6919317841529846,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 8135
+    },
+    {
+      "epoch": 0.08136,
+      "grad_norm": 0.7103545069694519,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 8136
+    },
+    {
+      "epoch": 0.08137,
+      "grad_norm": 0.7415788769721985,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 8137
+    },
+    {
+      "epoch": 0.08138,
+      "grad_norm": 0.9382996559143066,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 8138
+    },
+    {
+      "epoch": 0.08139,
+      "grad_norm": 1.084199070930481,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 8139
+    },
+    {
+      "epoch": 0.0814,
+      "grad_norm": 0.9915632605552673,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 8140
+    },
+    {
+      "epoch": 0.08141,
+      "grad_norm": 0.8965795040130615,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 8141
+    },
+    {
+      "epoch": 0.08142,
+      "grad_norm": 0.8707264065742493,
+      "learning_rate": 0.003,
+      "loss": 4.0649,
+      "step": 8142
+    },
+    {
+      "epoch": 0.08143,
+      "grad_norm": 0.8575231432914734,
+      "learning_rate": 0.003,
+      "loss": 4.0599,
+      "step": 8143
+    },
+    {
+      "epoch": 0.08144,
+      "grad_norm": 0.7178121209144592,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 8144
+    },
+    {
+      "epoch": 0.08145,
+      "grad_norm": 0.7352826595306396,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 8145
+    },
+    {
+      "epoch": 0.08146,
+      "grad_norm": 0.806141197681427,
+      "learning_rate": 0.003,
+      "loss": 4.0683,
+      "step": 8146
+    },
+    {
+      "epoch": 0.08147,
+      "grad_norm": 0.8929685354232788,
+      "learning_rate": 0.003,
+      "loss": 4.0693,
+      "step": 8147
+    },
+    {
+      "epoch": 0.08148,
+      "grad_norm": 0.9094710350036621,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 8148
+    },
+    {
+      "epoch": 0.08149,
+      "grad_norm": 0.9839245676994324,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 8149
+    },
+    {
+      "epoch": 0.0815,
+      "grad_norm": 0.9399632811546326,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 8150
+    },
+    {
+      "epoch": 0.08151,
+      "grad_norm": 0.9006126523017883,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 8151
+    },
+    {
+      "epoch": 0.08152,
+      "grad_norm": 0.893135666847229,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 8152
+    },
+    {
+      "epoch": 0.08153,
+      "grad_norm": 0.8419569730758667,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 8153
+    },
+    {
+      "epoch": 0.08154,
+      "grad_norm": 0.675416886806488,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 8154
+    },
+    {
+      "epoch": 0.08155,
+      "grad_norm": 0.6207221746444702,
+      "learning_rate": 0.003,
+      "loss": 4.0611,
+      "step": 8155
+    },
+    {
+      "epoch": 0.08156,
+      "grad_norm": 0.6687820553779602,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 8156
+    },
+    {
+      "epoch": 0.08157,
+      "grad_norm": 0.7092146277427673,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 8157
+    },
+    {
+      "epoch": 0.08158,
+      "grad_norm": 0.8258717060089111,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 8158
+    },
+    {
+      "epoch": 0.08159,
+      "grad_norm": 0.9466468095779419,
+      "learning_rate": 0.003,
+      "loss": 4.0754,
+      "step": 8159
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.951337993144989,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 8160
+    },
+    {
+      "epoch": 0.08161,
+      "grad_norm": 0.813227653503418,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 8161
+    },
+    {
+      "epoch": 0.08162,
+      "grad_norm": 0.7988176941871643,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 8162
+    },
+    {
+      "epoch": 0.08163,
+      "grad_norm": 0.7750150561332703,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 8163
+    },
+    {
+      "epoch": 0.08164,
+      "grad_norm": 0.7018991112709045,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 8164
+    },
+    {
+      "epoch": 0.08165,
+      "grad_norm": 0.7162664532661438,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 8165
+    },
+    {
+      "epoch": 0.08166,
+      "grad_norm": 0.7163441777229309,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 8166
+    },
+    {
+      "epoch": 0.08167,
+      "grad_norm": 0.6863809823989868,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 8167
+    },
+    {
+      "epoch": 0.08168,
+      "grad_norm": 0.620455265045166,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 8168
+    },
+    {
+      "epoch": 0.08169,
+      "grad_norm": 0.6343228220939636,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 8169
+    },
+    {
+      "epoch": 0.0817,
+      "grad_norm": 0.6355931162834167,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 8170
+    },
+    {
+      "epoch": 0.08171,
+      "grad_norm": 0.6743160486221313,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 8171
+    },
+    {
+      "epoch": 0.08172,
+      "grad_norm": 0.7267938256263733,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 8172
+    },
+    {
+      "epoch": 0.08173,
+      "grad_norm": 0.8629770278930664,
+      "learning_rate": 0.003,
+      "loss": 4.0793,
+      "step": 8173
+    },
+    {
+      "epoch": 0.08174,
+      "grad_norm": 0.9949284195899963,
+      "learning_rate": 0.003,
+      "loss": 4.0747,
+      "step": 8174
+    },
+    {
+      "epoch": 0.08175,
+      "grad_norm": 1.0875071287155151,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 8175
+    },
+    {
+      "epoch": 0.08176,
+      "grad_norm": 0.7273131608963013,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 8176
+    },
+    {
+      "epoch": 0.08177,
+      "grad_norm": 0.5219792723655701,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 8177
+    },
+    {
+      "epoch": 0.08178,
+      "grad_norm": 0.5456799864768982,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 8178
+    },
+    {
+      "epoch": 0.08179,
+      "grad_norm": 0.5848241448402405,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 8179
+    },
+    {
+      "epoch": 0.0818,
+      "grad_norm": 0.5659416317939758,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 8180
+    },
+    {
+      "epoch": 0.08181,
+      "grad_norm": 0.5623244047164917,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 8181
+    },
+    {
+      "epoch": 0.08182,
+      "grad_norm": 0.5621029138565063,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 8182
+    },
+    {
+      "epoch": 0.08183,
+      "grad_norm": 0.6393095254898071,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 8183
+    },
+    {
+      "epoch": 0.08184,
+      "grad_norm": 0.776958167552948,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 8184
+    },
+    {
+      "epoch": 0.08185,
+      "grad_norm": 0.8038877248764038,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 8185
+    },
+    {
+      "epoch": 0.08186,
+      "grad_norm": 0.7159028649330139,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 8186
+    },
+    {
+      "epoch": 0.08187,
+      "grad_norm": 0.6306201219558716,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 8187
+    },
+    {
+      "epoch": 0.08188,
+      "grad_norm": 0.6069129109382629,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 8188
+    },
+    {
+      "epoch": 0.08189,
+      "grad_norm": 0.7105355262756348,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 8189
+    },
+    {
+      "epoch": 0.0819,
+      "grad_norm": 0.8592032790184021,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 8190
+    },
+    {
+      "epoch": 0.08191,
+      "grad_norm": 1.0450857877731323,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 8191
+    },
+    {
+      "epoch": 0.08192,
+      "grad_norm": 0.8729857802391052,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 8192
+    },
+    {
+      "epoch": 0.08193,
+      "grad_norm": 0.6815803050994873,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 8193
+    },
+    {
+      "epoch": 0.08194,
+      "grad_norm": 0.7523767352104187,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 8194
+    },
+    {
+      "epoch": 0.08195,
+      "grad_norm": 0.9143834710121155,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 8195
+    },
+    {
+      "epoch": 0.08196,
+      "grad_norm": 1.0875829458236694,
+      "learning_rate": 0.003,
+      "loss": 4.0652,
+      "step": 8196
+    },
+    {
+      "epoch": 0.08197,
+      "grad_norm": 0.944886326789856,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 8197
+    },
+    {
+      "epoch": 0.08198,
+      "grad_norm": 0.9827715754508972,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 8198
+    },
+    {
+      "epoch": 0.08199,
+      "grad_norm": 1.0734338760375977,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 8199
+    },
+    {
+      "epoch": 0.082,
+      "grad_norm": 0.953004777431488,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 8200
+    },
+    {
+      "epoch": 0.08201,
+      "grad_norm": 0.9488988518714905,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 8201
+    },
+    {
+      "epoch": 0.08202,
+      "grad_norm": 0.9760498404502869,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 8202
+    },
+    {
+      "epoch": 0.08203,
+      "grad_norm": 1.027105689048767,
+      "learning_rate": 0.003,
+      "loss": 4.0664,
+      "step": 8203
+    },
+    {
+      "epoch": 0.08204,
+      "grad_norm": 1.0658153295516968,
+      "learning_rate": 0.003,
+      "loss": 4.0635,
+      "step": 8204
+    },
+    {
+      "epoch": 0.08205,
+      "grad_norm": 1.0252691507339478,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 8205
+    },
+    {
+      "epoch": 0.08206,
+      "grad_norm": 0.7699146866798401,
+      "learning_rate": 0.003,
+      "loss": 4.0854,
+      "step": 8206
+    },
+    {
+      "epoch": 0.08207,
+      "grad_norm": 0.7636826634407043,
+      "learning_rate": 0.003,
+      "loss": 4.0566,
+      "step": 8207
+    },
+    {
+      "epoch": 0.08208,
+      "grad_norm": 0.8411189913749695,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 8208
+    },
+    {
+      "epoch": 0.08209,
+      "grad_norm": 0.9034724831581116,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 8209
+    },
+    {
+      "epoch": 0.0821,
+      "grad_norm": 0.9367898106575012,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 8210
+    },
+    {
+      "epoch": 0.08211,
+      "grad_norm": 0.7969280481338501,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 8211
+    },
+    {
+      "epoch": 0.08212,
+      "grad_norm": 0.7595282196998596,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 8212
+    },
+    {
+      "epoch": 0.08213,
+      "grad_norm": 0.6862097382545471,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 8213
+    },
+    {
+      "epoch": 0.08214,
+      "grad_norm": 0.6680603623390198,
+      "learning_rate": 0.003,
+      "loss": 4.072,
+      "step": 8214
+    },
+    {
+      "epoch": 0.08215,
+      "grad_norm": 0.6800533533096313,
+      "learning_rate": 0.003,
+      "loss": 4.0736,
+      "step": 8215
+    },
+    {
+      "epoch": 0.08216,
+      "grad_norm": 0.673785388469696,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 8216
+    },
+    {
+      "epoch": 0.08217,
+      "grad_norm": 0.5008993148803711,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 8217
+    },
+    {
+      "epoch": 0.08218,
+      "grad_norm": 0.562592089176178,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 8218
+    },
+    {
+      "epoch": 0.08219,
+      "grad_norm": 0.49677103757858276,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 8219
+    },
+    {
+      "epoch": 0.0822,
+      "grad_norm": 0.46024832129478455,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 8220
+    },
+    {
+      "epoch": 0.08221,
+      "grad_norm": 0.6039679050445557,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 8221
+    },
+    {
+      "epoch": 0.08222,
+      "grad_norm": 0.7332128286361694,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 8222
+    },
+    {
+      "epoch": 0.08223,
+      "grad_norm": 0.9698393940925598,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 8223
+    },
+    {
+      "epoch": 0.08224,
+      "grad_norm": 1.07571280002594,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 8224
+    },
+    {
+      "epoch": 0.08225,
+      "grad_norm": 0.7147070169448853,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 8225
+    },
+    {
+      "epoch": 0.08226,
+      "grad_norm": 0.6036560535430908,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 8226
+    },
+    {
+      "epoch": 0.08227,
+      "grad_norm": 0.6823557615280151,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 8227
+    },
+    {
+      "epoch": 0.08228,
+      "grad_norm": 0.6876879930496216,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 8228
+    },
+    {
+      "epoch": 0.08229,
+      "grad_norm": 0.6408465504646301,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 8229
+    },
+    {
+      "epoch": 0.0823,
+      "grad_norm": 0.5930390357971191,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 8230
+    },
+    {
+      "epoch": 0.08231,
+      "grad_norm": 0.6377511024475098,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 8231
+    },
+    {
+      "epoch": 0.08232,
+      "grad_norm": 0.6173083782196045,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 8232
+    },
+    {
+      "epoch": 0.08233,
+      "grad_norm": 0.6037210822105408,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 8233
+    },
+    {
+      "epoch": 0.08234,
+      "grad_norm": 0.6184289455413818,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 8234
+    },
+    {
+      "epoch": 0.08235,
+      "grad_norm": 0.6736436486244202,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 8235
+    },
+    {
+      "epoch": 0.08236,
+      "grad_norm": 0.7247467637062073,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 8236
+    },
+    {
+      "epoch": 0.08237,
+      "grad_norm": 0.7914360761642456,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 8237
+    },
+    {
+      "epoch": 0.08238,
+      "grad_norm": 0.8948922157287598,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 8238
+    },
+    {
+      "epoch": 0.08239,
+      "grad_norm": 0.8952855467796326,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 8239
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.710752546787262,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 8240
+    },
+    {
+      "epoch": 0.08241,
+      "grad_norm": 0.7061362266540527,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 8241
+    },
+    {
+      "epoch": 0.08242,
+      "grad_norm": 0.755864679813385,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 8242
+    },
+    {
+      "epoch": 0.08243,
+      "grad_norm": 0.7905144095420837,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 8243
+    },
+    {
+      "epoch": 0.08244,
+      "grad_norm": 0.743241548538208,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 8244
+    },
+    {
+      "epoch": 0.08245,
+      "grad_norm": 0.7604659199714661,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 8245
+    },
+    {
+      "epoch": 0.08246,
+      "grad_norm": 0.6285468935966492,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 8246
+    },
+    {
+      "epoch": 0.08247,
+      "grad_norm": 0.6213167905807495,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 8247
+    },
+    {
+      "epoch": 0.08248,
+      "grad_norm": 0.6580025553703308,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 8248
+    },
+    {
+      "epoch": 0.08249,
+      "grad_norm": 0.7325441241264343,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 8249
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.698697566986084,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 8250
+    },
+    {
+      "epoch": 0.08251,
+      "grad_norm": 0.7475115656852722,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 8251
+    },
+    {
+      "epoch": 0.08252,
+      "grad_norm": 0.888262152671814,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 8252
+    },
+    {
+      "epoch": 0.08253,
+      "grad_norm": 0.888322651386261,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 8253
+    },
+    {
+      "epoch": 0.08254,
+      "grad_norm": 0.9198958873748779,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 8254
+    },
+    {
+      "epoch": 0.08255,
+      "grad_norm": 0.8950197100639343,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 8255
+    },
+    {
+      "epoch": 0.08256,
+      "grad_norm": 0.9500605463981628,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 8256
+    },
+    {
+      "epoch": 0.08257,
+      "grad_norm": 0.9830167293548584,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 8257
+    },
+    {
+      "epoch": 0.08258,
+      "grad_norm": 0.9901215434074402,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 8258
+    },
+    {
+      "epoch": 0.08259,
+      "grad_norm": 0.8542518019676208,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 8259
+    },
+    {
+      "epoch": 0.0826,
+      "grad_norm": 0.7408918738365173,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 8260
+    },
+    {
+      "epoch": 0.08261,
+      "grad_norm": 0.760269045829773,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 8261
+    },
+    {
+      "epoch": 0.08262,
+      "grad_norm": 0.7092325687408447,
+      "learning_rate": 0.003,
+      "loss": 4.0722,
+      "step": 8262
+    },
+    {
+      "epoch": 0.08263,
+      "grad_norm": 0.6450809240341187,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 8263
+    },
+    {
+      "epoch": 0.08264,
+      "grad_norm": 0.7137148976325989,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 8264
+    },
+    {
+      "epoch": 0.08265,
+      "grad_norm": 0.6923044323921204,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 8265
+    },
+    {
+      "epoch": 0.08266,
+      "grad_norm": 0.6888769865036011,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 8266
+    },
+    {
+      "epoch": 0.08267,
+      "grad_norm": 0.5442489981651306,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 8267
+    },
+    {
+      "epoch": 0.08268,
+      "grad_norm": 0.4939229190349579,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 8268
+    },
+    {
+      "epoch": 0.08269,
+      "grad_norm": 0.5606672167778015,
+      "learning_rate": 0.003,
+      "loss": 4.0532,
+      "step": 8269
+    },
+    {
+      "epoch": 0.0827,
+      "grad_norm": 0.5966683030128479,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 8270
+    },
+    {
+      "epoch": 0.08271,
+      "grad_norm": 0.6150082349777222,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 8271
+    },
+    {
+      "epoch": 0.08272,
+      "grad_norm": 0.697607159614563,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 8272
+    },
+    {
+      "epoch": 0.08273,
+      "grad_norm": 0.7327117323875427,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 8273
+    },
+    {
+      "epoch": 0.08274,
+      "grad_norm": 0.6971558332443237,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 8274
+    },
+    {
+      "epoch": 0.08275,
+      "grad_norm": 0.7999626994132996,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 8275
+    },
+    {
+      "epoch": 0.08276,
+      "grad_norm": 0.9401102066040039,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 8276
+    },
+    {
+      "epoch": 0.08277,
+      "grad_norm": 0.8911102414131165,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 8277
+    },
+    {
+      "epoch": 0.08278,
+      "grad_norm": 0.9046621918678284,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 8278
+    },
+    {
+      "epoch": 0.08279,
+      "grad_norm": 1.0346699953079224,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 8279
+    },
+    {
+      "epoch": 0.0828,
+      "grad_norm": 1.2517218589782715,
+      "learning_rate": 0.003,
+      "loss": 4.0801,
+      "step": 8280
+    },
+    {
+      "epoch": 0.08281,
+      "grad_norm": 0.8035468459129333,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 8281
+    },
+    {
+      "epoch": 0.08282,
+      "grad_norm": 0.7403268218040466,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 8282
+    },
+    {
+      "epoch": 0.08283,
+      "grad_norm": 0.7293475866317749,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 8283
+    },
+    {
+      "epoch": 0.08284,
+      "grad_norm": 0.7586977481842041,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 8284
+    },
+    {
+      "epoch": 0.08285,
+      "grad_norm": 0.8085505366325378,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 8285
+    },
+    {
+      "epoch": 0.08286,
+      "grad_norm": 0.8938177824020386,
+      "learning_rate": 0.003,
+      "loss": 4.0641,
+      "step": 8286
+    },
+    {
+      "epoch": 0.08287,
+      "grad_norm": 0.9943509101867676,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 8287
+    },
+    {
+      "epoch": 0.08288,
+      "grad_norm": 0.9675386548042297,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 8288
+    },
+    {
+      "epoch": 0.08289,
+      "grad_norm": 0.9429525136947632,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 8289
+    },
+    {
+      "epoch": 0.0829,
+      "grad_norm": 0.9028465151786804,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 8290
+    },
+    {
+      "epoch": 0.08291,
+      "grad_norm": 0.8231788873672485,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 8291
+    },
+    {
+      "epoch": 0.08292,
+      "grad_norm": 0.8863356113433838,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 8292
+    },
+    {
+      "epoch": 0.08293,
+      "grad_norm": 0.7855710983276367,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 8293
+    },
+    {
+      "epoch": 0.08294,
+      "grad_norm": 0.7704231142997742,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 8294
+    },
+    {
+      "epoch": 0.08295,
+      "grad_norm": 0.799931526184082,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 8295
+    },
+    {
+      "epoch": 0.08296,
+      "grad_norm": 0.8029519319534302,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 8296
+    },
+    {
+      "epoch": 0.08297,
+      "grad_norm": 0.8360529541969299,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 8297
+    },
+    {
+      "epoch": 0.08298,
+      "grad_norm": 0.8334509134292603,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 8298
+    },
+    {
+      "epoch": 0.08299,
+      "grad_norm": 0.8397669792175293,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 8299
+    },
+    {
+      "epoch": 0.083,
+      "grad_norm": 0.6753296256065369,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 8300
+    },
+    {
+      "epoch": 0.08301,
+      "grad_norm": 0.598036527633667,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 8301
+    },
+    {
+      "epoch": 0.08302,
+      "grad_norm": 0.5940173864364624,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 8302
+    },
+    {
+      "epoch": 0.08303,
+      "grad_norm": 0.6859657168388367,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 8303
+    },
+    {
+      "epoch": 0.08304,
+      "grad_norm": 0.7473353147506714,
+      "learning_rate": 0.003,
+      "loss": 4.0603,
+      "step": 8304
+    },
+    {
+      "epoch": 0.08305,
+      "grad_norm": 0.7625237703323364,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 8305
+    },
+    {
+      "epoch": 0.08306,
+      "grad_norm": 0.6629177331924438,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 8306
+    },
+    {
+      "epoch": 0.08307,
+      "grad_norm": 0.6138797402381897,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 8307
+    },
+    {
+      "epoch": 0.08308,
+      "grad_norm": 0.6568807363510132,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 8308
+    },
+    {
+      "epoch": 0.08309,
+      "grad_norm": 0.5892602801322937,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 8309
+    },
+    {
+      "epoch": 0.0831,
+      "grad_norm": 0.5024274587631226,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 8310
+    },
+    {
+      "epoch": 0.08311,
+      "grad_norm": 0.5848341584205627,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 8311
+    },
+    {
+      "epoch": 0.08312,
+      "grad_norm": 0.6992815732955933,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 8312
+    },
+    {
+      "epoch": 0.08313,
+      "grad_norm": 0.7933979630470276,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 8313
+    },
+    {
+      "epoch": 0.08314,
+      "grad_norm": 0.7513596415519714,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 8314
+    },
+    {
+      "epoch": 0.08315,
+      "grad_norm": 0.6562137007713318,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 8315
+    },
+    {
+      "epoch": 0.08316,
+      "grad_norm": 0.7130046486854553,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 8316
+    },
+    {
+      "epoch": 0.08317,
+      "grad_norm": 0.9234418869018555,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 8317
+    },
+    {
+      "epoch": 0.08318,
+      "grad_norm": 0.9351674318313599,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 8318
+    },
+    {
+      "epoch": 0.08319,
+      "grad_norm": 0.9117304682731628,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 8319
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.8410887122154236,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 8320
+    },
+    {
+      "epoch": 0.08321,
+      "grad_norm": 0.6563390493392944,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 8321
+    },
+    {
+      "epoch": 0.08322,
+      "grad_norm": 0.6313189268112183,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 8322
+    },
+    {
+      "epoch": 0.08323,
+      "grad_norm": 0.6625282168388367,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 8323
+    },
+    {
+      "epoch": 0.08324,
+      "grad_norm": 0.7624503970146179,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 8324
+    },
+    {
+      "epoch": 0.08325,
+      "grad_norm": 0.828724205493927,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 8325
+    },
+    {
+      "epoch": 0.08326,
+      "grad_norm": 0.9103044271469116,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 8326
+    },
+    {
+      "epoch": 0.08327,
+      "grad_norm": 0.7918485999107361,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 8327
+    },
+    {
+      "epoch": 0.08328,
+      "grad_norm": 0.8280613422393799,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 8328
+    },
+    {
+      "epoch": 0.08329,
+      "grad_norm": 0.8416042327880859,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 8329
+    },
+    {
+      "epoch": 0.0833,
+      "grad_norm": 0.8294980525970459,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 8330
+    },
+    {
+      "epoch": 0.08331,
+      "grad_norm": 0.9347625970840454,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 8331
+    },
+    {
+      "epoch": 0.08332,
+      "grad_norm": 1.0204062461853027,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 8332
+    },
+    {
+      "epoch": 0.08333,
+      "grad_norm": 1.0275133848190308,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 8333
+    },
+    {
+      "epoch": 0.08334,
+      "grad_norm": 0.8066427111625671,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 8334
+    },
+    {
+      "epoch": 0.08335,
+      "grad_norm": 0.6177697777748108,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 8335
+    },
+    {
+      "epoch": 0.08336,
+      "grad_norm": 0.5968998670578003,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 8336
+    },
+    {
+      "epoch": 0.08337,
+      "grad_norm": 0.6666345000267029,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 8337
+    },
+    {
+      "epoch": 0.08338,
+      "grad_norm": 0.7190283536911011,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 8338
+    },
+    {
+      "epoch": 0.08339,
+      "grad_norm": 0.7866709232330322,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 8339
+    },
+    {
+      "epoch": 0.0834,
+      "grad_norm": 0.8350096344947815,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 8340
+    },
+    {
+      "epoch": 0.08341,
+      "grad_norm": 0.8239350914955139,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 8341
+    },
+    {
+      "epoch": 0.08342,
+      "grad_norm": 0.8311486840248108,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 8342
+    },
+    {
+      "epoch": 0.08343,
+      "grad_norm": 0.7377791404724121,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 8343
+    },
+    {
+      "epoch": 0.08344,
+      "grad_norm": 0.8086702823638916,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 8344
+    },
+    {
+      "epoch": 0.08345,
+      "grad_norm": 0.9363798499107361,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 8345
+    },
+    {
+      "epoch": 0.08346,
+      "grad_norm": 0.959582507610321,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 8346
+    },
+    {
+      "epoch": 0.08347,
+      "grad_norm": 0.9427142143249512,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 8347
+    },
+    {
+      "epoch": 0.08348,
+      "grad_norm": 0.9667640924453735,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 8348
+    },
+    {
+      "epoch": 0.08349,
+      "grad_norm": 0.89124596118927,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 8349
+    },
+    {
+      "epoch": 0.0835,
+      "grad_norm": 0.7665905952453613,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 8350
+    },
+    {
+      "epoch": 0.08351,
+      "grad_norm": 0.7947695851325989,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 8351
+    },
+    {
+      "epoch": 0.08352,
+      "grad_norm": 0.7680903673171997,
+      "learning_rate": 0.003,
+      "loss": 4.0671,
+      "step": 8352
+    },
+    {
+      "epoch": 0.08353,
+      "grad_norm": 0.7358306050300598,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 8353
+    },
+    {
+      "epoch": 0.08354,
+      "grad_norm": 0.7130650281906128,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 8354
+    },
+    {
+      "epoch": 0.08355,
+      "grad_norm": 0.6731503009796143,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 8355
+    },
+    {
+      "epoch": 0.08356,
+      "grad_norm": 0.711292028427124,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 8356
+    },
+    {
+      "epoch": 0.08357,
+      "grad_norm": 0.7001312375068665,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 8357
+    },
+    {
+      "epoch": 0.08358,
+      "grad_norm": 0.6773588061332703,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 8358
+    },
+    {
+      "epoch": 0.08359,
+      "grad_norm": 0.7479463815689087,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 8359
+    },
+    {
+      "epoch": 0.0836,
+      "grad_norm": 0.8306698203086853,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 8360
+    },
+    {
+      "epoch": 0.08361,
+      "grad_norm": 0.878010094165802,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 8361
+    },
+    {
+      "epoch": 0.08362,
+      "grad_norm": 1.0022238492965698,
+      "learning_rate": 0.003,
+      "loss": 4.0603,
+      "step": 8362
+    },
+    {
+      "epoch": 0.08363,
+      "grad_norm": 1.0330588817596436,
+      "learning_rate": 0.003,
+      "loss": 4.0755,
+      "step": 8363
+    },
+    {
+      "epoch": 0.08364,
+      "grad_norm": 0.718574047088623,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 8364
+    },
+    {
+      "epoch": 0.08365,
+      "grad_norm": 0.5936159491539001,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 8365
+    },
+    {
+      "epoch": 0.08366,
+      "grad_norm": 0.7422720193862915,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 8366
+    },
+    {
+      "epoch": 0.08367,
+      "grad_norm": 0.743090808391571,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 8367
+    },
+    {
+      "epoch": 0.08368,
+      "grad_norm": 0.7947729825973511,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 8368
+    },
+    {
+      "epoch": 0.08369,
+      "grad_norm": 0.837806224822998,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 8369
+    },
+    {
+      "epoch": 0.0837,
+      "grad_norm": 0.783185601234436,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 8370
+    },
+    {
+      "epoch": 0.08371,
+      "grad_norm": 0.642991304397583,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 8371
+    },
+    {
+      "epoch": 0.08372,
+      "grad_norm": 0.5675342082977295,
+      "learning_rate": 0.003,
+      "loss": 4.0584,
+      "step": 8372
+    },
+    {
+      "epoch": 0.08373,
+      "grad_norm": 0.6473720073699951,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 8373
+    },
+    {
+      "epoch": 0.08374,
+      "grad_norm": 0.7581577897071838,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 8374
+    },
+    {
+      "epoch": 0.08375,
+      "grad_norm": 0.710021436214447,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 8375
+    },
+    {
+      "epoch": 0.08376,
+      "grad_norm": 0.6476665735244751,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 8376
+    },
+    {
+      "epoch": 0.08377,
+      "grad_norm": 0.5653111338615417,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 8377
+    },
+    {
+      "epoch": 0.08378,
+      "grad_norm": 0.5026406645774841,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 8378
+    },
+    {
+      "epoch": 0.08379,
+      "grad_norm": 0.5006636381149292,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 8379
+    },
+    {
+      "epoch": 0.0838,
+      "grad_norm": 0.5302611589431763,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 8380
+    },
+    {
+      "epoch": 0.08381,
+      "grad_norm": 0.49844783544540405,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 8381
+    },
+    {
+      "epoch": 0.08382,
+      "grad_norm": 0.5049042105674744,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 8382
+    },
+    {
+      "epoch": 0.08383,
+      "grad_norm": 0.5383378267288208,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 8383
+    },
+    {
+      "epoch": 0.08384,
+      "grad_norm": 0.5285283327102661,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 8384
+    },
+    {
+      "epoch": 0.08385,
+      "grad_norm": 0.6071128249168396,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 8385
+    },
+    {
+      "epoch": 0.08386,
+      "grad_norm": 0.7259279489517212,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 8386
+    },
+    {
+      "epoch": 0.08387,
+      "grad_norm": 0.9096173644065857,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 8387
+    },
+    {
+      "epoch": 0.08388,
+      "grad_norm": 1.0137066841125488,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 8388
+    },
+    {
+      "epoch": 0.08389,
+      "grad_norm": 0.9626190662384033,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 8389
+    },
+    {
+      "epoch": 0.0839,
+      "grad_norm": 0.7948998212814331,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 8390
+    },
+    {
+      "epoch": 0.08391,
+      "grad_norm": 0.8329476714134216,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 8391
+    },
+    {
+      "epoch": 0.08392,
+      "grad_norm": 0.9139432311058044,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 8392
+    },
+    {
+      "epoch": 0.08393,
+      "grad_norm": 0.7734724283218384,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 8393
+    },
+    {
+      "epoch": 0.08394,
+      "grad_norm": 0.7710996270179749,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 8394
+    },
+    {
+      "epoch": 0.08395,
+      "grad_norm": 0.8262189030647278,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 8395
+    },
+    {
+      "epoch": 0.08396,
+      "grad_norm": 0.9476003646850586,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 8396
+    },
+    {
+      "epoch": 0.08397,
+      "grad_norm": 1.077710509300232,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 8397
+    },
+    {
+      "epoch": 0.08398,
+      "grad_norm": 0.8334670066833496,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 8398
+    },
+    {
+      "epoch": 0.08399,
+      "grad_norm": 0.8015785217285156,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 8399
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.78190016746521,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 8400
+    },
+    {
+      "epoch": 0.08401,
+      "grad_norm": 0.7719368934631348,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 8401
+    },
+    {
+      "epoch": 0.08402,
+      "grad_norm": 0.7329378128051758,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 8402
+    },
+    {
+      "epoch": 0.08403,
+      "grad_norm": 0.6700614094734192,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 8403
+    },
+    {
+      "epoch": 0.08404,
+      "grad_norm": 0.6247296333312988,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 8404
+    },
+    {
+      "epoch": 0.08405,
+      "grad_norm": 0.7085357904434204,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 8405
+    },
+    {
+      "epoch": 0.08406,
+      "grad_norm": 0.864506185054779,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 8406
+    },
+    {
+      "epoch": 0.08407,
+      "grad_norm": 0.9666787981987,
+      "learning_rate": 0.003,
+      "loss": 4.0931,
+      "step": 8407
+    },
+    {
+      "epoch": 0.08408,
+      "grad_norm": 0.9957094788551331,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 8408
+    },
+    {
+      "epoch": 0.08409,
+      "grad_norm": 1.145738124847412,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 8409
+    },
+    {
+      "epoch": 0.0841,
+      "grad_norm": 0.8387814164161682,
+      "learning_rate": 0.003,
+      "loss": 4.0581,
+      "step": 8410
+    },
+    {
+      "epoch": 0.08411,
+      "grad_norm": 0.8150733709335327,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 8411
+    },
+    {
+      "epoch": 0.08412,
+      "grad_norm": 0.8348231315612793,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 8412
+    },
+    {
+      "epoch": 0.08413,
+      "grad_norm": 0.8107559680938721,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 8413
+    },
+    {
+      "epoch": 0.08414,
+      "grad_norm": 0.7736573219299316,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 8414
+    },
+    {
+      "epoch": 0.08415,
+      "grad_norm": 0.7075899839401245,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 8415
+    },
+    {
+      "epoch": 0.08416,
+      "grad_norm": 0.7205349206924438,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 8416
+    },
+    {
+      "epoch": 0.08417,
+      "grad_norm": 0.6560434699058533,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 8417
+    },
+    {
+      "epoch": 0.08418,
+      "grad_norm": 0.5477179288864136,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 8418
+    },
+    {
+      "epoch": 0.08419,
+      "grad_norm": 0.549923300743103,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 8419
+    },
+    {
+      "epoch": 0.0842,
+      "grad_norm": 0.4624122679233551,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 8420
+    },
+    {
+      "epoch": 0.08421,
+      "grad_norm": 0.4707403779029846,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 8421
+    },
+    {
+      "epoch": 0.08422,
+      "grad_norm": 0.6087749004364014,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 8422
+    },
+    {
+      "epoch": 0.08423,
+      "grad_norm": 0.7090073227882385,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 8423
+    },
+    {
+      "epoch": 0.08424,
+      "grad_norm": 0.8543527126312256,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 8424
+    },
+    {
+      "epoch": 0.08425,
+      "grad_norm": 1.0288057327270508,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 8425
+    },
+    {
+      "epoch": 0.08426,
+      "grad_norm": 0.989617109298706,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 8426
+    },
+    {
+      "epoch": 0.08427,
+      "grad_norm": 0.8570108413696289,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 8427
+    },
+    {
+      "epoch": 0.08428,
+      "grad_norm": 0.6294158101081848,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 8428
+    },
+    {
+      "epoch": 0.08429,
+      "grad_norm": 0.6182265877723694,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 8429
+    },
+    {
+      "epoch": 0.0843,
+      "grad_norm": 0.8876853585243225,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 8430
+    },
+    {
+      "epoch": 0.08431,
+      "grad_norm": 0.9176228046417236,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 8431
+    },
+    {
+      "epoch": 0.08432,
+      "grad_norm": 0.8123117089271545,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 8432
+    },
+    {
+      "epoch": 0.08433,
+      "grad_norm": 0.8903416991233826,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 8433
+    },
+    {
+      "epoch": 0.08434,
+      "grad_norm": 0.9202600121498108,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 8434
+    },
+    {
+      "epoch": 0.08435,
+      "grad_norm": 0.8581541180610657,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 8435
+    },
+    {
+      "epoch": 0.08436,
+      "grad_norm": 0.8430383205413818,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 8436
+    },
+    {
+      "epoch": 0.08437,
+      "grad_norm": 0.73482346534729,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 8437
+    },
+    {
+      "epoch": 0.08438,
+      "grad_norm": 0.851243257522583,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 8438
+    },
+    {
+      "epoch": 0.08439,
+      "grad_norm": 0.9350183606147766,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 8439
+    },
+    {
+      "epoch": 0.0844,
+      "grad_norm": 0.8600278496742249,
+      "learning_rate": 0.003,
+      "loss": 4.0671,
+      "step": 8440
+    },
+    {
+      "epoch": 0.08441,
+      "grad_norm": 0.9163636565208435,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 8441
+    },
+    {
+      "epoch": 0.08442,
+      "grad_norm": 0.9736191034317017,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 8442
+    },
+    {
+      "epoch": 0.08443,
+      "grad_norm": 0.9900960922241211,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 8443
+    },
+    {
+      "epoch": 0.08444,
+      "grad_norm": 0.9450505971908569,
+      "learning_rate": 0.003,
+      "loss": 4.0823,
+      "step": 8444
+    },
+    {
+      "epoch": 0.08445,
+      "grad_norm": 0.8399696946144104,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 8445
+    },
+    {
+      "epoch": 0.08446,
+      "grad_norm": 0.715254008769989,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 8446
+    },
+    {
+      "epoch": 0.08447,
+      "grad_norm": 0.7299708127975464,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 8447
+    },
+    {
+      "epoch": 0.08448,
+      "grad_norm": 0.7352356314659119,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 8448
+    },
+    {
+      "epoch": 0.08449,
+      "grad_norm": 0.7627168893814087,
+      "learning_rate": 0.003,
+      "loss": 4.068,
+      "step": 8449
+    },
+    {
+      "epoch": 0.0845,
+      "grad_norm": 0.8209385871887207,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 8450
+    },
+    {
+      "epoch": 0.08451,
+      "grad_norm": 0.9701636433601379,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 8451
+    },
+    {
+      "epoch": 0.08452,
+      "grad_norm": 0.9062989950180054,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 8452
+    },
+    {
+      "epoch": 0.08453,
+      "grad_norm": 0.7730754613876343,
+      "learning_rate": 0.003,
+      "loss": 4.0636,
+      "step": 8453
+    },
+    {
+      "epoch": 0.08454,
+      "grad_norm": 0.6160298585891724,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 8454
+    },
+    {
+      "epoch": 0.08455,
+      "grad_norm": 0.5016544461250305,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 8455
+    },
+    {
+      "epoch": 0.08456,
+      "grad_norm": 0.4829849302768707,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 8456
+    },
+    {
+      "epoch": 0.08457,
+      "grad_norm": 0.6312108039855957,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 8457
+    },
+    {
+      "epoch": 0.08458,
+      "grad_norm": 0.6965609788894653,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 8458
+    },
+    {
+      "epoch": 0.08459,
+      "grad_norm": 0.6963859796524048,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 8459
+    },
+    {
+      "epoch": 0.0846,
+      "grad_norm": 0.6759956479072571,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 8460
+    },
+    {
+      "epoch": 0.08461,
+      "grad_norm": 0.5745083093643188,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 8461
+    },
+    {
+      "epoch": 0.08462,
+      "grad_norm": 0.5628439784049988,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 8462
+    },
+    {
+      "epoch": 0.08463,
+      "grad_norm": 0.5284938216209412,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 8463
+    },
+    {
+      "epoch": 0.08464,
+      "grad_norm": 0.4526340961456299,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 8464
+    },
+    {
+      "epoch": 0.08465,
+      "grad_norm": 0.42585697770118713,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 8465
+    },
+    {
+      "epoch": 0.08466,
+      "grad_norm": 0.45660510659217834,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 8466
+    },
+    {
+      "epoch": 0.08467,
+      "grad_norm": 0.4530385434627533,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 8467
+    },
+    {
+      "epoch": 0.08468,
+      "grad_norm": 0.5353749394416809,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 8468
+    },
+    {
+      "epoch": 0.08469,
+      "grad_norm": 0.557474672794342,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 8469
+    },
+    {
+      "epoch": 0.0847,
+      "grad_norm": 0.5882935523986816,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 8470
+    },
+    {
+      "epoch": 0.08471,
+      "grad_norm": 0.7039132118225098,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 8471
+    },
+    {
+      "epoch": 0.08472,
+      "grad_norm": 0.9956209063529968,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 8472
+    },
+    {
+      "epoch": 0.08473,
+      "grad_norm": 1.4970457553863525,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 8473
+    },
+    {
+      "epoch": 0.08474,
+      "grad_norm": 0.5254650115966797,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 8474
+    },
+    {
+      "epoch": 0.08475,
+      "grad_norm": 0.8430723547935486,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 8475
+    },
+    {
+      "epoch": 0.08476,
+      "grad_norm": 0.8634896278381348,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 8476
+    },
+    {
+      "epoch": 0.08477,
+      "grad_norm": 0.8257098197937012,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 8477
+    },
+    {
+      "epoch": 0.08478,
+      "grad_norm": 0.8612629175186157,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 8478
+    },
+    {
+      "epoch": 0.08479,
+      "grad_norm": 0.9358077645301819,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 8479
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.8783276677131653,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 8480
+    },
+    {
+      "epoch": 0.08481,
+      "grad_norm": 0.9027069211006165,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 8481
+    },
+    {
+      "epoch": 0.08482,
+      "grad_norm": 0.7429633140563965,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 8482
+    },
+    {
+      "epoch": 0.08483,
+      "grad_norm": 0.749813437461853,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 8483
+    },
+    {
+      "epoch": 0.08484,
+      "grad_norm": 0.8067286014556885,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 8484
+    },
+    {
+      "epoch": 0.08485,
+      "grad_norm": 0.8065735697746277,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 8485
+    },
+    {
+      "epoch": 0.08486,
+      "grad_norm": 0.6747359037399292,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 8486
+    },
+    {
+      "epoch": 0.08487,
+      "grad_norm": 0.6390485763549805,
+      "learning_rate": 0.003,
+      "loss": 4.063,
+      "step": 8487
+    },
+    {
+      "epoch": 0.08488,
+      "grad_norm": 0.6853650808334351,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 8488
+    },
+    {
+      "epoch": 0.08489,
+      "grad_norm": 0.7082487940788269,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 8489
+    },
+    {
+      "epoch": 0.0849,
+      "grad_norm": 0.6889592409133911,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 8490
+    },
+    {
+      "epoch": 0.08491,
+      "grad_norm": 0.6803368926048279,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 8491
+    },
+    {
+      "epoch": 0.08492,
+      "grad_norm": 0.7069889307022095,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 8492
+    },
+    {
+      "epoch": 0.08493,
+      "grad_norm": 0.7506759166717529,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 8493
+    },
+    {
+      "epoch": 0.08494,
+      "grad_norm": 0.7356934547424316,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 8494
+    },
+    {
+      "epoch": 0.08495,
+      "grad_norm": 0.7744423747062683,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 8495
+    },
+    {
+      "epoch": 0.08496,
+      "grad_norm": 0.7731289863586426,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 8496
+    },
+    {
+      "epoch": 0.08497,
+      "grad_norm": 0.7522779107093811,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 8497
+    },
+    {
+      "epoch": 0.08498,
+      "grad_norm": 0.7073262333869934,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 8498
+    },
+    {
+      "epoch": 0.08499,
+      "grad_norm": 0.7390640377998352,
+      "learning_rate": 0.003,
+      "loss": 4.0652,
+      "step": 8499
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.7686527967453003,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 8500
+    },
+    {
+      "epoch": 0.08501,
+      "grad_norm": 0.723380446434021,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 8501
+    },
+    {
+      "epoch": 0.08502,
+      "grad_norm": 0.6123513579368591,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 8502
+    },
+    {
+      "epoch": 0.08503,
+      "grad_norm": 0.6763044595718384,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 8503
+    },
+    {
+      "epoch": 0.08504,
+      "grad_norm": 0.7245738506317139,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 8504
+    },
+    {
+      "epoch": 0.08505,
+      "grad_norm": 0.6983570456504822,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 8505
+    },
+    {
+      "epoch": 0.08506,
+      "grad_norm": 0.6399862170219421,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 8506
+    },
+    {
+      "epoch": 0.08507,
+      "grad_norm": 0.6284667253494263,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 8507
+    },
+    {
+      "epoch": 0.08508,
+      "grad_norm": 0.6627932786941528,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 8508
+    },
+    {
+      "epoch": 0.08509,
+      "grad_norm": 0.7929001450538635,
+      "learning_rate": 0.003,
+      "loss": 4.0632,
+      "step": 8509
+    },
+    {
+      "epoch": 0.0851,
+      "grad_norm": 1.0402476787567139,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 8510
+    },
+    {
+      "epoch": 0.08511,
+      "grad_norm": 1.2262110710144043,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 8511
+    },
+    {
+      "epoch": 0.08512,
+      "grad_norm": 0.6842292547225952,
+      "learning_rate": 0.003,
+      "loss": 4.0706,
+      "step": 8512
+    },
+    {
+      "epoch": 0.08513,
+      "grad_norm": 0.7122688293457031,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 8513
+    },
+    {
+      "epoch": 0.08514,
+      "grad_norm": 0.8808112144470215,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 8514
+    },
+    {
+      "epoch": 0.08515,
+      "grad_norm": 0.9627430438995361,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 8515
+    },
+    {
+      "epoch": 0.08516,
+      "grad_norm": 1.0835819244384766,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 8516
+    },
+    {
+      "epoch": 0.08517,
+      "grad_norm": 1.015756368637085,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 8517
+    },
+    {
+      "epoch": 0.08518,
+      "grad_norm": 0.8399810791015625,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 8518
+    },
+    {
+      "epoch": 0.08519,
+      "grad_norm": 0.8079990744590759,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 8519
+    },
+    {
+      "epoch": 0.0852,
+      "grad_norm": 0.8063580393791199,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 8520
+    },
+    {
+      "epoch": 0.08521,
+      "grad_norm": 0.7848697900772095,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 8521
+    },
+    {
+      "epoch": 0.08522,
+      "grad_norm": 0.7846394181251526,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 8522
+    },
+    {
+      "epoch": 0.08523,
+      "grad_norm": 0.8157345056533813,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 8523
+    },
+    {
+      "epoch": 0.08524,
+      "grad_norm": 0.949707567691803,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 8524
+    },
+    {
+      "epoch": 0.08525,
+      "grad_norm": 1.0673058032989502,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 8525
+    },
+    {
+      "epoch": 0.08526,
+      "grad_norm": 0.9210948944091797,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 8526
+    },
+    {
+      "epoch": 0.08527,
+      "grad_norm": 1.0052591562271118,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 8527
+    },
+    {
+      "epoch": 0.08528,
+      "grad_norm": 1.0522360801696777,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 8528
+    },
+    {
+      "epoch": 0.08529,
+      "grad_norm": 0.8486297130584717,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 8529
+    },
+    {
+      "epoch": 0.0853,
+      "grad_norm": 0.7309673428535461,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 8530
+    },
+    {
+      "epoch": 0.08531,
+      "grad_norm": 0.7416118383407593,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 8531
+    },
+    {
+      "epoch": 0.08532,
+      "grad_norm": 0.7339397668838501,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 8532
+    },
+    {
+      "epoch": 0.08533,
+      "grad_norm": 0.5952752232551575,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 8533
+    },
+    {
+      "epoch": 0.08534,
+      "grad_norm": 0.47096192836761475,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 8534
+    },
+    {
+      "epoch": 0.08535,
+      "grad_norm": 0.48998767137527466,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 8535
+    },
+    {
+      "epoch": 0.08536,
+      "grad_norm": 0.4841568171977997,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 8536
+    },
+    {
+      "epoch": 0.08537,
+      "grad_norm": 0.560813844203949,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 8537
+    },
+    {
+      "epoch": 0.08538,
+      "grad_norm": 0.6467888951301575,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 8538
+    },
+    {
+      "epoch": 0.08539,
+      "grad_norm": 0.7536951899528503,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 8539
+    },
+    {
+      "epoch": 0.0854,
+      "grad_norm": 0.8326345682144165,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 8540
+    },
+    {
+      "epoch": 0.08541,
+      "grad_norm": 0.7367427349090576,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 8541
+    },
+    {
+      "epoch": 0.08542,
+      "grad_norm": 0.5583395957946777,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 8542
+    },
+    {
+      "epoch": 0.08543,
+      "grad_norm": 0.48341140151023865,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 8543
+    },
+    {
+      "epoch": 0.08544,
+      "grad_norm": 0.5066325664520264,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 8544
+    },
+    {
+      "epoch": 0.08545,
+      "grad_norm": 0.4845677614212036,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 8545
+    },
+    {
+      "epoch": 0.08546,
+      "grad_norm": 0.4747871458530426,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 8546
+    },
+    {
+      "epoch": 0.08547,
+      "grad_norm": 0.4408625364303589,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 8547
+    },
+    {
+      "epoch": 0.08548,
+      "grad_norm": 0.5065740942955017,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 8548
+    },
+    {
+      "epoch": 0.08549,
+      "grad_norm": 0.5792751908302307,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 8549
+    },
+    {
+      "epoch": 0.0855,
+      "grad_norm": 0.6657066345214844,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 8550
+    },
+    {
+      "epoch": 0.08551,
+      "grad_norm": 0.7656866312026978,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 8551
+    },
+    {
+      "epoch": 0.08552,
+      "grad_norm": 0.9525842666625977,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 8552
+    },
+    {
+      "epoch": 0.08553,
+      "grad_norm": 0.986405611038208,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 8553
+    },
+    {
+      "epoch": 0.08554,
+      "grad_norm": 0.9743383526802063,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 8554
+    },
+    {
+      "epoch": 0.08555,
+      "grad_norm": 1.08023202419281,
+      "learning_rate": 0.003,
+      "loss": 4.0689,
+      "step": 8555
+    },
+    {
+      "epoch": 0.08556,
+      "grad_norm": 0.8308687210083008,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 8556
+    },
+    {
+      "epoch": 0.08557,
+      "grad_norm": 0.8413991928100586,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 8557
+    },
+    {
+      "epoch": 0.08558,
+      "grad_norm": 0.8547917008399963,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 8558
+    },
+    {
+      "epoch": 0.08559,
+      "grad_norm": 0.8460670113563538,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 8559
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.8860325217247009,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 8560
+    },
+    {
+      "epoch": 0.08561,
+      "grad_norm": 0.8015673756599426,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 8561
+    },
+    {
+      "epoch": 0.08562,
+      "grad_norm": 0.9084799885749817,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 8562
+    },
+    {
+      "epoch": 0.08563,
+      "grad_norm": 0.9597362875938416,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 8563
+    },
+    {
+      "epoch": 0.08564,
+      "grad_norm": 0.8412767648696899,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 8564
+    },
+    {
+      "epoch": 0.08565,
+      "grad_norm": 0.8772968649864197,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 8565
+    },
+    {
+      "epoch": 0.08566,
+      "grad_norm": 0.8721387982368469,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 8566
+    },
+    {
+      "epoch": 0.08567,
+      "grad_norm": 0.9334637522697449,
+      "learning_rate": 0.003,
+      "loss": 4.069,
+      "step": 8567
+    },
+    {
+      "epoch": 0.08568,
+      "grad_norm": 1.1856471300125122,
+      "learning_rate": 0.003,
+      "loss": 4.0868,
+      "step": 8568
+    },
+    {
+      "epoch": 0.08569,
+      "grad_norm": 1.0100841522216797,
+      "learning_rate": 0.003,
+      "loss": 4.0889,
+      "step": 8569
+    },
+    {
+      "epoch": 0.0857,
+      "grad_norm": 0.9219196438789368,
+      "learning_rate": 0.003,
+      "loss": 4.0817,
+      "step": 8570
+    },
+    {
+      "epoch": 0.08571,
+      "grad_norm": 1.1001014709472656,
+      "learning_rate": 0.003,
+      "loss": 4.0747,
+      "step": 8571
+    },
+    {
+      "epoch": 0.08572,
+      "grad_norm": 1.129728078842163,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 8572
+    },
+    {
+      "epoch": 0.08573,
+      "grad_norm": 0.8418722748756409,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 8573
+    },
+    {
+      "epoch": 0.08574,
+      "grad_norm": 0.8911969065666199,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 8574
+    },
+    {
+      "epoch": 0.08575,
+      "grad_norm": 0.9890245199203491,
+      "learning_rate": 0.003,
+      "loss": 4.07,
+      "step": 8575
+    },
+    {
+      "epoch": 0.08576,
+      "grad_norm": 0.9305644631385803,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 8576
+    },
+    {
+      "epoch": 0.08577,
+      "grad_norm": 0.7585141062736511,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 8577
+    },
+    {
+      "epoch": 0.08578,
+      "grad_norm": 0.6668472290039062,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 8578
+    },
+    {
+      "epoch": 0.08579,
+      "grad_norm": 0.6890462040901184,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 8579
+    },
+    {
+      "epoch": 0.0858,
+      "grad_norm": 0.5876961946487427,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 8580
+    },
+    {
+      "epoch": 0.08581,
+      "grad_norm": 0.5115332007408142,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 8581
+    },
+    {
+      "epoch": 0.08582,
+      "grad_norm": 0.5275139212608337,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 8582
+    },
+    {
+      "epoch": 0.08583,
+      "grad_norm": 0.5889208912849426,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 8583
+    },
+    {
+      "epoch": 0.08584,
+      "grad_norm": 0.6184143424034119,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 8584
+    },
+    {
+      "epoch": 0.08585,
+      "grad_norm": 0.5801510810852051,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 8585
+    },
+    {
+      "epoch": 0.08586,
+      "grad_norm": 0.5840868353843689,
+      "learning_rate": 0.003,
+      "loss": 4.0694,
+      "step": 8586
+    },
+    {
+      "epoch": 0.08587,
+      "grad_norm": 0.5485032796859741,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 8587
+    },
+    {
+      "epoch": 0.08588,
+      "grad_norm": 0.596748411655426,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 8588
+    },
+    {
+      "epoch": 0.08589,
+      "grad_norm": 0.5736215710639954,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 8589
+    },
+    {
+      "epoch": 0.0859,
+      "grad_norm": 0.5293859839439392,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 8590
+    },
+    {
+      "epoch": 0.08591,
+      "grad_norm": 0.5721410512924194,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 8591
+    },
+    {
+      "epoch": 0.08592,
+      "grad_norm": 0.7228186130523682,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 8592
+    },
+    {
+      "epoch": 0.08593,
+      "grad_norm": 0.9087763428688049,
+      "learning_rate": 0.003,
+      "loss": 4.0744,
+      "step": 8593
+    },
+    {
+      "epoch": 0.08594,
+      "grad_norm": 1.1849374771118164,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 8594
+    },
+    {
+      "epoch": 0.08595,
+      "grad_norm": 0.8020411133766174,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 8595
+    },
+    {
+      "epoch": 0.08596,
+      "grad_norm": 0.6049209833145142,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 8596
+    },
+    {
+      "epoch": 0.08597,
+      "grad_norm": 0.5967742204666138,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 8597
+    },
+    {
+      "epoch": 0.08598,
+      "grad_norm": 0.6498646140098572,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 8598
+    },
+    {
+      "epoch": 0.08599,
+      "grad_norm": 0.6824044585227966,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 8599
+    },
+    {
+      "epoch": 0.086,
+      "grad_norm": 0.6595525741577148,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 8600
+    },
+    {
+      "epoch": 0.08601,
+      "grad_norm": 0.5955501198768616,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 8601
+    },
+    {
+      "epoch": 0.08602,
+      "grad_norm": 0.5499855279922485,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 8602
+    },
+    {
+      "epoch": 0.08603,
+      "grad_norm": 0.6001824140548706,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 8603
+    },
+    {
+      "epoch": 0.08604,
+      "grad_norm": 0.6126075983047485,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 8604
+    },
+    {
+      "epoch": 0.08605,
+      "grad_norm": 0.7098419666290283,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 8605
+    },
+    {
+      "epoch": 0.08606,
+      "grad_norm": 0.7325479388237,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 8606
+    },
+    {
+      "epoch": 0.08607,
+      "grad_norm": 0.7961235046386719,
+      "learning_rate": 0.003,
+      "loss": 4.0588,
+      "step": 8607
+    },
+    {
+      "epoch": 0.08608,
+      "grad_norm": 0.7546827793121338,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 8608
+    },
+    {
+      "epoch": 0.08609,
+      "grad_norm": 0.715076744556427,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 8609
+    },
+    {
+      "epoch": 0.0861,
+      "grad_norm": 0.6888747215270996,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 8610
+    },
+    {
+      "epoch": 0.08611,
+      "grad_norm": 0.6887075304985046,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 8611
+    },
+    {
+      "epoch": 0.08612,
+      "grad_norm": 0.8714022040367126,
+      "learning_rate": 0.003,
+      "loss": 4.0708,
+      "step": 8612
+    },
+    {
+      "epoch": 0.08613,
+      "grad_norm": 1.1010104417800903,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 8613
+    },
+    {
+      "epoch": 0.08614,
+      "grad_norm": 1.023213267326355,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 8614
+    },
+    {
+      "epoch": 0.08615,
+      "grad_norm": 0.9047683477401733,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 8615
+    },
+    {
+      "epoch": 0.08616,
+      "grad_norm": 0.9519088268280029,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 8616
+    },
+    {
+      "epoch": 0.08617,
+      "grad_norm": 0.8695728182792664,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 8617
+    },
+    {
+      "epoch": 0.08618,
+      "grad_norm": 0.8256757259368896,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 8618
+    },
+    {
+      "epoch": 0.08619,
+      "grad_norm": 0.8691619634628296,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 8619
+    },
+    {
+      "epoch": 0.0862,
+      "grad_norm": 0.9401669502258301,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 8620
+    },
+    {
+      "epoch": 0.08621,
+      "grad_norm": 1.0106126070022583,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 8621
+    },
+    {
+      "epoch": 0.08622,
+      "grad_norm": 1.0970956087112427,
+      "learning_rate": 0.003,
+      "loss": 4.0763,
+      "step": 8622
+    },
+    {
+      "epoch": 0.08623,
+      "grad_norm": 0.9480116367340088,
+      "learning_rate": 0.003,
+      "loss": 4.0653,
+      "step": 8623
+    },
+    {
+      "epoch": 0.08624,
+      "grad_norm": 0.8120977282524109,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 8624
+    },
+    {
+      "epoch": 0.08625,
+      "grad_norm": 0.7313287258148193,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 8625
+    },
+    {
+      "epoch": 0.08626,
+      "grad_norm": 0.7254028916358948,
+      "learning_rate": 0.003,
+      "loss": 4.0715,
+      "step": 8626
+    },
+    {
+      "epoch": 0.08627,
+      "grad_norm": 0.8220468163490295,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 8627
+    },
+    {
+      "epoch": 0.08628,
+      "grad_norm": 0.8760457038879395,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 8628
+    },
+    {
+      "epoch": 0.08629,
+      "grad_norm": 0.6859437227249146,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 8629
+    },
+    {
+      "epoch": 0.0863,
+      "grad_norm": 0.7072209715843201,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 8630
+    },
+    {
+      "epoch": 0.08631,
+      "grad_norm": 0.54953932762146,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 8631
+    },
+    {
+      "epoch": 0.08632,
+      "grad_norm": 0.5660358667373657,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 8632
+    },
+    {
+      "epoch": 0.08633,
+      "grad_norm": 0.621670663356781,
+      "learning_rate": 0.003,
+      "loss": 4.0684,
+      "step": 8633
+    },
+    {
+      "epoch": 0.08634,
+      "grad_norm": 0.6216848492622375,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 8634
+    },
+    {
+      "epoch": 0.08635,
+      "grad_norm": 0.573436439037323,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 8635
+    },
+    {
+      "epoch": 0.08636,
+      "grad_norm": 0.5630894899368286,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 8636
+    },
+    {
+      "epoch": 0.08637,
+      "grad_norm": 0.776887834072113,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 8637
+    },
+    {
+      "epoch": 0.08638,
+      "grad_norm": 0.9575220346450806,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 8638
+    },
+    {
+      "epoch": 0.08639,
+      "grad_norm": 1.1651415824890137,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 8639
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.8188918828964233,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 8640
+    },
+    {
+      "epoch": 0.08641,
+      "grad_norm": 0.49055084586143494,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 8641
+    },
+    {
+      "epoch": 0.08642,
+      "grad_norm": 0.5482150316238403,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 8642
+    },
+    {
+      "epoch": 0.08643,
+      "grad_norm": 0.6328015923500061,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 8643
+    },
+    {
+      "epoch": 0.08644,
+      "grad_norm": 0.6240196824073792,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 8644
+    },
+    {
+      "epoch": 0.08645,
+      "grad_norm": 0.5255370140075684,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 8645
+    },
+    {
+      "epoch": 0.08646,
+      "grad_norm": 0.5354486703872681,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 8646
+    },
+    {
+      "epoch": 0.08647,
+      "grad_norm": 0.5208635926246643,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 8647
+    },
+    {
+      "epoch": 0.08648,
+      "grad_norm": 0.4734020233154297,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 8648
+    },
+    {
+      "epoch": 0.08649,
+      "grad_norm": 0.49996834993362427,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 8649
+    },
+    {
+      "epoch": 0.0865,
+      "grad_norm": 0.6042665839195251,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 8650
+    },
+    {
+      "epoch": 0.08651,
+      "grad_norm": 0.647171139717102,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 8651
+    },
+    {
+      "epoch": 0.08652,
+      "grad_norm": 0.6678754687309265,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 8652
+    },
+    {
+      "epoch": 0.08653,
+      "grad_norm": 0.7537620663642883,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 8653
+    },
+    {
+      "epoch": 0.08654,
+      "grad_norm": 0.8044998049736023,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 8654
+    },
+    {
+      "epoch": 0.08655,
+      "grad_norm": 0.8797716498374939,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 8655
+    },
+    {
+      "epoch": 0.08656,
+      "grad_norm": 0.9785940051078796,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 8656
+    },
+    {
+      "epoch": 0.08657,
+      "grad_norm": 0.9777019023895264,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 8657
+    },
+    {
+      "epoch": 0.08658,
+      "grad_norm": 0.9151862263679504,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 8658
+    },
+    {
+      "epoch": 0.08659,
+      "grad_norm": 0.8125520348548889,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 8659
+    },
+    {
+      "epoch": 0.0866,
+      "grad_norm": 0.8756903409957886,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 8660
+    },
+    {
+      "epoch": 0.08661,
+      "grad_norm": 0.9547138810157776,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 8661
+    },
+    {
+      "epoch": 0.08662,
+      "grad_norm": 0.7263685464859009,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 8662
+    },
+    {
+      "epoch": 0.08663,
+      "grad_norm": 0.7474812269210815,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 8663
+    },
+    {
+      "epoch": 0.08664,
+      "grad_norm": 0.7189542055130005,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 8664
+    },
+    {
+      "epoch": 0.08665,
+      "grad_norm": 0.833405077457428,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 8665
+    },
+    {
+      "epoch": 0.08666,
+      "grad_norm": 1.123273491859436,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 8666
+    },
+    {
+      "epoch": 0.08667,
+      "grad_norm": 0.9802038073539734,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 8667
+    },
+    {
+      "epoch": 0.08668,
+      "grad_norm": 0.8807769417762756,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 8668
+    },
+    {
+      "epoch": 0.08669,
+      "grad_norm": 0.7354418635368347,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 8669
+    },
+    {
+      "epoch": 0.0867,
+      "grad_norm": 0.6019794940948486,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 8670
+    },
+    {
+      "epoch": 0.08671,
+      "grad_norm": 0.5857205986976624,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 8671
+    },
+    {
+      "epoch": 0.08672,
+      "grad_norm": 0.5742001533508301,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 8672
+    },
+    {
+      "epoch": 0.08673,
+      "grad_norm": 0.6621288657188416,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 8673
+    },
+    {
+      "epoch": 0.08674,
+      "grad_norm": 0.719313383102417,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 8674
+    },
+    {
+      "epoch": 0.08675,
+      "grad_norm": 0.8486753106117249,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 8675
+    },
+    {
+      "epoch": 0.08676,
+      "grad_norm": 0.7985334992408752,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 8676
+    },
+    {
+      "epoch": 0.08677,
+      "grad_norm": 0.7688699960708618,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 8677
+    },
+    {
+      "epoch": 0.08678,
+      "grad_norm": 0.801563024520874,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 8678
+    },
+    {
+      "epoch": 0.08679,
+      "grad_norm": 0.791769802570343,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 8679
+    },
+    {
+      "epoch": 0.0868,
+      "grad_norm": 0.8047665953636169,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 8680
+    },
+    {
+      "epoch": 0.08681,
+      "grad_norm": 0.9945483803749084,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 8681
+    },
+    {
+      "epoch": 0.08682,
+      "grad_norm": 0.9851046800613403,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 8682
+    },
+    {
+      "epoch": 0.08683,
+      "grad_norm": 1.0485612154006958,
+      "learning_rate": 0.003,
+      "loss": 4.0787,
+      "step": 8683
+    },
+    {
+      "epoch": 0.08684,
+      "grad_norm": 1.0201568603515625,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 8684
+    },
+    {
+      "epoch": 0.08685,
+      "grad_norm": 1.0376875400543213,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 8685
+    },
+    {
+      "epoch": 0.08686,
+      "grad_norm": 0.805310845375061,
+      "learning_rate": 0.003,
+      "loss": 4.064,
+      "step": 8686
+    },
+    {
+      "epoch": 0.08687,
+      "grad_norm": 0.754871666431427,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 8687
+    },
+    {
+      "epoch": 0.08688,
+      "grad_norm": 0.8903035521507263,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 8688
+    },
+    {
+      "epoch": 0.08689,
+      "grad_norm": 1.157721996307373,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 8689
+    },
+    {
+      "epoch": 0.0869,
+      "grad_norm": 0.8955743312835693,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 8690
+    },
+    {
+      "epoch": 0.08691,
+      "grad_norm": 0.7679038047790527,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 8691
+    },
+    {
+      "epoch": 0.08692,
+      "grad_norm": 0.5723802447319031,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 8692
+    },
+    {
+      "epoch": 0.08693,
+      "grad_norm": 0.6420039534568787,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 8693
+    },
+    {
+      "epoch": 0.08694,
+      "grad_norm": 0.8482261896133423,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 8694
+    },
+    {
+      "epoch": 0.08695,
+      "grad_norm": 0.9729529619216919,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 8695
+    },
+    {
+      "epoch": 0.08696,
+      "grad_norm": 1.0107791423797607,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 8696
+    },
+    {
+      "epoch": 0.08697,
+      "grad_norm": 0.7928081154823303,
+      "learning_rate": 0.003,
+      "loss": 4.0601,
+      "step": 8697
+    },
+    {
+      "epoch": 0.08698,
+      "grad_norm": 0.664397656917572,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 8698
+    },
+    {
+      "epoch": 0.08699,
+      "grad_norm": 0.6472232937812805,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 8699
+    },
+    {
+      "epoch": 0.087,
+      "grad_norm": 0.644534170627594,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 8700
+    },
+    {
+      "epoch": 0.08701,
+      "grad_norm": 0.7472826838493347,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 8701
+    },
+    {
+      "epoch": 0.08702,
+      "grad_norm": 0.7674583196640015,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 8702
+    },
+    {
+      "epoch": 0.08703,
+      "grad_norm": 0.7745077013969421,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 8703
+    },
+    {
+      "epoch": 0.08704,
+      "grad_norm": 0.695412814617157,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 8704
+    },
+    {
+      "epoch": 0.08705,
+      "grad_norm": 0.6802660822868347,
+      "learning_rate": 0.003,
+      "loss": 4.0635,
+      "step": 8705
+    },
+    {
+      "epoch": 0.08706,
+      "grad_norm": 0.6403552889823914,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 8706
+    },
+    {
+      "epoch": 0.08707,
+      "grad_norm": 0.5612523555755615,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 8707
+    },
+    {
+      "epoch": 0.08708,
+      "grad_norm": 0.5290988683700562,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 8708
+    },
+    {
+      "epoch": 0.08709,
+      "grad_norm": 0.5752286911010742,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 8709
+    },
+    {
+      "epoch": 0.0871,
+      "grad_norm": 0.5760716199874878,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 8710
+    },
+    {
+      "epoch": 0.08711,
+      "grad_norm": 0.6479002237319946,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 8711
+    },
+    {
+      "epoch": 0.08712,
+      "grad_norm": 0.6316835880279541,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 8712
+    },
+    {
+      "epoch": 0.08713,
+      "grad_norm": 0.6163070797920227,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 8713
+    },
+    {
+      "epoch": 0.08714,
+      "grad_norm": 0.6544851064682007,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 8714
+    },
+    {
+      "epoch": 0.08715,
+      "grad_norm": 0.6510108113288879,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 8715
+    },
+    {
+      "epoch": 0.08716,
+      "grad_norm": 0.6509416103363037,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 8716
+    },
+    {
+      "epoch": 0.08717,
+      "grad_norm": 0.6482957601547241,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 8717
+    },
+    {
+      "epoch": 0.08718,
+      "grad_norm": 0.6902303099632263,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 8718
+    },
+    {
+      "epoch": 0.08719,
+      "grad_norm": 0.7940294146537781,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 8719
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.9621227979660034,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 8720
+    },
+    {
+      "epoch": 0.08721,
+      "grad_norm": 1.084385871887207,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 8721
+    },
+    {
+      "epoch": 0.08722,
+      "grad_norm": 0.8606046438217163,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 8722
+    },
+    {
+      "epoch": 0.08723,
+      "grad_norm": 0.7293562293052673,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 8723
+    },
+    {
+      "epoch": 0.08724,
+      "grad_norm": 0.8104085922241211,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 8724
+    },
+    {
+      "epoch": 0.08725,
+      "grad_norm": 1.134535789489746,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 8725
+    },
+    {
+      "epoch": 0.08726,
+      "grad_norm": 0.9137052893638611,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 8726
+    },
+    {
+      "epoch": 0.08727,
+      "grad_norm": 0.7882857918739319,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 8727
+    },
+    {
+      "epoch": 0.08728,
+      "grad_norm": 0.6920211911201477,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 8728
+    },
+    {
+      "epoch": 0.08729,
+      "grad_norm": 0.769177258014679,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 8729
+    },
+    {
+      "epoch": 0.0873,
+      "grad_norm": 0.7354542016983032,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 8730
+    },
+    {
+      "epoch": 0.08731,
+      "grad_norm": 0.6996873617172241,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 8731
+    },
+    {
+      "epoch": 0.08732,
+      "grad_norm": 0.6775871515274048,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 8732
+    },
+    {
+      "epoch": 0.08733,
+      "grad_norm": 0.6581737995147705,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 8733
+    },
+    {
+      "epoch": 0.08734,
+      "grad_norm": 0.6336266994476318,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 8734
+    },
+    {
+      "epoch": 0.08735,
+      "grad_norm": 0.6769811511039734,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 8735
+    },
+    {
+      "epoch": 0.08736,
+      "grad_norm": 0.6477938890457153,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 8736
+    },
+    {
+      "epoch": 0.08737,
+      "grad_norm": 0.5973692536354065,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 8737
+    },
+    {
+      "epoch": 0.08738,
+      "grad_norm": 0.6644995808601379,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 8738
+    },
+    {
+      "epoch": 0.08739,
+      "grad_norm": 0.8802757859230042,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 8739
+    },
+    {
+      "epoch": 0.0874,
+      "grad_norm": 1.0935834646224976,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 8740
+    },
+    {
+      "epoch": 0.08741,
+      "grad_norm": 0.9527744650840759,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 8741
+    },
+    {
+      "epoch": 0.08742,
+      "grad_norm": 0.7332203984260559,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 8742
+    },
+    {
+      "epoch": 0.08743,
+      "grad_norm": 0.7583456635475159,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 8743
+    },
+    {
+      "epoch": 0.08744,
+      "grad_norm": 0.7854973673820496,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 8744
+    },
+    {
+      "epoch": 0.08745,
+      "grad_norm": 0.721187949180603,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 8745
+    },
+    {
+      "epoch": 0.08746,
+      "grad_norm": 0.7170279026031494,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 8746
+    },
+    {
+      "epoch": 0.08747,
+      "grad_norm": 0.6679021120071411,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 8747
+    },
+    {
+      "epoch": 0.08748,
+      "grad_norm": 0.6513135433197021,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 8748
+    },
+    {
+      "epoch": 0.08749,
+      "grad_norm": 0.7048290371894836,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 8749
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.8222705721855164,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 8750
+    },
+    {
+      "epoch": 0.08751,
+      "grad_norm": 0.8615703582763672,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 8751
+    },
+    {
+      "epoch": 0.08752,
+      "grad_norm": 0.9447921514511108,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 8752
+    },
+    {
+      "epoch": 0.08753,
+      "grad_norm": 1.176733136177063,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 8753
+    },
+    {
+      "epoch": 0.08754,
+      "grad_norm": 0.6545034050941467,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 8754
+    },
+    {
+      "epoch": 0.08755,
+      "grad_norm": 0.681919276714325,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 8755
+    },
+    {
+      "epoch": 0.08756,
+      "grad_norm": 0.7190672159194946,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 8756
+    },
+    {
+      "epoch": 0.08757,
+      "grad_norm": 0.7491109371185303,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 8757
+    },
+    {
+      "epoch": 0.08758,
+      "grad_norm": 0.8422783017158508,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 8758
+    },
+    {
+      "epoch": 0.08759,
+      "grad_norm": 0.8749980330467224,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 8759
+    },
+    {
+      "epoch": 0.0876,
+      "grad_norm": 0.8432993292808533,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 8760
+    },
+    {
+      "epoch": 0.08761,
+      "grad_norm": 0.8244159817695618,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 8761
+    },
+    {
+      "epoch": 0.08762,
+      "grad_norm": 0.8379217982292175,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 8762
+    },
+    {
+      "epoch": 0.08763,
+      "grad_norm": 0.8017216920852661,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 8763
+    },
+    {
+      "epoch": 0.08764,
+      "grad_norm": 0.6845731139183044,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 8764
+    },
+    {
+      "epoch": 0.08765,
+      "grad_norm": 0.6732616424560547,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 8765
+    },
+    {
+      "epoch": 0.08766,
+      "grad_norm": 0.6401374340057373,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 8766
+    },
+    {
+      "epoch": 0.08767,
+      "grad_norm": 0.5827595591545105,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 8767
+    },
+    {
+      "epoch": 0.08768,
+      "grad_norm": 0.5755667686462402,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 8768
+    },
+    {
+      "epoch": 0.08769,
+      "grad_norm": 0.5682708024978638,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 8769
+    },
+    {
+      "epoch": 0.0877,
+      "grad_norm": 0.5289202332496643,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 8770
+    },
+    {
+      "epoch": 0.08771,
+      "grad_norm": 0.5895897746086121,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 8771
+    },
+    {
+      "epoch": 0.08772,
+      "grad_norm": 0.8289152383804321,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 8772
+    },
+    {
+      "epoch": 0.08773,
+      "grad_norm": 1.1563746929168701,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 8773
+    },
+    {
+      "epoch": 0.08774,
+      "grad_norm": 0.7927180528640747,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 8774
+    },
+    {
+      "epoch": 0.08775,
+      "grad_norm": 0.6851915717124939,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 8775
+    },
+    {
+      "epoch": 0.08776,
+      "grad_norm": 0.8875740766525269,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 8776
+    },
+    {
+      "epoch": 0.08777,
+      "grad_norm": 1.134511947631836,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 8777
+    },
+    {
+      "epoch": 0.08778,
+      "grad_norm": 0.9227170348167419,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 8778
+    },
+    {
+      "epoch": 0.08779,
+      "grad_norm": 0.8097333312034607,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 8779
+    },
+    {
+      "epoch": 0.0878,
+      "grad_norm": 0.6999717354774475,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 8780
+    },
+    {
+      "epoch": 0.08781,
+      "grad_norm": 0.6974648237228394,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 8781
+    },
+    {
+      "epoch": 0.08782,
+      "grad_norm": 0.7920845746994019,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 8782
+    },
+    {
+      "epoch": 0.08783,
+      "grad_norm": 0.9354571104049683,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 8783
+    },
+    {
+      "epoch": 0.08784,
+      "grad_norm": 0.8569172620773315,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 8784
+    },
+    {
+      "epoch": 0.08785,
+      "grad_norm": 0.8295965194702148,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 8785
+    },
+    {
+      "epoch": 0.08786,
+      "grad_norm": 0.7757327556610107,
+      "learning_rate": 0.003,
+      "loss": 4.0648,
+      "step": 8786
+    },
+    {
+      "epoch": 0.08787,
+      "grad_norm": 0.7948103547096252,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 8787
+    },
+    {
+      "epoch": 0.08788,
+      "grad_norm": 0.9465476870536804,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 8788
+    },
+    {
+      "epoch": 0.08789,
+      "grad_norm": 1.0520426034927368,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 8789
+    },
+    {
+      "epoch": 0.0879,
+      "grad_norm": 0.8785009384155273,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 8790
+    },
+    {
+      "epoch": 0.08791,
+      "grad_norm": 0.7729325890541077,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 8791
+    },
+    {
+      "epoch": 0.08792,
+      "grad_norm": 0.7058838605880737,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 8792
+    },
+    {
+      "epoch": 0.08793,
+      "grad_norm": 0.6069271564483643,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 8793
+    },
+    {
+      "epoch": 0.08794,
+      "grad_norm": 0.6468567252159119,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 8794
+    },
+    {
+      "epoch": 0.08795,
+      "grad_norm": 0.6967955231666565,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 8795
+    },
+    {
+      "epoch": 0.08796,
+      "grad_norm": 0.6414638757705688,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 8796
+    },
+    {
+      "epoch": 0.08797,
+      "grad_norm": 0.5752788782119751,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 8797
+    },
+    {
+      "epoch": 0.08798,
+      "grad_norm": 0.5287693738937378,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 8798
+    },
+    {
+      "epoch": 0.08799,
+      "grad_norm": 0.5393201112747192,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 8799
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.6019169688224792,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 8800
+    },
+    {
+      "epoch": 0.08801,
+      "grad_norm": 0.7389690279960632,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 8801
+    },
+    {
+      "epoch": 0.08802,
+      "grad_norm": 1.005857229232788,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 8802
+    },
+    {
+      "epoch": 0.08803,
+      "grad_norm": 1.0771352052688599,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 8803
+    },
+    {
+      "epoch": 0.08804,
+      "grad_norm": 0.7955772876739502,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 8804
+    },
+    {
+      "epoch": 0.08805,
+      "grad_norm": 0.7112957239151001,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 8805
+    },
+    {
+      "epoch": 0.08806,
+      "grad_norm": 0.7480246424674988,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 8806
+    },
+    {
+      "epoch": 0.08807,
+      "grad_norm": 0.6301780343055725,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 8807
+    },
+    {
+      "epoch": 0.08808,
+      "grad_norm": 0.7244052886962891,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 8808
+    },
+    {
+      "epoch": 0.08809,
+      "grad_norm": 0.6920201182365417,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 8809
+    },
+    {
+      "epoch": 0.0881,
+      "grad_norm": 0.6484432816505432,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 8810
+    },
+    {
+      "epoch": 0.08811,
+      "grad_norm": 0.6578159332275391,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 8811
+    },
+    {
+      "epoch": 0.08812,
+      "grad_norm": 0.7231255173683167,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 8812
+    },
+    {
+      "epoch": 0.08813,
+      "grad_norm": 0.7631604671478271,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 8813
+    },
+    {
+      "epoch": 0.08814,
+      "grad_norm": 0.7701826095581055,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 8814
+    },
+    {
+      "epoch": 0.08815,
+      "grad_norm": 0.8050733804702759,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 8815
+    },
+    {
+      "epoch": 0.08816,
+      "grad_norm": 0.8618783950805664,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 8816
+    },
+    {
+      "epoch": 0.08817,
+      "grad_norm": 0.9637402296066284,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 8817
+    },
+    {
+      "epoch": 0.08818,
+      "grad_norm": 1.014755129814148,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 8818
+    },
+    {
+      "epoch": 0.08819,
+      "grad_norm": 0.7961846590042114,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 8819
+    },
+    {
+      "epoch": 0.0882,
+      "grad_norm": 0.7335184812545776,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 8820
+    },
+    {
+      "epoch": 0.08821,
+      "grad_norm": 0.6655312180519104,
+      "learning_rate": 0.003,
+      "loss": 4.0629,
+      "step": 8821
+    },
+    {
+      "epoch": 0.08822,
+      "grad_norm": 0.5640453696250916,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 8822
+    },
+    {
+      "epoch": 0.08823,
+      "grad_norm": 0.5673083066940308,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 8823
+    },
+    {
+      "epoch": 0.08824,
+      "grad_norm": 0.6634215712547302,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 8824
+    },
+    {
+      "epoch": 0.08825,
+      "grad_norm": 0.8428478837013245,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 8825
+    },
+    {
+      "epoch": 0.08826,
+      "grad_norm": 0.9593620896339417,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 8826
+    },
+    {
+      "epoch": 0.08827,
+      "grad_norm": 0.9794454574584961,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 8827
+    },
+    {
+      "epoch": 0.08828,
+      "grad_norm": 0.8954617381095886,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 8828
+    },
+    {
+      "epoch": 0.08829,
+      "grad_norm": 0.8380072712898254,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 8829
+    },
+    {
+      "epoch": 0.0883,
+      "grad_norm": 0.7732906341552734,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 8830
+    },
+    {
+      "epoch": 0.08831,
+      "grad_norm": 0.7492073774337769,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 8831
+    },
+    {
+      "epoch": 0.08832,
+      "grad_norm": 0.7155649065971375,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 8832
+    },
+    {
+      "epoch": 0.08833,
+      "grad_norm": 0.6484860181808472,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 8833
+    },
+    {
+      "epoch": 0.08834,
+      "grad_norm": 0.6914095878601074,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 8834
+    },
+    {
+      "epoch": 0.08835,
+      "grad_norm": 0.8174039721488953,
+      "learning_rate": 0.003,
+      "loss": 4.0605,
+      "step": 8835
+    },
+    {
+      "epoch": 0.08836,
+      "grad_norm": 0.7501481771469116,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 8836
+    },
+    {
+      "epoch": 0.08837,
+      "grad_norm": 0.6577468514442444,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 8837
+    },
+    {
+      "epoch": 0.08838,
+      "grad_norm": 0.6486355066299438,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 8838
+    },
+    {
+      "epoch": 0.08839,
+      "grad_norm": 0.7385262250900269,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 8839
+    },
+    {
+      "epoch": 0.0884,
+      "grad_norm": 0.9628206491470337,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 8840
+    },
+    {
+      "epoch": 0.08841,
+      "grad_norm": 1.0862576961517334,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 8841
+    },
+    {
+      "epoch": 0.08842,
+      "grad_norm": 0.941209077835083,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 8842
+    },
+    {
+      "epoch": 0.08843,
+      "grad_norm": 0.7774216532707214,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 8843
+    },
+    {
+      "epoch": 0.08844,
+      "grad_norm": 0.6513327956199646,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 8844
+    },
+    {
+      "epoch": 0.08845,
+      "grad_norm": 0.7469035983085632,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 8845
+    },
+    {
+      "epoch": 0.08846,
+      "grad_norm": 0.770413875579834,
+      "learning_rate": 0.003,
+      "loss": 4.0663,
+      "step": 8846
+    },
+    {
+      "epoch": 0.08847,
+      "grad_norm": 0.8194745182991028,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 8847
+    },
+    {
+      "epoch": 0.08848,
+      "grad_norm": 0.9441425800323486,
+      "learning_rate": 0.003,
+      "loss": 4.0652,
+      "step": 8848
+    },
+    {
+      "epoch": 0.08849,
+      "grad_norm": 0.8423975110054016,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 8849
+    },
+    {
+      "epoch": 0.0885,
+      "grad_norm": 0.928989589214325,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 8850
+    },
+    {
+      "epoch": 0.08851,
+      "grad_norm": 0.954427182674408,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 8851
+    },
+    {
+      "epoch": 0.08852,
+      "grad_norm": 0.9665095210075378,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 8852
+    },
+    {
+      "epoch": 0.08853,
+      "grad_norm": 0.8344659209251404,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 8853
+    },
+    {
+      "epoch": 0.08854,
+      "grad_norm": 0.8386094570159912,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 8854
+    },
+    {
+      "epoch": 0.08855,
+      "grad_norm": 0.862771213054657,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 8855
+    },
+    {
+      "epoch": 0.08856,
+      "grad_norm": 0.779102623462677,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 8856
+    },
+    {
+      "epoch": 0.08857,
+      "grad_norm": 0.8765580654144287,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 8857
+    },
+    {
+      "epoch": 0.08858,
+      "grad_norm": 0.917957067489624,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 8858
+    },
+    {
+      "epoch": 0.08859,
+      "grad_norm": 0.855716347694397,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 8859
+    },
+    {
+      "epoch": 0.0886,
+      "grad_norm": 0.866002082824707,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 8860
+    },
+    {
+      "epoch": 0.08861,
+      "grad_norm": 0.7670868635177612,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 8861
+    },
+    {
+      "epoch": 0.08862,
+      "grad_norm": 0.7301899790763855,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 8862
+    },
+    {
+      "epoch": 0.08863,
+      "grad_norm": 0.728283166885376,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 8863
+    },
+    {
+      "epoch": 0.08864,
+      "grad_norm": 0.7372556924819946,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 8864
+    },
+    {
+      "epoch": 0.08865,
+      "grad_norm": 0.7424993515014648,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 8865
+    },
+    {
+      "epoch": 0.08866,
+      "grad_norm": 0.7967926263809204,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 8866
+    },
+    {
+      "epoch": 0.08867,
+      "grad_norm": 0.7535709738731384,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 8867
+    },
+    {
+      "epoch": 0.08868,
+      "grad_norm": 0.8728439211845398,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 8868
+    },
+    {
+      "epoch": 0.08869,
+      "grad_norm": 0.8164510130882263,
+      "learning_rate": 0.003,
+      "loss": 4.084,
+      "step": 8869
+    },
+    {
+      "epoch": 0.0887,
+      "grad_norm": 0.750055730342865,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 8870
+    },
+    {
+      "epoch": 0.08871,
+      "grad_norm": 0.7114226222038269,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 8871
+    },
+    {
+      "epoch": 0.08872,
+      "grad_norm": 0.6471983790397644,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 8872
+    },
+    {
+      "epoch": 0.08873,
+      "grad_norm": 0.5536442995071411,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 8873
+    },
+    {
+      "epoch": 0.08874,
+      "grad_norm": 0.5485543608665466,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 8874
+    },
+    {
+      "epoch": 0.08875,
+      "grad_norm": 0.5261621475219727,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 8875
+    },
+    {
+      "epoch": 0.08876,
+      "grad_norm": 0.486948162317276,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 8876
+    },
+    {
+      "epoch": 0.08877,
+      "grad_norm": 0.5024458765983582,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 8877
+    },
+    {
+      "epoch": 0.08878,
+      "grad_norm": 0.5072246193885803,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 8878
+    },
+    {
+      "epoch": 0.08879,
+      "grad_norm": 0.4610580503940582,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 8879
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.4206774830818176,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 8880
+    },
+    {
+      "epoch": 0.08881,
+      "grad_norm": 0.4502694010734558,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 8881
+    },
+    {
+      "epoch": 0.08882,
+      "grad_norm": 0.42741695046424866,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 8882
+    },
+    {
+      "epoch": 0.08883,
+      "grad_norm": 0.40149998664855957,
+      "learning_rate": 0.003,
+      "loss": 4.0673,
+      "step": 8883
+    },
+    {
+      "epoch": 0.08884,
+      "grad_norm": 0.4078429043292999,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 8884
+    },
+    {
+      "epoch": 0.08885,
+      "grad_norm": 0.5221074819564819,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 8885
+    },
+    {
+      "epoch": 0.08886,
+      "grad_norm": 0.617674708366394,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 8886
+    },
+    {
+      "epoch": 0.08887,
+      "grad_norm": 0.6875243782997131,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 8887
+    },
+    {
+      "epoch": 0.08888,
+      "grad_norm": 0.8016972541809082,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 8888
+    },
+    {
+      "epoch": 0.08889,
+      "grad_norm": 1.1390345096588135,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 8889
+    },
+    {
+      "epoch": 0.0889,
+      "grad_norm": 1.3257744312286377,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 8890
+    },
+    {
+      "epoch": 0.08891,
+      "grad_norm": 0.6148163080215454,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 8891
+    },
+    {
+      "epoch": 0.08892,
+      "grad_norm": 0.7419478297233582,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 8892
+    },
+    {
+      "epoch": 0.08893,
+      "grad_norm": 0.9414969086647034,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 8893
+    },
+    {
+      "epoch": 0.08894,
+      "grad_norm": 0.9926276803016663,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 8894
+    },
+    {
+      "epoch": 0.08895,
+      "grad_norm": 1.1406117677688599,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 8895
+    },
+    {
+      "epoch": 0.08896,
+      "grad_norm": 0.8471540212631226,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 8896
+    },
+    {
+      "epoch": 0.08897,
+      "grad_norm": 0.7552980780601501,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 8897
+    },
+    {
+      "epoch": 0.08898,
+      "grad_norm": 0.8421490788459778,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 8898
+    },
+    {
+      "epoch": 0.08899,
+      "grad_norm": 0.9038390517234802,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 8899
+    },
+    {
+      "epoch": 0.089,
+      "grad_norm": 0.9617167115211487,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 8900
+    },
+    {
+      "epoch": 0.08901,
+      "grad_norm": 0.9217119812965393,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 8901
+    },
+    {
+      "epoch": 0.08902,
+      "grad_norm": 0.8489481210708618,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 8902
+    },
+    {
+      "epoch": 0.08903,
+      "grad_norm": 0.9893561005592346,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 8903
+    },
+    {
+      "epoch": 0.08904,
+      "grad_norm": 1.184971570968628,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 8904
+    },
+    {
+      "epoch": 0.08905,
+      "grad_norm": 0.829010546207428,
+      "learning_rate": 0.003,
+      "loss": 4.0668,
+      "step": 8905
+    },
+    {
+      "epoch": 0.08906,
+      "grad_norm": 0.8626430630683899,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 8906
+    },
+    {
+      "epoch": 0.08907,
+      "grad_norm": 0.961496889591217,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 8907
+    },
+    {
+      "epoch": 0.08908,
+      "grad_norm": 1.0651192665100098,
+      "learning_rate": 0.003,
+      "loss": 4.073,
+      "step": 8908
+    },
+    {
+      "epoch": 0.08909,
+      "grad_norm": 0.9574206471443176,
+      "learning_rate": 0.003,
+      "loss": 4.066,
+      "step": 8909
+    },
+    {
+      "epoch": 0.0891,
+      "grad_norm": 0.8476295471191406,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 8910
+    },
+    {
+      "epoch": 0.08911,
+      "grad_norm": 0.8135413527488708,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 8911
+    },
+    {
+      "epoch": 0.08912,
+      "grad_norm": 0.8378600478172302,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 8912
+    },
+    {
+      "epoch": 0.08913,
+      "grad_norm": 0.7921456694602966,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 8913
+    },
+    {
+      "epoch": 0.08914,
+      "grad_norm": 0.8137087821960449,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 8914
+    },
+    {
+      "epoch": 0.08915,
+      "grad_norm": 0.7893593311309814,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 8915
+    },
+    {
+      "epoch": 0.08916,
+      "grad_norm": 0.7994639873504639,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 8916
+    },
+    {
+      "epoch": 0.08917,
+      "grad_norm": 0.7975410223007202,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 8917
+    },
+    {
+      "epoch": 0.08918,
+      "grad_norm": 0.6501461863517761,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 8918
+    },
+    {
+      "epoch": 0.08919,
+      "grad_norm": 0.5666654706001282,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 8919
+    },
+    {
+      "epoch": 0.0892,
+      "grad_norm": 0.5936571359634399,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 8920
+    },
+    {
+      "epoch": 0.08921,
+      "grad_norm": 0.5901350378990173,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 8921
+    },
+    {
+      "epoch": 0.08922,
+      "grad_norm": 0.6528334617614746,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 8922
+    },
+    {
+      "epoch": 0.08923,
+      "grad_norm": 0.7454528212547302,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 8923
+    },
+    {
+      "epoch": 0.08924,
+      "grad_norm": 0.7825729250907898,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 8924
+    },
+    {
+      "epoch": 0.08925,
+      "grad_norm": 0.7133893370628357,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 8925
+    },
+    {
+      "epoch": 0.08926,
+      "grad_norm": 0.7308877110481262,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 8926
+    },
+    {
+      "epoch": 0.08927,
+      "grad_norm": 0.7993197441101074,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 8927
+    },
+    {
+      "epoch": 0.08928,
+      "grad_norm": 0.8782841563224792,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 8928
+    },
+    {
+      "epoch": 0.08929,
+      "grad_norm": 0.9822357296943665,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 8929
+    },
+    {
+      "epoch": 0.0893,
+      "grad_norm": 0.9535257816314697,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 8930
+    },
+    {
+      "epoch": 0.08931,
+      "grad_norm": 0.830475926399231,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 8931
+    },
+    {
+      "epoch": 0.08932,
+      "grad_norm": 0.7698143124580383,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 8932
+    },
+    {
+      "epoch": 0.08933,
+      "grad_norm": 0.8662832379341125,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 8933
+    },
+    {
+      "epoch": 0.08934,
+      "grad_norm": 0.8279432654380798,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 8934
+    },
+    {
+      "epoch": 0.08935,
+      "grad_norm": 0.7569650411605835,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 8935
+    },
+    {
+      "epoch": 0.08936,
+      "grad_norm": 0.6220132112503052,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 8936
+    },
+    {
+      "epoch": 0.08937,
+      "grad_norm": 0.618433952331543,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 8937
+    },
+    {
+      "epoch": 0.08938,
+      "grad_norm": 0.7037031650543213,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 8938
+    },
+    {
+      "epoch": 0.08939,
+      "grad_norm": 0.7682626843452454,
+      "learning_rate": 0.003,
+      "loss": 4.0678,
+      "step": 8939
+    },
+    {
+      "epoch": 0.0894,
+      "grad_norm": 0.8006645441055298,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 8940
+    },
+    {
+      "epoch": 0.08941,
+      "grad_norm": 0.7939675450325012,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 8941
+    },
+    {
+      "epoch": 0.08942,
+      "grad_norm": 0.7516507506370544,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 8942
+    },
+    {
+      "epoch": 0.08943,
+      "grad_norm": 0.657136082649231,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 8943
+    },
+    {
+      "epoch": 0.08944,
+      "grad_norm": 0.6682058572769165,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 8944
+    },
+    {
+      "epoch": 0.08945,
+      "grad_norm": 0.6049004793167114,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 8945
+    },
+    {
+      "epoch": 0.08946,
+      "grad_norm": 0.6015246510505676,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 8946
+    },
+    {
+      "epoch": 0.08947,
+      "grad_norm": 0.727924108505249,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 8947
+    },
+    {
+      "epoch": 0.08948,
+      "grad_norm": 0.7806482315063477,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 8948
+    },
+    {
+      "epoch": 0.08949,
+      "grad_norm": 0.836334764957428,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 8949
+    },
+    {
+      "epoch": 0.0895,
+      "grad_norm": 0.8896807432174683,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 8950
+    },
+    {
+      "epoch": 0.08951,
+      "grad_norm": 0.9908595681190491,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 8951
+    },
+    {
+      "epoch": 0.08952,
+      "grad_norm": 0.992614209651947,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 8952
+    },
+    {
+      "epoch": 0.08953,
+      "grad_norm": 0.997110903263092,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 8953
+    },
+    {
+      "epoch": 0.08954,
+      "grad_norm": 0.9277966618537903,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 8954
+    },
+    {
+      "epoch": 0.08955,
+      "grad_norm": 0.8257642388343811,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 8955
+    },
+    {
+      "epoch": 0.08956,
+      "grad_norm": 0.9073073267936707,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 8956
+    },
+    {
+      "epoch": 0.08957,
+      "grad_norm": 0.8745629191398621,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 8957
+    },
+    {
+      "epoch": 0.08958,
+      "grad_norm": 0.86850905418396,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 8958
+    },
+    {
+      "epoch": 0.08959,
+      "grad_norm": 0.8888902068138123,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 8959
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.9012948870658875,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 8960
+    },
+    {
+      "epoch": 0.08961,
+      "grad_norm": 0.8690242767333984,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 8961
+    },
+    {
+      "epoch": 0.08962,
+      "grad_norm": 0.863722562789917,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 8962
+    },
+    {
+      "epoch": 0.08963,
+      "grad_norm": 0.7099302411079407,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 8963
+    },
+    {
+      "epoch": 0.08964,
+      "grad_norm": 0.8618674278259277,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 8964
+    },
+    {
+      "epoch": 0.08965,
+      "grad_norm": 1.0636932849884033,
+      "learning_rate": 0.003,
+      "loss": 4.0637,
+      "step": 8965
+    },
+    {
+      "epoch": 0.08966,
+      "grad_norm": 0.7878800630569458,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 8966
+    },
+    {
+      "epoch": 0.08967,
+      "grad_norm": 0.8019261956214905,
+      "learning_rate": 0.003,
+      "loss": 4.071,
+      "step": 8967
+    },
+    {
+      "epoch": 0.08968,
+      "grad_norm": 0.7324358224868774,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 8968
+    },
+    {
+      "epoch": 0.08969,
+      "grad_norm": 0.5871388912200928,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 8969
+    },
+    {
+      "epoch": 0.0897,
+      "grad_norm": 0.6406157612800598,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 8970
+    },
+    {
+      "epoch": 0.08971,
+      "grad_norm": 0.6205124855041504,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 8971
+    },
+    {
+      "epoch": 0.08972,
+      "grad_norm": 0.630068302154541,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 8972
+    },
+    {
+      "epoch": 0.08973,
+      "grad_norm": 0.6608004570007324,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 8973
+    },
+    {
+      "epoch": 0.08974,
+      "grad_norm": 0.675197184085846,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 8974
+    },
+    {
+      "epoch": 0.08975,
+      "grad_norm": 0.7026122808456421,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 8975
+    },
+    {
+      "epoch": 0.08976,
+      "grad_norm": 0.7652435898780823,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 8976
+    },
+    {
+      "epoch": 0.08977,
+      "grad_norm": 0.8026885390281677,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 8977
+    },
+    {
+      "epoch": 0.08978,
+      "grad_norm": 0.7495819926261902,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 8978
+    },
+    {
+      "epoch": 0.08979,
+      "grad_norm": 0.7030617594718933,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 8979
+    },
+    {
+      "epoch": 0.0898,
+      "grad_norm": 0.6318227052688599,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 8980
+    },
+    {
+      "epoch": 0.08981,
+      "grad_norm": 0.6919098496437073,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 8981
+    },
+    {
+      "epoch": 0.08982,
+      "grad_norm": 0.7900656461715698,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 8982
+    },
+    {
+      "epoch": 0.08983,
+      "grad_norm": 0.691038191318512,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 8983
+    },
+    {
+      "epoch": 0.08984,
+      "grad_norm": 0.6485254168510437,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 8984
+    },
+    {
+      "epoch": 0.08985,
+      "grad_norm": 0.5956553816795349,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 8985
+    },
+    {
+      "epoch": 0.08986,
+      "grad_norm": 0.6086512207984924,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 8986
+    },
+    {
+      "epoch": 0.08987,
+      "grad_norm": 0.6024112701416016,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 8987
+    },
+    {
+      "epoch": 0.08988,
+      "grad_norm": 0.6147379279136658,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 8988
+    },
+    {
+      "epoch": 0.08989,
+      "grad_norm": 0.7100562453269958,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 8989
+    },
+    {
+      "epoch": 0.0899,
+      "grad_norm": 0.7910034656524658,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 8990
+    },
+    {
+      "epoch": 0.08991,
+      "grad_norm": 0.9195103645324707,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 8991
+    },
+    {
+      "epoch": 0.08992,
+      "grad_norm": 0.8702215552330017,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 8992
+    },
+    {
+      "epoch": 0.08993,
+      "grad_norm": 0.7406909465789795,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 8993
+    },
+    {
+      "epoch": 0.08994,
+      "grad_norm": 0.6627404689788818,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 8994
+    },
+    {
+      "epoch": 0.08995,
+      "grad_norm": 0.6060569882392883,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 8995
+    },
+    {
+      "epoch": 0.08996,
+      "grad_norm": 0.48137080669403076,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 8996
+    },
+    {
+      "epoch": 0.08997,
+      "grad_norm": 0.5556135177612305,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 8997
+    },
+    {
+      "epoch": 0.08998,
+      "grad_norm": 0.6087490320205688,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 8998
+    },
+    {
+      "epoch": 0.08999,
+      "grad_norm": 0.600788950920105,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 8999
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6633055210113525,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 9000
+    },
+    {
+      "epoch": 0.09001,
+      "grad_norm": 0.7464064359664917,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 9001
+    },
+    {
+      "epoch": 0.09002,
+      "grad_norm": 0.8089091181755066,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 9002
+    },
+    {
+      "epoch": 0.09003,
+      "grad_norm": 0.8683592081069946,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 9003
+    },
+    {
+      "epoch": 0.09004,
+      "grad_norm": 0.9728555679321289,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 9004
+    },
+    {
+      "epoch": 0.09005,
+      "grad_norm": 1.038753867149353,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 9005
+    },
+    {
+      "epoch": 0.09006,
+      "grad_norm": 0.8108296394348145,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 9006
+    },
+    {
+      "epoch": 0.09007,
+      "grad_norm": 0.7779677510261536,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 9007
+    },
+    {
+      "epoch": 0.09008,
+      "grad_norm": 0.7952173948287964,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 9008
+    },
+    {
+      "epoch": 0.09009,
+      "grad_norm": 0.963347315788269,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 9009
+    },
+    {
+      "epoch": 0.0901,
+      "grad_norm": 1.054080605506897,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 9010
+    },
+    {
+      "epoch": 0.09011,
+      "grad_norm": 0.9431594014167786,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 9011
+    },
+    {
+      "epoch": 0.09012,
+      "grad_norm": 0.9199943542480469,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 9012
+    },
+    {
+      "epoch": 0.09013,
+      "grad_norm": 0.9955227971076965,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 9013
+    },
+    {
+      "epoch": 0.09014,
+      "grad_norm": 1.195615291595459,
+      "learning_rate": 0.003,
+      "loss": 4.0749,
+      "step": 9014
+    },
+    {
+      "epoch": 0.09015,
+      "grad_norm": 0.9534950256347656,
+      "learning_rate": 0.003,
+      "loss": 4.0829,
+      "step": 9015
+    },
+    {
+      "epoch": 0.09016,
+      "grad_norm": 0.8363763689994812,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 9016
+    },
+    {
+      "epoch": 0.09017,
+      "grad_norm": 0.8595864772796631,
+      "learning_rate": 0.003,
+      "loss": 4.0656,
+      "step": 9017
+    },
+    {
+      "epoch": 0.09018,
+      "grad_norm": 0.8017337322235107,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 9018
+    },
+    {
+      "epoch": 0.09019,
+      "grad_norm": 0.7216896414756775,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 9019
+    },
+    {
+      "epoch": 0.0902,
+      "grad_norm": 0.7337148189544678,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 9020
+    },
+    {
+      "epoch": 0.09021,
+      "grad_norm": 0.7746081352233887,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 9021
+    },
+    {
+      "epoch": 0.09022,
+      "grad_norm": 0.7762857675552368,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 9022
+    },
+    {
+      "epoch": 0.09023,
+      "grad_norm": 0.6827755570411682,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 9023
+    },
+    {
+      "epoch": 0.09024,
+      "grad_norm": 0.6656430959701538,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 9024
+    },
+    {
+      "epoch": 0.09025,
+      "grad_norm": 0.5954445004463196,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 9025
+    },
+    {
+      "epoch": 0.09026,
+      "grad_norm": 0.6387386918067932,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 9026
+    },
+    {
+      "epoch": 0.09027,
+      "grad_norm": 0.6285505294799805,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 9027
+    },
+    {
+      "epoch": 0.09028,
+      "grad_norm": 0.6999052166938782,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 9028
+    },
+    {
+      "epoch": 0.09029,
+      "grad_norm": 0.9033819437026978,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 9029
+    },
+    {
+      "epoch": 0.0903,
+      "grad_norm": 1.188402533531189,
+      "learning_rate": 0.003,
+      "loss": 4.0714,
+      "step": 9030
+    },
+    {
+      "epoch": 0.09031,
+      "grad_norm": 0.7485238313674927,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 9031
+    },
+    {
+      "epoch": 0.09032,
+      "grad_norm": 0.5360812544822693,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 9032
+    },
+    {
+      "epoch": 0.09033,
+      "grad_norm": 0.6563149690628052,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 9033
+    },
+    {
+      "epoch": 0.09034,
+      "grad_norm": 0.7612379193305969,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 9034
+    },
+    {
+      "epoch": 0.09035,
+      "grad_norm": 0.7884421348571777,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 9035
+    },
+    {
+      "epoch": 0.09036,
+      "grad_norm": 0.7918843626976013,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 9036
+    },
+    {
+      "epoch": 0.09037,
+      "grad_norm": 0.8294838070869446,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 9037
+    },
+    {
+      "epoch": 0.09038,
+      "grad_norm": 0.712574303150177,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 9038
+    },
+    {
+      "epoch": 0.09039,
+      "grad_norm": 0.6491422653198242,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 9039
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.6552371382713318,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 9040
+    },
+    {
+      "epoch": 0.09041,
+      "grad_norm": 0.5920805335044861,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 9041
+    },
+    {
+      "epoch": 0.09042,
+      "grad_norm": 0.5702484846115112,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 9042
+    },
+    {
+      "epoch": 0.09043,
+      "grad_norm": 0.5535892844200134,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 9043
+    },
+    {
+      "epoch": 0.09044,
+      "grad_norm": 0.610673189163208,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 9044
+    },
+    {
+      "epoch": 0.09045,
+      "grad_norm": 0.5978013873100281,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 9045
+    },
+    {
+      "epoch": 0.09046,
+      "grad_norm": 0.6004743576049805,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 9046
+    },
+    {
+      "epoch": 0.09047,
+      "grad_norm": 0.7006171345710754,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 9047
+    },
+    {
+      "epoch": 0.09048,
+      "grad_norm": 0.7652348279953003,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 9048
+    },
+    {
+      "epoch": 0.09049,
+      "grad_norm": 0.9703059792518616,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 9049
+    },
+    {
+      "epoch": 0.0905,
+      "grad_norm": 1.1826372146606445,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 9050
+    },
+    {
+      "epoch": 0.09051,
+      "grad_norm": 0.655387818813324,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 9051
+    },
+    {
+      "epoch": 0.09052,
+      "grad_norm": 0.5878477692604065,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 9052
+    },
+    {
+      "epoch": 0.09053,
+      "grad_norm": 0.5366464257240295,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 9053
+    },
+    {
+      "epoch": 0.09054,
+      "grad_norm": 0.5335559844970703,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 9054
+    },
+    {
+      "epoch": 0.09055,
+      "grad_norm": 0.673572301864624,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 9055
+    },
+    {
+      "epoch": 0.09056,
+      "grad_norm": 0.7973268032073975,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 9056
+    },
+    {
+      "epoch": 0.09057,
+      "grad_norm": 0.8581774234771729,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 9057
+    },
+    {
+      "epoch": 0.09058,
+      "grad_norm": 0.8413546681404114,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 9058
+    },
+    {
+      "epoch": 0.09059,
+      "grad_norm": 0.8598448634147644,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 9059
+    },
+    {
+      "epoch": 0.0906,
+      "grad_norm": 0.8993529677391052,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 9060
+    },
+    {
+      "epoch": 0.09061,
+      "grad_norm": 1.0165449380874634,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 9061
+    },
+    {
+      "epoch": 0.09062,
+      "grad_norm": 0.9853262305259705,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 9062
+    },
+    {
+      "epoch": 0.09063,
+      "grad_norm": 0.8807757496833801,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 9063
+    },
+    {
+      "epoch": 0.09064,
+      "grad_norm": 0.8548852801322937,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 9064
+    },
+    {
+      "epoch": 0.09065,
+      "grad_norm": 0.9431764483451843,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 9065
+    },
+    {
+      "epoch": 0.09066,
+      "grad_norm": 1.1532913446426392,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 9066
+    },
+    {
+      "epoch": 0.09067,
+      "grad_norm": 1.311736822128296,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 9067
+    },
+    {
+      "epoch": 0.09068,
+      "grad_norm": 0.796724796295166,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 9068
+    },
+    {
+      "epoch": 0.09069,
+      "grad_norm": 0.7498504519462585,
+      "learning_rate": 0.003,
+      "loss": 4.0912,
+      "step": 9069
+    },
+    {
+      "epoch": 0.0907,
+      "grad_norm": 0.8895300030708313,
+      "learning_rate": 0.003,
+      "loss": 4.0784,
+      "step": 9070
+    },
+    {
+      "epoch": 0.09071,
+      "grad_norm": 0.9477413296699524,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 9071
+    },
+    {
+      "epoch": 0.09072,
+      "grad_norm": 0.7607925534248352,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 9072
+    },
+    {
+      "epoch": 0.09073,
+      "grad_norm": 0.7120130658149719,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 9073
+    },
+    {
+      "epoch": 0.09074,
+      "grad_norm": 0.7336847186088562,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 9074
+    },
+    {
+      "epoch": 0.09075,
+      "grad_norm": 0.6090520024299622,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 9075
+    },
+    {
+      "epoch": 0.09076,
+      "grad_norm": 0.6007069945335388,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 9076
+    },
+    {
+      "epoch": 0.09077,
+      "grad_norm": 0.6927357316017151,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 9077
+    },
+    {
+      "epoch": 0.09078,
+      "grad_norm": 0.863045334815979,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 9078
+    },
+    {
+      "epoch": 0.09079,
+      "grad_norm": 1.1068956851959229,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 9079
+    },
+    {
+      "epoch": 0.0908,
+      "grad_norm": 0.8308747410774231,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 9080
+    },
+    {
+      "epoch": 0.09081,
+      "grad_norm": 0.6511629819869995,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 9081
+    },
+    {
+      "epoch": 0.09082,
+      "grad_norm": 0.582959771156311,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 9082
+    },
+    {
+      "epoch": 0.09083,
+      "grad_norm": 0.6266202330589294,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 9083
+    },
+    {
+      "epoch": 0.09084,
+      "grad_norm": 0.6483187079429626,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 9084
+    },
+    {
+      "epoch": 0.09085,
+      "grad_norm": 0.6330807209014893,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 9085
+    },
+    {
+      "epoch": 0.09086,
+      "grad_norm": 0.5214323401451111,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 9086
+    },
+    {
+      "epoch": 0.09087,
+      "grad_norm": 0.5562803745269775,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 9087
+    },
+    {
+      "epoch": 0.09088,
+      "grad_norm": 0.606450080871582,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 9088
+    },
+    {
+      "epoch": 0.09089,
+      "grad_norm": 0.5667656064033508,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 9089
+    },
+    {
+      "epoch": 0.0909,
+      "grad_norm": 0.5123067498207092,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 9090
+    },
+    {
+      "epoch": 0.09091,
+      "grad_norm": 0.4887959957122803,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 9091
+    },
+    {
+      "epoch": 0.09092,
+      "grad_norm": 0.4976811110973358,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 9092
+    },
+    {
+      "epoch": 0.09093,
+      "grad_norm": 0.4877670109272003,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 9093
+    },
+    {
+      "epoch": 0.09094,
+      "grad_norm": 0.48510006070137024,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 9094
+    },
+    {
+      "epoch": 0.09095,
+      "grad_norm": 0.5514529347419739,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 9095
+    },
+    {
+      "epoch": 0.09096,
+      "grad_norm": 0.6349421739578247,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 9096
+    },
+    {
+      "epoch": 0.09097,
+      "grad_norm": 0.9241090416908264,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 9097
+    },
+    {
+      "epoch": 0.09098,
+      "grad_norm": 1.3963031768798828,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 9098
+    },
+    {
+      "epoch": 0.09099,
+      "grad_norm": 0.6102014183998108,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 9099
+    },
+    {
+      "epoch": 0.091,
+      "grad_norm": 0.6364977359771729,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 9100
+    },
+    {
+      "epoch": 0.09101,
+      "grad_norm": 0.8607155680656433,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 9101
+    },
+    {
+      "epoch": 0.09102,
+      "grad_norm": 0.8906451463699341,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 9102
+    },
+    {
+      "epoch": 0.09103,
+      "grad_norm": 0.8266270756721497,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 9103
+    },
+    {
+      "epoch": 0.09104,
+      "grad_norm": 0.8795189261436462,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 9104
+    },
+    {
+      "epoch": 0.09105,
+      "grad_norm": 0.8761870265007019,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 9105
+    },
+    {
+      "epoch": 0.09106,
+      "grad_norm": 0.8591187596321106,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 9106
+    },
+    {
+      "epoch": 0.09107,
+      "grad_norm": 1.034685730934143,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 9107
+    },
+    {
+      "epoch": 0.09108,
+      "grad_norm": 1.0666166543960571,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 9108
+    },
+    {
+      "epoch": 0.09109,
+      "grad_norm": 0.9874948859214783,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 9109
+    },
+    {
+      "epoch": 0.0911,
+      "grad_norm": 0.9942079186439514,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 9110
+    },
+    {
+      "epoch": 0.09111,
+      "grad_norm": 1.0025235414505005,
+      "learning_rate": 0.003,
+      "loss": 4.0671,
+      "step": 9111
+    },
+    {
+      "epoch": 0.09112,
+      "grad_norm": 1.0080291032791138,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 9112
+    },
+    {
+      "epoch": 0.09113,
+      "grad_norm": 0.8179364800453186,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 9113
+    },
+    {
+      "epoch": 0.09114,
+      "grad_norm": 0.6516106128692627,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 9114
+    },
+    {
+      "epoch": 0.09115,
+      "grad_norm": 0.6664100885391235,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 9115
+    },
+    {
+      "epoch": 0.09116,
+      "grad_norm": 0.7324456572532654,
+      "learning_rate": 0.003,
+      "loss": 4.0877,
+      "step": 9116
+    },
+    {
+      "epoch": 0.09117,
+      "grad_norm": 0.7465541362762451,
+      "learning_rate": 0.003,
+      "loss": 4.0573,
+      "step": 9117
+    },
+    {
+      "epoch": 0.09118,
+      "grad_norm": 0.7696415185928345,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 9118
+    },
+    {
+      "epoch": 0.09119,
+      "grad_norm": 0.6559749245643616,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 9119
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.7420315146446228,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 9120
+    },
+    {
+      "epoch": 0.09121,
+      "grad_norm": 0.8487631678581238,
+      "learning_rate": 0.003,
+      "loss": 4.0767,
+      "step": 9121
+    },
+    {
+      "epoch": 0.09122,
+      "grad_norm": 1.019974946975708,
+      "learning_rate": 0.003,
+      "loss": 4.0748,
+      "step": 9122
+    },
+    {
+      "epoch": 0.09123,
+      "grad_norm": 1.0010663270950317,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 9123
+    },
+    {
+      "epoch": 0.09124,
+      "grad_norm": 1.219331979751587,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 9124
+    },
+    {
+      "epoch": 0.09125,
+      "grad_norm": 1.1329954862594604,
+      "learning_rate": 0.003,
+      "loss": 4.0721,
+      "step": 9125
+    },
+    {
+      "epoch": 0.09126,
+      "grad_norm": 0.9389865398406982,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 9126
+    },
+    {
+      "epoch": 0.09127,
+      "grad_norm": 0.782051682472229,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 9127
+    },
+    {
+      "epoch": 0.09128,
+      "grad_norm": 0.8009851574897766,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 9128
+    },
+    {
+      "epoch": 0.09129,
+      "grad_norm": 0.8631070256233215,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 9129
+    },
+    {
+      "epoch": 0.0913,
+      "grad_norm": 0.9466512799263,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 9130
+    },
+    {
+      "epoch": 0.09131,
+      "grad_norm": 0.840096652507782,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 9131
+    },
+    {
+      "epoch": 0.09132,
+      "grad_norm": 0.751104474067688,
+      "learning_rate": 0.003,
+      "loss": 4.0635,
+      "step": 9132
+    },
+    {
+      "epoch": 0.09133,
+      "grad_norm": 0.7003132104873657,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 9133
+    },
+    {
+      "epoch": 0.09134,
+      "grad_norm": 0.6505602598190308,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 9134
+    },
+    {
+      "epoch": 0.09135,
+      "grad_norm": 0.6527884602546692,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 9135
+    },
+    {
+      "epoch": 0.09136,
+      "grad_norm": 0.7264319062232971,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 9136
+    },
+    {
+      "epoch": 0.09137,
+      "grad_norm": 0.8720904588699341,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 9137
+    },
+    {
+      "epoch": 0.09138,
+      "grad_norm": 0.9983429312705994,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 9138
+    },
+    {
+      "epoch": 0.09139,
+      "grad_norm": 0.9738618731498718,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 9139
+    },
+    {
+      "epoch": 0.0914,
+      "grad_norm": 0.7946332097053528,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 9140
+    },
+    {
+      "epoch": 0.09141,
+      "grad_norm": 0.5857697129249573,
+      "learning_rate": 0.003,
+      "loss": 4.0664,
+      "step": 9141
+    },
+    {
+      "epoch": 0.09142,
+      "grad_norm": 0.5243735909461975,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 9142
+    },
+    {
+      "epoch": 0.09143,
+      "grad_norm": 0.501167893409729,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 9143
+    },
+    {
+      "epoch": 0.09144,
+      "grad_norm": 0.5033556818962097,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 9144
+    },
+    {
+      "epoch": 0.09145,
+      "grad_norm": 0.541192889213562,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 9145
+    },
+    {
+      "epoch": 0.09146,
+      "grad_norm": 0.7105612754821777,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 9146
+    },
+    {
+      "epoch": 0.09147,
+      "grad_norm": 0.8535759449005127,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 9147
+    },
+    {
+      "epoch": 0.09148,
+      "grad_norm": 0.7989993691444397,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 9148
+    },
+    {
+      "epoch": 0.09149,
+      "grad_norm": 0.6929615139961243,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 9149
+    },
+    {
+      "epoch": 0.0915,
+      "grad_norm": 0.58013516664505,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 9150
+    },
+    {
+      "epoch": 0.09151,
+      "grad_norm": 0.4813550114631653,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 9151
+    },
+    {
+      "epoch": 0.09152,
+      "grad_norm": 0.5421327948570251,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 9152
+    },
+    {
+      "epoch": 0.09153,
+      "grad_norm": 0.5891533493995667,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 9153
+    },
+    {
+      "epoch": 0.09154,
+      "grad_norm": 0.6357941031455994,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 9154
+    },
+    {
+      "epoch": 0.09155,
+      "grad_norm": 0.659079372882843,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 9155
+    },
+    {
+      "epoch": 0.09156,
+      "grad_norm": 0.657294511795044,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 9156
+    },
+    {
+      "epoch": 0.09157,
+      "grad_norm": 0.6846706867218018,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 9157
+    },
+    {
+      "epoch": 0.09158,
+      "grad_norm": 0.6189616918563843,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 9158
+    },
+    {
+      "epoch": 0.09159,
+      "grad_norm": 0.6369915008544922,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 9159
+    },
+    {
+      "epoch": 0.0916,
+      "grad_norm": 0.7117534875869751,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 9160
+    },
+    {
+      "epoch": 0.09161,
+      "grad_norm": 0.7279969453811646,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 9161
+    },
+    {
+      "epoch": 0.09162,
+      "grad_norm": 0.7147172689437866,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 9162
+    },
+    {
+      "epoch": 0.09163,
+      "grad_norm": 0.8350904583930969,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 9163
+    },
+    {
+      "epoch": 0.09164,
+      "grad_norm": 1.0859333276748657,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 9164
+    },
+    {
+      "epoch": 0.09165,
+      "grad_norm": 1.132325291633606,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 9165
+    },
+    {
+      "epoch": 0.09166,
+      "grad_norm": 0.6931542754173279,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 9166
+    },
+    {
+      "epoch": 0.09167,
+      "grad_norm": 0.5628350377082825,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 9167
+    },
+    {
+      "epoch": 0.09168,
+      "grad_norm": 0.691896378993988,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 9168
+    },
+    {
+      "epoch": 0.09169,
+      "grad_norm": 0.7813774347305298,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 9169
+    },
+    {
+      "epoch": 0.0917,
+      "grad_norm": 0.8740519285202026,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 9170
+    },
+    {
+      "epoch": 0.09171,
+      "grad_norm": 0.8779575824737549,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 9171
+    },
+    {
+      "epoch": 0.09172,
+      "grad_norm": 0.7493038773536682,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 9172
+    },
+    {
+      "epoch": 0.09173,
+      "grad_norm": 0.6481500864028931,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 9173
+    },
+    {
+      "epoch": 0.09174,
+      "grad_norm": 0.6999644637107849,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 9174
+    },
+    {
+      "epoch": 0.09175,
+      "grad_norm": 0.7405575513839722,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 9175
+    },
+    {
+      "epoch": 0.09176,
+      "grad_norm": 0.7775292992591858,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 9176
+    },
+    {
+      "epoch": 0.09177,
+      "grad_norm": 0.7605270743370056,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 9177
+    },
+    {
+      "epoch": 0.09178,
+      "grad_norm": 0.6888309121131897,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 9178
+    },
+    {
+      "epoch": 0.09179,
+      "grad_norm": 0.7338680624961853,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 9179
+    },
+    {
+      "epoch": 0.0918,
+      "grad_norm": 0.9125281572341919,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 9180
+    },
+    {
+      "epoch": 0.09181,
+      "grad_norm": 1.0294517278671265,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 9181
+    },
+    {
+      "epoch": 0.09182,
+      "grad_norm": 0.9629702568054199,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 9182
+    },
+    {
+      "epoch": 0.09183,
+      "grad_norm": 0.9055722951889038,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 9183
+    },
+    {
+      "epoch": 0.09184,
+      "grad_norm": 0.9649536609649658,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 9184
+    },
+    {
+      "epoch": 0.09185,
+      "grad_norm": 0.9440880417823792,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 9185
+    },
+    {
+      "epoch": 0.09186,
+      "grad_norm": 0.8451281189918518,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 9186
+    },
+    {
+      "epoch": 0.09187,
+      "grad_norm": 0.8534144759178162,
+      "learning_rate": 0.003,
+      "loss": 4.0761,
+      "step": 9187
+    },
+    {
+      "epoch": 0.09188,
+      "grad_norm": 0.8250672817230225,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 9188
+    },
+    {
+      "epoch": 0.09189,
+      "grad_norm": 0.9062046408653259,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 9189
+    },
+    {
+      "epoch": 0.0919,
+      "grad_norm": 0.9313827157020569,
+      "learning_rate": 0.003,
+      "loss": 4.067,
+      "step": 9190
+    },
+    {
+      "epoch": 0.09191,
+      "grad_norm": 0.7684466242790222,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 9191
+    },
+    {
+      "epoch": 0.09192,
+      "grad_norm": 0.6470143795013428,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 9192
+    },
+    {
+      "epoch": 0.09193,
+      "grad_norm": 0.5500982403755188,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 9193
+    },
+    {
+      "epoch": 0.09194,
+      "grad_norm": 0.5531850457191467,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 9194
+    },
+    {
+      "epoch": 0.09195,
+      "grad_norm": 0.5240871906280518,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 9195
+    },
+    {
+      "epoch": 0.09196,
+      "grad_norm": 0.4800591468811035,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 9196
+    },
+    {
+      "epoch": 0.09197,
+      "grad_norm": 0.598760724067688,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 9197
+    },
+    {
+      "epoch": 0.09198,
+      "grad_norm": 0.8965339660644531,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 9198
+    },
+    {
+      "epoch": 0.09199,
+      "grad_norm": 1.3671212196350098,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 9199
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.6588968634605408,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 9200
+    },
+    {
+      "epoch": 0.09201,
+      "grad_norm": 0.6256622672080994,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 9201
+    },
+    {
+      "epoch": 0.09202,
+      "grad_norm": 0.6034049987792969,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 9202
+    },
+    {
+      "epoch": 0.09203,
+      "grad_norm": 0.6372929811477661,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 9203
+    },
+    {
+      "epoch": 0.09204,
+      "grad_norm": 0.530689001083374,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 9204
+    },
+    {
+      "epoch": 0.09205,
+      "grad_norm": 0.5760714411735535,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 9205
+    },
+    {
+      "epoch": 0.09206,
+      "grad_norm": 0.6162959933280945,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 9206
+    },
+    {
+      "epoch": 0.09207,
+      "grad_norm": 0.8327687382698059,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 9207
+    },
+    {
+      "epoch": 0.09208,
+      "grad_norm": 1.0663541555404663,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 9208
+    },
+    {
+      "epoch": 0.09209,
+      "grad_norm": 0.99628084897995,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 9209
+    },
+    {
+      "epoch": 0.0921,
+      "grad_norm": 0.7654755711555481,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 9210
+    },
+    {
+      "epoch": 0.09211,
+      "grad_norm": 0.5291755795478821,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 9211
+    },
+    {
+      "epoch": 0.09212,
+      "grad_norm": 0.545681893825531,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 9212
+    },
+    {
+      "epoch": 0.09213,
+      "grad_norm": 0.7274799346923828,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 9213
+    },
+    {
+      "epoch": 0.09214,
+      "grad_norm": 0.8287873268127441,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 9214
+    },
+    {
+      "epoch": 0.09215,
+      "grad_norm": 0.8357604742050171,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 9215
+    },
+    {
+      "epoch": 0.09216,
+      "grad_norm": 0.9297974705696106,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 9216
+    },
+    {
+      "epoch": 0.09217,
+      "grad_norm": 1.03271484375,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 9217
+    },
+    {
+      "epoch": 0.09218,
+      "grad_norm": 0.8471750617027283,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 9218
+    },
+    {
+      "epoch": 0.09219,
+      "grad_norm": 0.7956122159957886,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 9219
+    },
+    {
+      "epoch": 0.0922,
+      "grad_norm": 0.7104371786117554,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 9220
+    },
+    {
+      "epoch": 0.09221,
+      "grad_norm": 0.8256221413612366,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 9221
+    },
+    {
+      "epoch": 0.09222,
+      "grad_norm": 0.914454460144043,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 9222
+    },
+    {
+      "epoch": 0.09223,
+      "grad_norm": 0.8593582510948181,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 9223
+    },
+    {
+      "epoch": 0.09224,
+      "grad_norm": 0.8074911236763,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 9224
+    },
+    {
+      "epoch": 0.09225,
+      "grad_norm": 0.7950632572174072,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 9225
+    },
+    {
+      "epoch": 0.09226,
+      "grad_norm": 0.7571146488189697,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 9226
+    },
+    {
+      "epoch": 0.09227,
+      "grad_norm": 0.7939788699150085,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 9227
+    },
+    {
+      "epoch": 0.09228,
+      "grad_norm": 0.999841034412384,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 9228
+    },
+    {
+      "epoch": 0.09229,
+      "grad_norm": 1.0757583379745483,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 9229
+    },
+    {
+      "epoch": 0.0923,
+      "grad_norm": 0.8179429769515991,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 9230
+    },
+    {
+      "epoch": 0.09231,
+      "grad_norm": 0.7525832056999207,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 9231
+    },
+    {
+      "epoch": 0.09232,
+      "grad_norm": 0.857976496219635,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 9232
+    },
+    {
+      "epoch": 0.09233,
+      "grad_norm": 1.0185387134552002,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 9233
+    },
+    {
+      "epoch": 0.09234,
+      "grad_norm": 0.9663023948669434,
+      "learning_rate": 0.003,
+      "loss": 4.0753,
+      "step": 9234
+    },
+    {
+      "epoch": 0.09235,
+      "grad_norm": 1.0109349489212036,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 9235
+    },
+    {
+      "epoch": 0.09236,
+      "grad_norm": 1.1014132499694824,
+      "learning_rate": 0.003,
+      "loss": 4.0861,
+      "step": 9236
+    },
+    {
+      "epoch": 0.09237,
+      "grad_norm": 0.919548511505127,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 9237
+    },
+    {
+      "epoch": 0.09238,
+      "grad_norm": 0.8530013561248779,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 9238
+    },
+    {
+      "epoch": 0.09239,
+      "grad_norm": 0.939599871635437,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 9239
+    },
+    {
+      "epoch": 0.0924,
+      "grad_norm": 1.015742540359497,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 9240
+    },
+    {
+      "epoch": 0.09241,
+      "grad_norm": 0.8002774119377136,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 9241
+    },
+    {
+      "epoch": 0.09242,
+      "grad_norm": 0.6762545704841614,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 9242
+    },
+    {
+      "epoch": 0.09243,
+      "grad_norm": 0.6825734376907349,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 9243
+    },
+    {
+      "epoch": 0.09244,
+      "grad_norm": 0.7166531682014465,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 9244
+    },
+    {
+      "epoch": 0.09245,
+      "grad_norm": 0.6903985738754272,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 9245
+    },
+    {
+      "epoch": 0.09246,
+      "grad_norm": 0.6596519351005554,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 9246
+    },
+    {
+      "epoch": 0.09247,
+      "grad_norm": 0.6328878998756409,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 9247
+    },
+    {
+      "epoch": 0.09248,
+      "grad_norm": 0.5870897769927979,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 9248
+    },
+    {
+      "epoch": 0.09249,
+      "grad_norm": 0.5670067071914673,
+      "learning_rate": 0.003,
+      "loss": 4.0799,
+      "step": 9249
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.6085765957832336,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 9250
+    },
+    {
+      "epoch": 0.09251,
+      "grad_norm": 0.5557095408439636,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 9251
+    },
+    {
+      "epoch": 0.09252,
+      "grad_norm": 0.48302340507507324,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 9252
+    },
+    {
+      "epoch": 0.09253,
+      "grad_norm": 0.5740175843238831,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 9253
+    },
+    {
+      "epoch": 0.09254,
+      "grad_norm": 0.7579610347747803,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 9254
+    },
+    {
+      "epoch": 0.09255,
+      "grad_norm": 1.0486177206039429,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 9255
+    },
+    {
+      "epoch": 0.09256,
+      "grad_norm": 1.0068448781967163,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 9256
+    },
+    {
+      "epoch": 0.09257,
+      "grad_norm": 0.7558557987213135,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 9257
+    },
+    {
+      "epoch": 0.09258,
+      "grad_norm": 0.66719651222229,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 9258
+    },
+    {
+      "epoch": 0.09259,
+      "grad_norm": 0.7420986294746399,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 9259
+    },
+    {
+      "epoch": 0.0926,
+      "grad_norm": 0.8558784127235413,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 9260
+    },
+    {
+      "epoch": 0.09261,
+      "grad_norm": 0.9271590709686279,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 9261
+    },
+    {
+      "epoch": 0.09262,
+      "grad_norm": 0.8783436417579651,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 9262
+    },
+    {
+      "epoch": 0.09263,
+      "grad_norm": 0.7863196730613708,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 9263
+    },
+    {
+      "epoch": 0.09264,
+      "grad_norm": 0.6934552192687988,
+      "learning_rate": 0.003,
+      "loss": 4.0662,
+      "step": 9264
+    },
+    {
+      "epoch": 0.09265,
+      "grad_norm": 0.6641402244567871,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 9265
+    },
+    {
+      "epoch": 0.09266,
+      "grad_norm": 0.6042680740356445,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 9266
+    },
+    {
+      "epoch": 0.09267,
+      "grad_norm": 0.6411666870117188,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 9267
+    },
+    {
+      "epoch": 0.09268,
+      "grad_norm": 0.5758857131004333,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 9268
+    },
+    {
+      "epoch": 0.09269,
+      "grad_norm": 0.5887449383735657,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 9269
+    },
+    {
+      "epoch": 0.0927,
+      "grad_norm": 0.6199323534965515,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 9270
+    },
+    {
+      "epoch": 0.09271,
+      "grad_norm": 0.6328174471855164,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 9271
+    },
+    {
+      "epoch": 0.09272,
+      "grad_norm": 0.6937214136123657,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 9272
+    },
+    {
+      "epoch": 0.09273,
+      "grad_norm": 0.7590581178665161,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 9273
+    },
+    {
+      "epoch": 0.09274,
+      "grad_norm": 0.775363564491272,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 9274
+    },
+    {
+      "epoch": 0.09275,
+      "grad_norm": 0.765939474105835,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 9275
+    },
+    {
+      "epoch": 0.09276,
+      "grad_norm": 0.6670629978179932,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 9276
+    },
+    {
+      "epoch": 0.09277,
+      "grad_norm": 0.555806577205658,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 9277
+    },
+    {
+      "epoch": 0.09278,
+      "grad_norm": 0.4805139899253845,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 9278
+    },
+    {
+      "epoch": 0.09279,
+      "grad_norm": 0.4400136172771454,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 9279
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.4695459008216858,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 9280
+    },
+    {
+      "epoch": 0.09281,
+      "grad_norm": 0.4340028762817383,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 9281
+    },
+    {
+      "epoch": 0.09282,
+      "grad_norm": 0.4828382730484009,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 9282
+    },
+    {
+      "epoch": 0.09283,
+      "grad_norm": 0.669910728931427,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 9283
+    },
+    {
+      "epoch": 0.09284,
+      "grad_norm": 0.787203848361969,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 9284
+    },
+    {
+      "epoch": 0.09285,
+      "grad_norm": 0.9136567115783691,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 9285
+    },
+    {
+      "epoch": 0.09286,
+      "grad_norm": 1.051871657371521,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 9286
+    },
+    {
+      "epoch": 0.09287,
+      "grad_norm": 1.0061372518539429,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 9287
+    },
+    {
+      "epoch": 0.09288,
+      "grad_norm": 0.91563880443573,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 9288
+    },
+    {
+      "epoch": 0.09289,
+      "grad_norm": 0.8254592418670654,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 9289
+    },
+    {
+      "epoch": 0.0929,
+      "grad_norm": 0.9599001407623291,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 9290
+    },
+    {
+      "epoch": 0.09291,
+      "grad_norm": 1.0785131454467773,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 9291
+    },
+    {
+      "epoch": 0.09292,
+      "grad_norm": 0.8951935172080994,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 9292
+    },
+    {
+      "epoch": 0.09293,
+      "grad_norm": 0.8868071436882019,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 9293
+    },
+    {
+      "epoch": 0.09294,
+      "grad_norm": 0.9149964451789856,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 9294
+    },
+    {
+      "epoch": 0.09295,
+      "grad_norm": 0.9641788601875305,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 9295
+    },
+    {
+      "epoch": 0.09296,
+      "grad_norm": 1.0511088371276855,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 9296
+    },
+    {
+      "epoch": 0.09297,
+      "grad_norm": 1.1237189769744873,
+      "learning_rate": 0.003,
+      "loss": 4.0604,
+      "step": 9297
+    },
+    {
+      "epoch": 0.09298,
+      "grad_norm": 0.8537935614585876,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 9298
+    },
+    {
+      "epoch": 0.09299,
+      "grad_norm": 0.8743082880973816,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 9299
+    },
+    {
+      "epoch": 0.093,
+      "grad_norm": 0.8369987607002258,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 9300
+    },
+    {
+      "epoch": 0.09301,
+      "grad_norm": 0.7713850736618042,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 9301
+    },
+    {
+      "epoch": 0.09302,
+      "grad_norm": 0.7446997761726379,
+      "learning_rate": 0.003,
+      "loss": 4.0724,
+      "step": 9302
+    },
+    {
+      "epoch": 0.09303,
+      "grad_norm": 0.7269630432128906,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 9303
+    },
+    {
+      "epoch": 0.09304,
+      "grad_norm": 0.6271345615386963,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 9304
+    },
+    {
+      "epoch": 0.09305,
+      "grad_norm": 0.6511696577072144,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 9305
+    },
+    {
+      "epoch": 0.09306,
+      "grad_norm": 0.7046614289283752,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 9306
+    },
+    {
+      "epoch": 0.09307,
+      "grad_norm": 0.6628211140632629,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 9307
+    },
+    {
+      "epoch": 0.09308,
+      "grad_norm": 0.6787553429603577,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 9308
+    },
+    {
+      "epoch": 0.09309,
+      "grad_norm": 0.7450267672538757,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 9309
+    },
+    {
+      "epoch": 0.0931,
+      "grad_norm": 0.8884741067886353,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 9310
+    },
+    {
+      "epoch": 0.09311,
+      "grad_norm": 1.3012408018112183,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 9311
+    },
+    {
+      "epoch": 0.09312,
+      "grad_norm": 0.8154418468475342,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 9312
+    },
+    {
+      "epoch": 0.09313,
+      "grad_norm": 0.7738723158836365,
+      "learning_rate": 0.003,
+      "loss": 4.0727,
+      "step": 9313
+    },
+    {
+      "epoch": 0.09314,
+      "grad_norm": 0.7311467528343201,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 9314
+    },
+    {
+      "epoch": 0.09315,
+      "grad_norm": 0.6791749596595764,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 9315
+    },
+    {
+      "epoch": 0.09316,
+      "grad_norm": 0.7334395051002502,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 9316
+    },
+    {
+      "epoch": 0.09317,
+      "grad_norm": 0.7056670784950256,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 9317
+    },
+    {
+      "epoch": 0.09318,
+      "grad_norm": 0.7427524328231812,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 9318
+    },
+    {
+      "epoch": 0.09319,
+      "grad_norm": 0.773429274559021,
+      "learning_rate": 0.003,
+      "loss": 4.0703,
+      "step": 9319
+    },
+    {
+      "epoch": 0.0932,
+      "grad_norm": 0.8293418884277344,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 9320
+    },
+    {
+      "epoch": 0.09321,
+      "grad_norm": 0.7913047671318054,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 9321
+    },
+    {
+      "epoch": 0.09322,
+      "grad_norm": 0.8516207337379456,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 9322
+    },
+    {
+      "epoch": 0.09323,
+      "grad_norm": 0.9764220714569092,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 9323
+    },
+    {
+      "epoch": 0.09324,
+      "grad_norm": 1.0407365560531616,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 9324
+    },
+    {
+      "epoch": 0.09325,
+      "grad_norm": 0.7859419584274292,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 9325
+    },
+    {
+      "epoch": 0.09326,
+      "grad_norm": 0.7470349073410034,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 9326
+    },
+    {
+      "epoch": 0.09327,
+      "grad_norm": 1.0527385473251343,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 9327
+    },
+    {
+      "epoch": 0.09328,
+      "grad_norm": 1.1210170984268188,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 9328
+    },
+    {
+      "epoch": 0.09329,
+      "grad_norm": 0.8028057217597961,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 9329
+    },
+    {
+      "epoch": 0.0933,
+      "grad_norm": 0.5863675475120544,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 9330
+    },
+    {
+      "epoch": 0.09331,
+      "grad_norm": 0.6289627552032471,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 9331
+    },
+    {
+      "epoch": 0.09332,
+      "grad_norm": 0.6613301634788513,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 9332
+    },
+    {
+      "epoch": 0.09333,
+      "grad_norm": 0.621925413608551,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 9333
+    },
+    {
+      "epoch": 0.09334,
+      "grad_norm": 0.6211190819740295,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 9334
+    },
+    {
+      "epoch": 0.09335,
+      "grad_norm": 0.6644201874732971,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 9335
+    },
+    {
+      "epoch": 0.09336,
+      "grad_norm": 0.6829946637153625,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 9336
+    },
+    {
+      "epoch": 0.09337,
+      "grad_norm": 0.5816702246665955,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 9337
+    },
+    {
+      "epoch": 0.09338,
+      "grad_norm": 0.5819098353385925,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 9338
+    },
+    {
+      "epoch": 0.09339,
+      "grad_norm": 0.5959669351577759,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 9339
+    },
+    {
+      "epoch": 0.0934,
+      "grad_norm": 0.5701149106025696,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 9340
+    },
+    {
+      "epoch": 0.09341,
+      "grad_norm": 0.43852511048316956,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 9341
+    },
+    {
+      "epoch": 0.09342,
+      "grad_norm": 0.48799461126327515,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 9342
+    },
+    {
+      "epoch": 0.09343,
+      "grad_norm": 0.5336135029792786,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 9343
+    },
+    {
+      "epoch": 0.09344,
+      "grad_norm": 0.5141116380691528,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 9344
+    },
+    {
+      "epoch": 0.09345,
+      "grad_norm": 0.47849857807159424,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 9345
+    },
+    {
+      "epoch": 0.09346,
+      "grad_norm": 0.49492859840393066,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 9346
+    },
+    {
+      "epoch": 0.09347,
+      "grad_norm": 0.6012299060821533,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 9347
+    },
+    {
+      "epoch": 0.09348,
+      "grad_norm": 0.8121467232704163,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 9348
+    },
+    {
+      "epoch": 0.09349,
+      "grad_norm": 1.0918225049972534,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 9349
+    },
+    {
+      "epoch": 0.0935,
+      "grad_norm": 1.1535876989364624,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 9350
+    },
+    {
+      "epoch": 0.09351,
+      "grad_norm": 0.6782242655754089,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 9351
+    },
+    {
+      "epoch": 0.09352,
+      "grad_norm": 0.6605706810951233,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 9352
+    },
+    {
+      "epoch": 0.09353,
+      "grad_norm": 0.8234308362007141,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 9353
+    },
+    {
+      "epoch": 0.09354,
+      "grad_norm": 0.7437732815742493,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 9354
+    },
+    {
+      "epoch": 0.09355,
+      "grad_norm": 0.7545703649520874,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 9355
+    },
+    {
+      "epoch": 0.09356,
+      "grad_norm": 0.9154151678085327,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 9356
+    },
+    {
+      "epoch": 0.09357,
+      "grad_norm": 0.9658426642417908,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 9357
+    },
+    {
+      "epoch": 0.09358,
+      "grad_norm": 0.9715717434883118,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 9358
+    },
+    {
+      "epoch": 0.09359,
+      "grad_norm": 0.8674546480178833,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 9359
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.8513766527175903,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 9360
+    },
+    {
+      "epoch": 0.09361,
+      "grad_norm": 0.9162145256996155,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 9361
+    },
+    {
+      "epoch": 0.09362,
+      "grad_norm": 1.0360206365585327,
+      "learning_rate": 0.003,
+      "loss": 4.0867,
+      "step": 9362
+    },
+    {
+      "epoch": 0.09363,
+      "grad_norm": 1.0021082162857056,
+      "learning_rate": 0.003,
+      "loss": 4.0723,
+      "step": 9363
+    },
+    {
+      "epoch": 0.09364,
+      "grad_norm": 0.9146813750267029,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 9364
+    },
+    {
+      "epoch": 0.09365,
+      "grad_norm": 0.8867185711860657,
+      "learning_rate": 0.003,
+      "loss": 4.0709,
+      "step": 9365
+    },
+    {
+      "epoch": 0.09366,
+      "grad_norm": 0.9262175559997559,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 9366
+    },
+    {
+      "epoch": 0.09367,
+      "grad_norm": 0.9771784543991089,
+      "learning_rate": 0.003,
+      "loss": 4.0562,
+      "step": 9367
+    },
+    {
+      "epoch": 0.09368,
+      "grad_norm": 1.0174373388290405,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 9368
+    },
+    {
+      "epoch": 0.09369,
+      "grad_norm": 0.972047746181488,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 9369
+    },
+    {
+      "epoch": 0.0937,
+      "grad_norm": 0.9600644111633301,
+      "learning_rate": 0.003,
+      "loss": 4.074,
+      "step": 9370
+    },
+    {
+      "epoch": 0.09371,
+      "grad_norm": 0.8365176916122437,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 9371
+    },
+    {
+      "epoch": 0.09372,
+      "grad_norm": 0.6783679127693176,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 9372
+    },
+    {
+      "epoch": 0.09373,
+      "grad_norm": 0.6796581149101257,
+      "learning_rate": 0.003,
+      "loss": 4.0838,
+      "step": 9373
+    },
+    {
+      "epoch": 0.09374,
+      "grad_norm": 0.6576843857765198,
+      "learning_rate": 0.003,
+      "loss": 4.0808,
+      "step": 9374
+    },
+    {
+      "epoch": 0.09375,
+      "grad_norm": 0.7115494608879089,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 9375
+    },
+    {
+      "epoch": 0.09376,
+      "grad_norm": 0.7252764105796814,
+      "learning_rate": 0.003,
+      "loss": 4.0641,
+      "step": 9376
+    },
+    {
+      "epoch": 0.09377,
+      "grad_norm": 0.7984814047813416,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 9377
+    },
+    {
+      "epoch": 0.09378,
+      "grad_norm": 0.8357805609703064,
+      "learning_rate": 0.003,
+      "loss": 4.0594,
+      "step": 9378
+    },
+    {
+      "epoch": 0.09379,
+      "grad_norm": 0.7846959233283997,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 9379
+    },
+    {
+      "epoch": 0.0938,
+      "grad_norm": 0.7526464462280273,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 9380
+    },
+    {
+      "epoch": 0.09381,
+      "grad_norm": 0.8302079439163208,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 9381
+    },
+    {
+      "epoch": 0.09382,
+      "grad_norm": 1.106316089630127,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 9382
+    },
+    {
+      "epoch": 0.09383,
+      "grad_norm": 1.034866452217102,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 9383
+    },
+    {
+      "epoch": 0.09384,
+      "grad_norm": 0.8781853914260864,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 9384
+    },
+    {
+      "epoch": 0.09385,
+      "grad_norm": 0.8163227438926697,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 9385
+    },
+    {
+      "epoch": 0.09386,
+      "grad_norm": 0.7359960675239563,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 9386
+    },
+    {
+      "epoch": 0.09387,
+      "grad_norm": 0.8685034513473511,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 9387
+    },
+    {
+      "epoch": 0.09388,
+      "grad_norm": 0.9002259969711304,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 9388
+    },
+    {
+      "epoch": 0.09389,
+      "grad_norm": 0.7710829973220825,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 9389
+    },
+    {
+      "epoch": 0.0939,
+      "grad_norm": 0.7765557169914246,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 9390
+    },
+    {
+      "epoch": 0.09391,
+      "grad_norm": 0.7747265100479126,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 9391
+    },
+    {
+      "epoch": 0.09392,
+      "grad_norm": 0.6100158095359802,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 9392
+    },
+    {
+      "epoch": 0.09393,
+      "grad_norm": 0.533496081829071,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 9393
+    },
+    {
+      "epoch": 0.09394,
+      "grad_norm": 0.5862325429916382,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 9394
+    },
+    {
+      "epoch": 0.09395,
+      "grad_norm": 0.5538999438285828,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 9395
+    },
+    {
+      "epoch": 0.09396,
+      "grad_norm": 0.5180746912956238,
+      "learning_rate": 0.003,
+      "loss": 4.0567,
+      "step": 9396
+    },
+    {
+      "epoch": 0.09397,
+      "grad_norm": 0.5542228817939758,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 9397
+    },
+    {
+      "epoch": 0.09398,
+      "grad_norm": 0.5188986659049988,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 9398
+    },
+    {
+      "epoch": 0.09399,
+      "grad_norm": 0.5786181688308716,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 9399
+    },
+    {
+      "epoch": 0.094,
+      "grad_norm": 0.6772082448005676,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 9400
+    },
+    {
+      "epoch": 0.09401,
+      "grad_norm": 0.693780779838562,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 9401
+    },
+    {
+      "epoch": 0.09402,
+      "grad_norm": 0.6480074524879456,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 9402
+    },
+    {
+      "epoch": 0.09403,
+      "grad_norm": 0.599751353263855,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 9403
+    },
+    {
+      "epoch": 0.09404,
+      "grad_norm": 0.5232686996459961,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 9404
+    },
+    {
+      "epoch": 0.09405,
+      "grad_norm": 0.5285734534263611,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 9405
+    },
+    {
+      "epoch": 0.09406,
+      "grad_norm": 0.5814787149429321,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 9406
+    },
+    {
+      "epoch": 0.09407,
+      "grad_norm": 0.6249456405639648,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 9407
+    },
+    {
+      "epoch": 0.09408,
+      "grad_norm": 0.774571418762207,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 9408
+    },
+    {
+      "epoch": 0.09409,
+      "grad_norm": 0.8388532400131226,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 9409
+    },
+    {
+      "epoch": 0.0941,
+      "grad_norm": 0.783204197883606,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 9410
+    },
+    {
+      "epoch": 0.09411,
+      "grad_norm": 0.9117321372032166,
+      "learning_rate": 0.003,
+      "loss": 4.0612,
+      "step": 9411
+    },
+    {
+      "epoch": 0.09412,
+      "grad_norm": 0.9661768078804016,
+      "learning_rate": 0.003,
+      "loss": 4.0542,
+      "step": 9412
+    },
+    {
+      "epoch": 0.09413,
+      "grad_norm": 1.01078462600708,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 9413
+    },
+    {
+      "epoch": 0.09414,
+      "grad_norm": 1.0741658210754395,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 9414
+    },
+    {
+      "epoch": 0.09415,
+      "grad_norm": 0.9591096043586731,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 9415
+    },
+    {
+      "epoch": 0.09416,
+      "grad_norm": 0.8794155716896057,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 9416
+    },
+    {
+      "epoch": 0.09417,
+      "grad_norm": 0.841712236404419,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 9417
+    },
+    {
+      "epoch": 0.09418,
+      "grad_norm": 0.7917954921722412,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 9418
+    },
+    {
+      "epoch": 0.09419,
+      "grad_norm": 0.7964409589767456,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 9419
+    },
+    {
+      "epoch": 0.0942,
+      "grad_norm": 0.8282405734062195,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 9420
+    },
+    {
+      "epoch": 0.09421,
+      "grad_norm": 0.8015482425689697,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 9421
+    },
+    {
+      "epoch": 0.09422,
+      "grad_norm": 0.7137182950973511,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 9422
+    },
+    {
+      "epoch": 0.09423,
+      "grad_norm": 0.8045299053192139,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 9423
+    },
+    {
+      "epoch": 0.09424,
+      "grad_norm": 0.7985638380050659,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 9424
+    },
+    {
+      "epoch": 0.09425,
+      "grad_norm": 0.9419013261795044,
+      "learning_rate": 0.003,
+      "loss": 4.0599,
+      "step": 9425
+    },
+    {
+      "epoch": 0.09426,
+      "grad_norm": 0.8842517137527466,
+      "learning_rate": 0.003,
+      "loss": 4.065,
+      "step": 9426
+    },
+    {
+      "epoch": 0.09427,
+      "grad_norm": 0.8842640519142151,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 9427
+    },
+    {
+      "epoch": 0.09428,
+      "grad_norm": 0.8510754108428955,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 9428
+    },
+    {
+      "epoch": 0.09429,
+      "grad_norm": 0.7526082396507263,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 9429
+    },
+    {
+      "epoch": 0.0943,
+      "grad_norm": 0.7133992314338684,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 9430
+    },
+    {
+      "epoch": 0.09431,
+      "grad_norm": 0.7032984495162964,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 9431
+    },
+    {
+      "epoch": 0.09432,
+      "grad_norm": 0.7109702229499817,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 9432
+    },
+    {
+      "epoch": 0.09433,
+      "grad_norm": 0.8378332257270813,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 9433
+    },
+    {
+      "epoch": 0.09434,
+      "grad_norm": 0.8901048302650452,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 9434
+    },
+    {
+      "epoch": 0.09435,
+      "grad_norm": 0.8476412296295166,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 9435
+    },
+    {
+      "epoch": 0.09436,
+      "grad_norm": 0.7854786515235901,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 9436
+    },
+    {
+      "epoch": 0.09437,
+      "grad_norm": 0.6704685091972351,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 9437
+    },
+    {
+      "epoch": 0.09438,
+      "grad_norm": 0.6411417722702026,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 9438
+    },
+    {
+      "epoch": 0.09439,
+      "grad_norm": 0.6393620371818542,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 9439
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.728570818901062,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 9440
+    },
+    {
+      "epoch": 0.09441,
+      "grad_norm": 0.9385724067687988,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 9441
+    },
+    {
+      "epoch": 0.09442,
+      "grad_norm": 0.9962633848190308,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 9442
+    },
+    {
+      "epoch": 0.09443,
+      "grad_norm": 0.8917745351791382,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 9443
+    },
+    {
+      "epoch": 0.09444,
+      "grad_norm": 0.7558419704437256,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 9444
+    },
+    {
+      "epoch": 0.09445,
+      "grad_norm": 0.6210421919822693,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 9445
+    },
+    {
+      "epoch": 0.09446,
+      "grad_norm": 0.6951500177383423,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 9446
+    },
+    {
+      "epoch": 0.09447,
+      "grad_norm": 0.8409909605979919,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 9447
+    },
+    {
+      "epoch": 0.09448,
+      "grad_norm": 0.9376384019851685,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 9448
+    },
+    {
+      "epoch": 0.09449,
+      "grad_norm": 1.001735806465149,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 9449
+    },
+    {
+      "epoch": 0.0945,
+      "grad_norm": 1.2646803855895996,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 9450
+    },
+    {
+      "epoch": 0.09451,
+      "grad_norm": 0.7255167365074158,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 9451
+    },
+    {
+      "epoch": 0.09452,
+      "grad_norm": 0.775261402130127,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 9452
+    },
+    {
+      "epoch": 0.09453,
+      "grad_norm": 0.7530305981636047,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 9453
+    },
+    {
+      "epoch": 0.09454,
+      "grad_norm": 0.8406985402107239,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 9454
+    },
+    {
+      "epoch": 0.09455,
+      "grad_norm": 0.8127760887145996,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 9455
+    },
+    {
+      "epoch": 0.09456,
+      "grad_norm": 0.679355800151825,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 9456
+    },
+    {
+      "epoch": 0.09457,
+      "grad_norm": 0.660081148147583,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 9457
+    },
+    {
+      "epoch": 0.09458,
+      "grad_norm": 0.5813243985176086,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 9458
+    },
+    {
+      "epoch": 0.09459,
+      "grad_norm": 0.4867233633995056,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 9459
+    },
+    {
+      "epoch": 0.0946,
+      "grad_norm": 0.5141322612762451,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 9460
+    },
+    {
+      "epoch": 0.09461,
+      "grad_norm": 0.6217049956321716,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 9461
+    },
+    {
+      "epoch": 0.09462,
+      "grad_norm": 0.7092083692550659,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 9462
+    },
+    {
+      "epoch": 0.09463,
+      "grad_norm": 0.8736013770103455,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 9463
+    },
+    {
+      "epoch": 0.09464,
+      "grad_norm": 0.9941328763961792,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 9464
+    },
+    {
+      "epoch": 0.09465,
+      "grad_norm": 1.0948302745819092,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 9465
+    },
+    {
+      "epoch": 0.09466,
+      "grad_norm": 0.8153526782989502,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 9466
+    },
+    {
+      "epoch": 0.09467,
+      "grad_norm": 0.6857858300209045,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 9467
+    },
+    {
+      "epoch": 0.09468,
+      "grad_norm": 0.7290007472038269,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 9468
+    },
+    {
+      "epoch": 0.09469,
+      "grad_norm": 0.7589170932769775,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 9469
+    },
+    {
+      "epoch": 0.0947,
+      "grad_norm": 0.7441050410270691,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 9470
+    },
+    {
+      "epoch": 0.09471,
+      "grad_norm": 0.699702262878418,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 9471
+    },
+    {
+      "epoch": 0.09472,
+      "grad_norm": 0.6244022250175476,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 9472
+    },
+    {
+      "epoch": 0.09473,
+      "grad_norm": 0.5642143487930298,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 9473
+    },
+    {
+      "epoch": 0.09474,
+      "grad_norm": 0.6282489895820618,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 9474
+    },
+    {
+      "epoch": 0.09475,
+      "grad_norm": 0.538658082485199,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 9475
+    },
+    {
+      "epoch": 0.09476,
+      "grad_norm": 0.5374001264572144,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 9476
+    },
+    {
+      "epoch": 0.09477,
+      "grad_norm": 0.509412944316864,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 9477
+    },
+    {
+      "epoch": 0.09478,
+      "grad_norm": 0.5749872326850891,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 9478
+    },
+    {
+      "epoch": 0.09479,
+      "grad_norm": 0.6383438110351562,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 9479
+    },
+    {
+      "epoch": 0.0948,
+      "grad_norm": 0.8217725157737732,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 9480
+    },
+    {
+      "epoch": 0.09481,
+      "grad_norm": 1.1610746383666992,
+      "learning_rate": 0.003,
+      "loss": 4.0588,
+      "step": 9481
+    },
+    {
+      "epoch": 0.09482,
+      "grad_norm": 0.9759625196456909,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 9482
+    },
+    {
+      "epoch": 0.09483,
+      "grad_norm": 0.8113720417022705,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 9483
+    },
+    {
+      "epoch": 0.09484,
+      "grad_norm": 0.7075552344322205,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 9484
+    },
+    {
+      "epoch": 0.09485,
+      "grad_norm": 0.718109667301178,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 9485
+    },
+    {
+      "epoch": 0.09486,
+      "grad_norm": 0.7402860522270203,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 9486
+    },
+    {
+      "epoch": 0.09487,
+      "grad_norm": 0.8021457195281982,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 9487
+    },
+    {
+      "epoch": 0.09488,
+      "grad_norm": 0.9090721607208252,
+      "learning_rate": 0.003,
+      "loss": 4.0822,
+      "step": 9488
+    },
+    {
+      "epoch": 0.09489,
+      "grad_norm": 1.036071538925171,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 9489
+    },
+    {
+      "epoch": 0.0949,
+      "grad_norm": 0.9327799677848816,
+      "learning_rate": 0.003,
+      "loss": 4.0496,
+      "step": 9490
+    },
+    {
+      "epoch": 0.09491,
+      "grad_norm": 0.9281620383262634,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 9491
+    },
+    {
+      "epoch": 0.09492,
+      "grad_norm": 0.8979142904281616,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 9492
+    },
+    {
+      "epoch": 0.09493,
+      "grad_norm": 0.8455875515937805,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 9493
+    },
+    {
+      "epoch": 0.09494,
+      "grad_norm": 0.7073626518249512,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 9494
+    },
+    {
+      "epoch": 0.09495,
+      "grad_norm": 0.6978188753128052,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 9495
+    },
+    {
+      "epoch": 0.09496,
+      "grad_norm": 0.6406537294387817,
+      "learning_rate": 0.003,
+      "loss": 4.055,
+      "step": 9496
+    },
+    {
+      "epoch": 0.09497,
+      "grad_norm": 0.584082841873169,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 9497
+    },
+    {
+      "epoch": 0.09498,
+      "grad_norm": 0.650418758392334,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 9498
+    },
+    {
+      "epoch": 0.09499,
+      "grad_norm": 0.6627920269966125,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 9499
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.7207992672920227,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 9500
+    },
+    {
+      "epoch": 0.09501,
+      "grad_norm": 0.8046587705612183,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 9501
+    },
+    {
+      "epoch": 0.09502,
+      "grad_norm": 0.9965620040893555,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 9502
+    },
+    {
+      "epoch": 0.09503,
+      "grad_norm": 1.2637056112289429,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 9503
+    },
+    {
+      "epoch": 0.09504,
+      "grad_norm": 0.7111979722976685,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 9504
+    },
+    {
+      "epoch": 0.09505,
+      "grad_norm": 0.5596445798873901,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 9505
+    },
+    {
+      "epoch": 0.09506,
+      "grad_norm": 0.6463555693626404,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 9506
+    },
+    {
+      "epoch": 0.09507,
+      "grad_norm": 0.8175804615020752,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 9507
+    },
+    {
+      "epoch": 0.09508,
+      "grad_norm": 0.995126485824585,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 9508
+    },
+    {
+      "epoch": 0.09509,
+      "grad_norm": 1.0844470262527466,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 9509
+    },
+    {
+      "epoch": 0.0951,
+      "grad_norm": 0.8322227001190186,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 9510
+    },
+    {
+      "epoch": 0.09511,
+      "grad_norm": 0.838636040687561,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 9511
+    },
+    {
+      "epoch": 0.09512,
+      "grad_norm": 0.8196890354156494,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 9512
+    },
+    {
+      "epoch": 0.09513,
+      "grad_norm": 0.7390263080596924,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 9513
+    },
+    {
+      "epoch": 0.09514,
+      "grad_norm": 0.5975653529167175,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 9514
+    },
+    {
+      "epoch": 0.09515,
+      "grad_norm": 0.544066846370697,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 9515
+    },
+    {
+      "epoch": 0.09516,
+      "grad_norm": 0.6771560907363892,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 9516
+    },
+    {
+      "epoch": 0.09517,
+      "grad_norm": 0.7807263135910034,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 9517
+    },
+    {
+      "epoch": 0.09518,
+      "grad_norm": 0.8398572206497192,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 9518
+    },
+    {
+      "epoch": 0.09519,
+      "grad_norm": 0.8214612007141113,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 9519
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.7340238690376282,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 9520
+    },
+    {
+      "epoch": 0.09521,
+      "grad_norm": 0.6527026295661926,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 9521
+    },
+    {
+      "epoch": 0.09522,
+      "grad_norm": 0.7640578746795654,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 9522
+    },
+    {
+      "epoch": 0.09523,
+      "grad_norm": 0.8175305724143982,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 9523
+    },
+    {
+      "epoch": 0.09524,
+      "grad_norm": 0.9655996561050415,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 9524
+    },
+    {
+      "epoch": 0.09525,
+      "grad_norm": 0.8379790186882019,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 9525
+    },
+    {
+      "epoch": 0.09526,
+      "grad_norm": 0.9293071627616882,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 9526
+    },
+    {
+      "epoch": 0.09527,
+      "grad_norm": 0.9869701266288757,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 9527
+    },
+    {
+      "epoch": 0.09528,
+      "grad_norm": 1.0038269758224487,
+      "learning_rate": 0.003,
+      "loss": 4.0611,
+      "step": 9528
+    },
+    {
+      "epoch": 0.09529,
+      "grad_norm": 1.0940920114517212,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 9529
+    },
+    {
+      "epoch": 0.0953,
+      "grad_norm": 0.8704492449760437,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 9530
+    },
+    {
+      "epoch": 0.09531,
+      "grad_norm": 0.7913749814033508,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 9531
+    },
+    {
+      "epoch": 0.09532,
+      "grad_norm": 0.9196059703826904,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 9532
+    },
+    {
+      "epoch": 0.09533,
+      "grad_norm": 1.0615483522415161,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 9533
+    },
+    {
+      "epoch": 0.09534,
+      "grad_norm": 0.9020791053771973,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 9534
+    },
+    {
+      "epoch": 0.09535,
+      "grad_norm": 0.948823094367981,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 9535
+    },
+    {
+      "epoch": 0.09536,
+      "grad_norm": 1.080247163772583,
+      "learning_rate": 0.003,
+      "loss": 4.0662,
+      "step": 9536
+    },
+    {
+      "epoch": 0.09537,
+      "grad_norm": 0.874646008014679,
+      "learning_rate": 0.003,
+      "loss": 4.0694,
+      "step": 9537
+    },
+    {
+      "epoch": 0.09538,
+      "grad_norm": 0.9942668676376343,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 9538
+    },
+    {
+      "epoch": 0.09539,
+      "grad_norm": 1.0160984992980957,
+      "learning_rate": 0.003,
+      "loss": 4.0685,
+      "step": 9539
+    },
+    {
+      "epoch": 0.0954,
+      "grad_norm": 0.9498040676116943,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 9540
+    },
+    {
+      "epoch": 0.09541,
+      "grad_norm": 0.7964853644371033,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 9541
+    },
+    {
+      "epoch": 0.09542,
+      "grad_norm": 0.6286479830741882,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 9542
+    },
+    {
+      "epoch": 0.09543,
+      "grad_norm": 0.5860509872436523,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 9543
+    },
+    {
+      "epoch": 0.09544,
+      "grad_norm": 0.6001367568969727,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 9544
+    },
+    {
+      "epoch": 0.09545,
+      "grad_norm": 0.5685854554176331,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 9545
+    },
+    {
+      "epoch": 0.09546,
+      "grad_norm": 0.6297983527183533,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 9546
+    },
+    {
+      "epoch": 0.09547,
+      "grad_norm": 0.594254195690155,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 9547
+    },
+    {
+      "epoch": 0.09548,
+      "grad_norm": 0.5334281921386719,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 9548
+    },
+    {
+      "epoch": 0.09549,
+      "grad_norm": 0.47183743119239807,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 9549
+    },
+    {
+      "epoch": 0.0955,
+      "grad_norm": 0.4987733066082001,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 9550
+    },
+    {
+      "epoch": 0.09551,
+      "grad_norm": 0.5213873386383057,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 9551
+    },
+    {
+      "epoch": 0.09552,
+      "grad_norm": 0.5530688166618347,
+      "learning_rate": 0.003,
+      "loss": 4.0756,
+      "step": 9552
+    },
+    {
+      "epoch": 0.09553,
+      "grad_norm": 0.5738921761512756,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 9553
+    },
+    {
+      "epoch": 0.09554,
+      "grad_norm": 0.5256248116493225,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 9554
+    },
+    {
+      "epoch": 0.09555,
+      "grad_norm": 0.5314795970916748,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 9555
+    },
+    {
+      "epoch": 0.09556,
+      "grad_norm": 0.6204894781112671,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 9556
+    },
+    {
+      "epoch": 0.09557,
+      "grad_norm": 0.8138146996498108,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 9557
+    },
+    {
+      "epoch": 0.09558,
+      "grad_norm": 1.0319538116455078,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 9558
+    },
+    {
+      "epoch": 0.09559,
+      "grad_norm": 1.1002483367919922,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 9559
+    },
+    {
+      "epoch": 0.0956,
+      "grad_norm": 0.7374374270439148,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 9560
+    },
+    {
+      "epoch": 0.09561,
+      "grad_norm": 0.6492986083030701,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 9561
+    },
+    {
+      "epoch": 0.09562,
+      "grad_norm": 0.6662077903747559,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 9562
+    },
+    {
+      "epoch": 0.09563,
+      "grad_norm": 0.6137615442276001,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 9563
+    },
+    {
+      "epoch": 0.09564,
+      "grad_norm": 0.6091228127479553,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 9564
+    },
+    {
+      "epoch": 0.09565,
+      "grad_norm": 0.6556311845779419,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 9565
+    },
+    {
+      "epoch": 0.09566,
+      "grad_norm": 0.7306276559829712,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 9566
+    },
+    {
+      "epoch": 0.09567,
+      "grad_norm": 0.8696443438529968,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 9567
+    },
+    {
+      "epoch": 0.09568,
+      "grad_norm": 0.9888119101524353,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 9568
+    },
+    {
+      "epoch": 0.09569,
+      "grad_norm": 0.8965682983398438,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 9569
+    },
+    {
+      "epoch": 0.0957,
+      "grad_norm": 0.6938888430595398,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 9570
+    },
+    {
+      "epoch": 0.09571,
+      "grad_norm": 0.6471319794654846,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 9571
+    },
+    {
+      "epoch": 0.09572,
+      "grad_norm": 0.6941522359848022,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 9572
+    },
+    {
+      "epoch": 0.09573,
+      "grad_norm": 0.7525028586387634,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 9573
+    },
+    {
+      "epoch": 0.09574,
+      "grad_norm": 0.7359570860862732,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 9574
+    },
+    {
+      "epoch": 0.09575,
+      "grad_norm": 0.679182231426239,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 9575
+    },
+    {
+      "epoch": 0.09576,
+      "grad_norm": 0.634701669216156,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 9576
+    },
+    {
+      "epoch": 0.09577,
+      "grad_norm": 0.5520635843276978,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 9577
+    },
+    {
+      "epoch": 0.09578,
+      "grad_norm": 0.6691205501556396,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 9578
+    },
+    {
+      "epoch": 0.09579,
+      "grad_norm": 0.8371917605400085,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 9579
+    },
+    {
+      "epoch": 0.0958,
+      "grad_norm": 1.0776106119155884,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 9580
+    },
+    {
+      "epoch": 0.09581,
+      "grad_norm": 1.0245087146759033,
+      "learning_rate": 0.003,
+      "loss": 4.0812,
+      "step": 9581
+    },
+    {
+      "epoch": 0.09582,
+      "grad_norm": 0.8801830410957336,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 9582
+    },
+    {
+      "epoch": 0.09583,
+      "grad_norm": 0.8247877359390259,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 9583
+    },
+    {
+      "epoch": 0.09584,
+      "grad_norm": 0.7499005198478699,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 9584
+    },
+    {
+      "epoch": 0.09585,
+      "grad_norm": 0.7511628866195679,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 9585
+    },
+    {
+      "epoch": 0.09586,
+      "grad_norm": 0.6854458451271057,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 9586
+    },
+    {
+      "epoch": 0.09587,
+      "grad_norm": 0.6980164051055908,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 9587
+    },
+    {
+      "epoch": 0.09588,
+      "grad_norm": 0.7041137218475342,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 9588
+    },
+    {
+      "epoch": 0.09589,
+      "grad_norm": 0.7521610856056213,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 9589
+    },
+    {
+      "epoch": 0.0959,
+      "grad_norm": 0.7036656737327576,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 9590
+    },
+    {
+      "epoch": 0.09591,
+      "grad_norm": 0.7238452434539795,
+      "learning_rate": 0.003,
+      "loss": 4.0742,
+      "step": 9591
+    },
+    {
+      "epoch": 0.09592,
+      "grad_norm": 0.8542155027389526,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 9592
+    },
+    {
+      "epoch": 0.09593,
+      "grad_norm": 0.9843723177909851,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 9593
+    },
+    {
+      "epoch": 0.09594,
+      "grad_norm": 1.0649001598358154,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 9594
+    },
+    {
+      "epoch": 0.09595,
+      "grad_norm": 0.8816093802452087,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 9595
+    },
+    {
+      "epoch": 0.09596,
+      "grad_norm": 0.7636190056800842,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 9596
+    },
+    {
+      "epoch": 0.09597,
+      "grad_norm": 0.7464763522148132,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 9597
+    },
+    {
+      "epoch": 0.09598,
+      "grad_norm": 0.8498935699462891,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 9598
+    },
+    {
+      "epoch": 0.09599,
+      "grad_norm": 0.9224379062652588,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 9599
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.8882635235786438,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 9600
+    },
+    {
+      "epoch": 0.09601,
+      "grad_norm": 0.8223801255226135,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 9601
+    },
+    {
+      "epoch": 0.09602,
+      "grad_norm": 0.8122618198394775,
+      "learning_rate": 0.003,
+      "loss": 4.0677,
+      "step": 9602
+    },
+    {
+      "epoch": 0.09603,
+      "grad_norm": 0.7895182967185974,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 9603
+    },
+    {
+      "epoch": 0.09604,
+      "grad_norm": 0.6999941468238831,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 9604
+    },
+    {
+      "epoch": 0.09605,
+      "grad_norm": 0.7563785910606384,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 9605
+    },
+    {
+      "epoch": 0.09606,
+      "grad_norm": 0.8351266980171204,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 9606
+    },
+    {
+      "epoch": 0.09607,
+      "grad_norm": 0.9096396565437317,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 9607
+    },
+    {
+      "epoch": 0.09608,
+      "grad_norm": 0.9197260141372681,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 9608
+    },
+    {
+      "epoch": 0.09609,
+      "grad_norm": 0.7963103652000427,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 9609
+    },
+    {
+      "epoch": 0.0961,
+      "grad_norm": 0.6685717701911926,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 9610
+    },
+    {
+      "epoch": 0.09611,
+      "grad_norm": 0.7270292043685913,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 9611
+    },
+    {
+      "epoch": 0.09612,
+      "grad_norm": 0.7644274830818176,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 9612
+    },
+    {
+      "epoch": 0.09613,
+      "grad_norm": 0.790471613407135,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 9613
+    },
+    {
+      "epoch": 0.09614,
+      "grad_norm": 0.8529259562492371,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 9614
+    },
+    {
+      "epoch": 0.09615,
+      "grad_norm": 0.7713029384613037,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 9615
+    },
+    {
+      "epoch": 0.09616,
+      "grad_norm": 0.6528921127319336,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 9616
+    },
+    {
+      "epoch": 0.09617,
+      "grad_norm": 0.7347012758255005,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 9617
+    },
+    {
+      "epoch": 0.09618,
+      "grad_norm": 0.7659905552864075,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 9618
+    },
+    {
+      "epoch": 0.09619,
+      "grad_norm": 0.8106355667114258,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 9619
+    },
+    {
+      "epoch": 0.0962,
+      "grad_norm": 0.6627798080444336,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 9620
+    },
+    {
+      "epoch": 0.09621,
+      "grad_norm": 0.7969244122505188,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 9621
+    },
+    {
+      "epoch": 0.09622,
+      "grad_norm": 0.903766393661499,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 9622
+    },
+    {
+      "epoch": 0.09623,
+      "grad_norm": 1.0692620277404785,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 9623
+    },
+    {
+      "epoch": 0.09624,
+      "grad_norm": 0.8282185196876526,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 9624
+    },
+    {
+      "epoch": 0.09625,
+      "grad_norm": 0.713636040687561,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 9625
+    },
+    {
+      "epoch": 0.09626,
+      "grad_norm": 0.7236156463623047,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 9626
+    },
+    {
+      "epoch": 0.09627,
+      "grad_norm": 0.694229245185852,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 9627
+    },
+    {
+      "epoch": 0.09628,
+      "grad_norm": 0.6868857741355896,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 9628
+    },
+    {
+      "epoch": 0.09629,
+      "grad_norm": 0.7673381567001343,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 9629
+    },
+    {
+      "epoch": 0.0963,
+      "grad_norm": 0.8264285922050476,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 9630
+    },
+    {
+      "epoch": 0.09631,
+      "grad_norm": 0.8883211016654968,
+      "learning_rate": 0.003,
+      "loss": 4.0684,
+      "step": 9631
+    },
+    {
+      "epoch": 0.09632,
+      "grad_norm": 0.8831930756568909,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 9632
+    },
+    {
+      "epoch": 0.09633,
+      "grad_norm": 0.8816713690757751,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 9633
+    },
+    {
+      "epoch": 0.09634,
+      "grad_norm": 0.8870523571968079,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 9634
+    },
+    {
+      "epoch": 0.09635,
+      "grad_norm": 0.9964356422424316,
+      "learning_rate": 0.003,
+      "loss": 4.0557,
+      "step": 9635
+    },
+    {
+      "epoch": 0.09636,
+      "grad_norm": 0.8906300067901611,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 9636
+    },
+    {
+      "epoch": 0.09637,
+      "grad_norm": 0.8000807762145996,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 9637
+    },
+    {
+      "epoch": 0.09638,
+      "grad_norm": 0.7173836827278137,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 9638
+    },
+    {
+      "epoch": 0.09639,
+      "grad_norm": 0.67566978931427,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 9639
+    },
+    {
+      "epoch": 0.0964,
+      "grad_norm": 0.5778241753578186,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 9640
+    },
+    {
+      "epoch": 0.09641,
+      "grad_norm": 0.7081737518310547,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 9641
+    },
+    {
+      "epoch": 0.09642,
+      "grad_norm": 0.741016149520874,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 9642
+    },
+    {
+      "epoch": 0.09643,
+      "grad_norm": 0.778562605381012,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 9643
+    },
+    {
+      "epoch": 0.09644,
+      "grad_norm": 0.9048216342926025,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 9644
+    },
+    {
+      "epoch": 0.09645,
+      "grad_norm": 0.9883460402488708,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 9645
+    },
+    {
+      "epoch": 0.09646,
+      "grad_norm": 0.7610167860984802,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 9646
+    },
+    {
+      "epoch": 0.09647,
+      "grad_norm": 0.6392595767974854,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 9647
+    },
+    {
+      "epoch": 0.09648,
+      "grad_norm": 0.6436144709587097,
+      "learning_rate": 0.003,
+      "loss": 4.0676,
+      "step": 9648
+    },
+    {
+      "epoch": 0.09649,
+      "grad_norm": 0.6952054500579834,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 9649
+    },
+    {
+      "epoch": 0.0965,
+      "grad_norm": 0.6287616491317749,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 9650
+    },
+    {
+      "epoch": 0.09651,
+      "grad_norm": 0.6473034024238586,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 9651
+    },
+    {
+      "epoch": 0.09652,
+      "grad_norm": 0.6925511360168457,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 9652
+    },
+    {
+      "epoch": 0.09653,
+      "grad_norm": 0.708454966545105,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 9653
+    },
+    {
+      "epoch": 0.09654,
+      "grad_norm": 0.7384510040283203,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 9654
+    },
+    {
+      "epoch": 0.09655,
+      "grad_norm": 0.6604750156402588,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 9655
+    },
+    {
+      "epoch": 0.09656,
+      "grad_norm": 0.5758141279220581,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 9656
+    },
+    {
+      "epoch": 0.09657,
+      "grad_norm": 0.6268450021743774,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 9657
+    },
+    {
+      "epoch": 0.09658,
+      "grad_norm": 0.7189260125160217,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 9658
+    },
+    {
+      "epoch": 0.09659,
+      "grad_norm": 0.7771549224853516,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 9659
+    },
+    {
+      "epoch": 0.0966,
+      "grad_norm": 0.8796934485435486,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 9660
+    },
+    {
+      "epoch": 0.09661,
+      "grad_norm": 0.896022617816925,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 9661
+    },
+    {
+      "epoch": 0.09662,
+      "grad_norm": 0.938194751739502,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 9662
+    },
+    {
+      "epoch": 0.09663,
+      "grad_norm": 0.8051214218139648,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 9663
+    },
+    {
+      "epoch": 0.09664,
+      "grad_norm": 0.6830205321311951,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 9664
+    },
+    {
+      "epoch": 0.09665,
+      "grad_norm": 0.793272078037262,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 9665
+    },
+    {
+      "epoch": 0.09666,
+      "grad_norm": 0.9261988401412964,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 9666
+    },
+    {
+      "epoch": 0.09667,
+      "grad_norm": 0.8233135938644409,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 9667
+    },
+    {
+      "epoch": 0.09668,
+      "grad_norm": 0.7708394527435303,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 9668
+    },
+    {
+      "epoch": 0.09669,
+      "grad_norm": 0.7485005855560303,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 9669
+    },
+    {
+      "epoch": 0.0967,
+      "grad_norm": 0.7645751237869263,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 9670
+    },
+    {
+      "epoch": 0.09671,
+      "grad_norm": 0.882091224193573,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 9671
+    },
+    {
+      "epoch": 0.09672,
+      "grad_norm": 1.0624443292617798,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 9672
+    },
+    {
+      "epoch": 0.09673,
+      "grad_norm": 0.900747537612915,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 9673
+    },
+    {
+      "epoch": 0.09674,
+      "grad_norm": 0.8541334867477417,
+      "learning_rate": 0.003,
+      "loss": 4.0874,
+      "step": 9674
+    },
+    {
+      "epoch": 0.09675,
+      "grad_norm": 0.9348879456520081,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 9675
+    },
+    {
+      "epoch": 0.09676,
+      "grad_norm": 0.8951733112335205,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 9676
+    },
+    {
+      "epoch": 0.09677,
+      "grad_norm": 1.0163615942001343,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 9677
+    },
+    {
+      "epoch": 0.09678,
+      "grad_norm": 0.9605147242546082,
+      "learning_rate": 0.003,
+      "loss": 4.0748,
+      "step": 9678
+    },
+    {
+      "epoch": 0.09679,
+      "grad_norm": 0.8225002288818359,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 9679
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.8135245442390442,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 9680
+    },
+    {
+      "epoch": 0.09681,
+      "grad_norm": 0.8713023066520691,
+      "learning_rate": 0.003,
+      "loss": 4.0569,
+      "step": 9681
+    },
+    {
+      "epoch": 0.09682,
+      "grad_norm": 0.8497050404548645,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 9682
+    },
+    {
+      "epoch": 0.09683,
+      "grad_norm": 0.7430970072746277,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 9683
+    },
+    {
+      "epoch": 0.09684,
+      "grad_norm": 0.7872548699378967,
+      "learning_rate": 0.003,
+      "loss": 4.0604,
+      "step": 9684
+    },
+    {
+      "epoch": 0.09685,
+      "grad_norm": 0.6804424524307251,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 9685
+    },
+    {
+      "epoch": 0.09686,
+      "grad_norm": 0.5914613008499146,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 9686
+    },
+    {
+      "epoch": 0.09687,
+      "grad_norm": 0.5871429443359375,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 9687
+    },
+    {
+      "epoch": 0.09688,
+      "grad_norm": 0.7036808133125305,
+      "learning_rate": 0.003,
+      "loss": 4.0714,
+      "step": 9688
+    },
+    {
+      "epoch": 0.09689,
+      "grad_norm": 0.7072443962097168,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 9689
+    },
+    {
+      "epoch": 0.0969,
+      "grad_norm": 0.6006872653961182,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 9690
+    },
+    {
+      "epoch": 0.09691,
+      "grad_norm": 0.627549409866333,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 9691
+    },
+    {
+      "epoch": 0.09692,
+      "grad_norm": 0.6675216555595398,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 9692
+    },
+    {
+      "epoch": 0.09693,
+      "grad_norm": 0.8195521831512451,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 9693
+    },
+    {
+      "epoch": 0.09694,
+      "grad_norm": 1.020370602607727,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 9694
+    },
+    {
+      "epoch": 0.09695,
+      "grad_norm": 1.159254550933838,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 9695
+    },
+    {
+      "epoch": 0.09696,
+      "grad_norm": 0.7246578335762024,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 9696
+    },
+    {
+      "epoch": 0.09697,
+      "grad_norm": 0.7815423607826233,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 9697
+    },
+    {
+      "epoch": 0.09698,
+      "grad_norm": 0.9583694934844971,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 9698
+    },
+    {
+      "epoch": 0.09699,
+      "grad_norm": 0.948357343673706,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 9699
+    },
+    {
+      "epoch": 0.097,
+      "grad_norm": 0.8365939855575562,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 9700
+    },
+    {
+      "epoch": 0.09701,
+      "grad_norm": 0.7484097480773926,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 9701
+    },
+    {
+      "epoch": 0.09702,
+      "grad_norm": 0.8812928795814514,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 9702
+    },
+    {
+      "epoch": 0.09703,
+      "grad_norm": 0.8621808886528015,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 9703
+    },
+    {
+      "epoch": 0.09704,
+      "grad_norm": 0.7826769351959229,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 9704
+    },
+    {
+      "epoch": 0.09705,
+      "grad_norm": 0.6948601603507996,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 9705
+    },
+    {
+      "epoch": 0.09706,
+      "grad_norm": 0.7186203002929688,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 9706
+    },
+    {
+      "epoch": 0.09707,
+      "grad_norm": 0.8767298460006714,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 9707
+    },
+    {
+      "epoch": 0.09708,
+      "grad_norm": 1.0957658290863037,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 9708
+    },
+    {
+      "epoch": 0.09709,
+      "grad_norm": 0.939866304397583,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 9709
+    },
+    {
+      "epoch": 0.0971,
+      "grad_norm": 0.9766221046447754,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 9710
+    },
+    {
+      "epoch": 0.09711,
+      "grad_norm": 0.938903272151947,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 9711
+    },
+    {
+      "epoch": 0.09712,
+      "grad_norm": 0.9619114995002747,
+      "learning_rate": 0.003,
+      "loss": 4.0789,
+      "step": 9712
+    },
+    {
+      "epoch": 0.09713,
+      "grad_norm": 0.8209119439125061,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 9713
+    },
+    {
+      "epoch": 0.09714,
+      "grad_norm": 0.5877271890640259,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 9714
+    },
+    {
+      "epoch": 0.09715,
+      "grad_norm": 0.6621226668357849,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 9715
+    },
+    {
+      "epoch": 0.09716,
+      "grad_norm": 0.7536833882331848,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 9716
+    },
+    {
+      "epoch": 0.09717,
+      "grad_norm": 0.8996556401252747,
+      "learning_rate": 0.003,
+      "loss": 4.0662,
+      "step": 9717
+    },
+    {
+      "epoch": 0.09718,
+      "grad_norm": 0.8144115805625916,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 9718
+    },
+    {
+      "epoch": 0.09719,
+      "grad_norm": 0.7471117377281189,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 9719
+    },
+    {
+      "epoch": 0.0972,
+      "grad_norm": 0.7669509649276733,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 9720
+    },
+    {
+      "epoch": 0.09721,
+      "grad_norm": 0.7159215807914734,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 9721
+    },
+    {
+      "epoch": 0.09722,
+      "grad_norm": 0.6609331369400024,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 9722
+    },
+    {
+      "epoch": 0.09723,
+      "grad_norm": 0.5876916646957397,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 9723
+    },
+    {
+      "epoch": 0.09724,
+      "grad_norm": 0.48599714040756226,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 9724
+    },
+    {
+      "epoch": 0.09725,
+      "grad_norm": 0.5811905860900879,
+      "learning_rate": 0.003,
+      "loss": 3.9738,
+      "step": 9725
+    },
+    {
+      "epoch": 0.09726,
+      "grad_norm": 0.7120689153671265,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 9726
+    },
+    {
+      "epoch": 0.09727,
+      "grad_norm": 0.8851310014724731,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 9727
+    },
+    {
+      "epoch": 0.09728,
+      "grad_norm": 0.842197060585022,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 9728
+    },
+    {
+      "epoch": 0.09729,
+      "grad_norm": 0.6467271447181702,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 9729
+    },
+    {
+      "epoch": 0.0973,
+      "grad_norm": 0.48625877499580383,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 9730
+    },
+    {
+      "epoch": 0.09731,
+      "grad_norm": 0.5645377039909363,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 9731
+    },
+    {
+      "epoch": 0.09732,
+      "grad_norm": 0.5628966093063354,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 9732
+    },
+    {
+      "epoch": 0.09733,
+      "grad_norm": 0.46445903182029724,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 9733
+    },
+    {
+      "epoch": 0.09734,
+      "grad_norm": 0.4646936357021332,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 9734
+    },
+    {
+      "epoch": 0.09735,
+      "grad_norm": 0.43636274337768555,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 9735
+    },
+    {
+      "epoch": 0.09736,
+      "grad_norm": 0.5397223830223083,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 9736
+    },
+    {
+      "epoch": 0.09737,
+      "grad_norm": 0.5813911557197571,
+      "learning_rate": 0.003,
+      "loss": 3.9686,
+      "step": 9737
+    },
+    {
+      "epoch": 0.09738,
+      "grad_norm": 0.6568850874900818,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 9738
+    },
+    {
+      "epoch": 0.09739,
+      "grad_norm": 0.7499068379402161,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 9739
+    },
+    {
+      "epoch": 0.0974,
+      "grad_norm": 0.971897542476654,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 9740
+    },
+    {
+      "epoch": 0.09741,
+      "grad_norm": 1.23680579662323,
+      "learning_rate": 0.003,
+      "loss": 4.0797,
+      "step": 9741
+    },
+    {
+      "epoch": 0.09742,
+      "grad_norm": 0.6952364444732666,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 9742
+    },
+    {
+      "epoch": 0.09743,
+      "grad_norm": 0.7512345314025879,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 9743
+    },
+    {
+      "epoch": 0.09744,
+      "grad_norm": 0.8438845872879028,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 9744
+    },
+    {
+      "epoch": 0.09745,
+      "grad_norm": 0.881648063659668,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 9745
+    },
+    {
+      "epoch": 0.09746,
+      "grad_norm": 0.8643108010292053,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 9746
+    },
+    {
+      "epoch": 0.09747,
+      "grad_norm": 1.1098822355270386,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 9747
+    },
+    {
+      "epoch": 0.09748,
+      "grad_norm": 0.9371985197067261,
+      "learning_rate": 0.003,
+      "loss": 4.0686,
+      "step": 9748
+    },
+    {
+      "epoch": 0.09749,
+      "grad_norm": 0.9462610483169556,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 9749
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.8169135451316833,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 9750
+    },
+    {
+      "epoch": 0.09751,
+      "grad_norm": 0.7658066749572754,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 9751
+    },
+    {
+      "epoch": 0.09752,
+      "grad_norm": 0.7241183519363403,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 9752
+    },
+    {
+      "epoch": 0.09753,
+      "grad_norm": 0.7353487014770508,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 9753
+    },
+    {
+      "epoch": 0.09754,
+      "grad_norm": 0.7551058530807495,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 9754
+    },
+    {
+      "epoch": 0.09755,
+      "grad_norm": 0.6881881356239319,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 9755
+    },
+    {
+      "epoch": 0.09756,
+      "grad_norm": 0.6139550805091858,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 9756
+    },
+    {
+      "epoch": 0.09757,
+      "grad_norm": 0.7223386168479919,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 9757
+    },
+    {
+      "epoch": 0.09758,
+      "grad_norm": 0.8988397121429443,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 9758
+    },
+    {
+      "epoch": 0.09759,
+      "grad_norm": 1.005197286605835,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 9759
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 1.1311531066894531,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 9760
+    },
+    {
+      "epoch": 0.09761,
+      "grad_norm": 0.7937563061714172,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 9761
+    },
+    {
+      "epoch": 0.09762,
+      "grad_norm": 0.6707890033721924,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 9762
+    },
+    {
+      "epoch": 0.09763,
+      "grad_norm": 0.7106285691261292,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 9763
+    },
+    {
+      "epoch": 0.09764,
+      "grad_norm": 0.7116639614105225,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 9764
+    },
+    {
+      "epoch": 0.09765,
+      "grad_norm": 0.7842574119567871,
+      "learning_rate": 0.003,
+      "loss": 4.0598,
+      "step": 9765
+    },
+    {
+      "epoch": 0.09766,
+      "grad_norm": 0.902124285697937,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 9766
+    },
+    {
+      "epoch": 0.09767,
+      "grad_norm": 1.0107566118240356,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 9767
+    },
+    {
+      "epoch": 0.09768,
+      "grad_norm": 0.9981918334960938,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 9768
+    },
+    {
+      "epoch": 0.09769,
+      "grad_norm": 0.7847188115119934,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 9769
+    },
+    {
+      "epoch": 0.0977,
+      "grad_norm": 0.8173283934593201,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 9770
+    },
+    {
+      "epoch": 0.09771,
+      "grad_norm": 0.8425480127334595,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 9771
+    },
+    {
+      "epoch": 0.09772,
+      "grad_norm": 0.8838803172111511,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 9772
+    },
+    {
+      "epoch": 0.09773,
+      "grad_norm": 0.8878778219223022,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 9773
+    },
+    {
+      "epoch": 0.09774,
+      "grad_norm": 0.9738554954528809,
+      "learning_rate": 0.003,
+      "loss": 4.0658,
+      "step": 9774
+    },
+    {
+      "epoch": 0.09775,
+      "grad_norm": 0.969858705997467,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 9775
+    },
+    {
+      "epoch": 0.09776,
+      "grad_norm": 1.0172128677368164,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 9776
+    },
+    {
+      "epoch": 0.09777,
+      "grad_norm": 0.7204704284667969,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 9777
+    },
+    {
+      "epoch": 0.09778,
+      "grad_norm": 0.6760993003845215,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 9778
+    },
+    {
+      "epoch": 0.09779,
+      "grad_norm": 0.723325788974762,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 9779
+    },
+    {
+      "epoch": 0.0978,
+      "grad_norm": 0.7009559273719788,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 9780
+    },
+    {
+      "epoch": 0.09781,
+      "grad_norm": 0.7214966416358948,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 9781
+    },
+    {
+      "epoch": 0.09782,
+      "grad_norm": 0.7424318790435791,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 9782
+    },
+    {
+      "epoch": 0.09783,
+      "grad_norm": 0.6193498969078064,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 9783
+    },
+    {
+      "epoch": 0.09784,
+      "grad_norm": 0.6854250431060791,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 9784
+    },
+    {
+      "epoch": 0.09785,
+      "grad_norm": 0.7265077233314514,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 9785
+    },
+    {
+      "epoch": 0.09786,
+      "grad_norm": 0.7454569935798645,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 9786
+    },
+    {
+      "epoch": 0.09787,
+      "grad_norm": 0.8594154119491577,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 9787
+    },
+    {
+      "epoch": 0.09788,
+      "grad_norm": 1.0226812362670898,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 9788
+    },
+    {
+      "epoch": 0.09789,
+      "grad_norm": 1.1170358657836914,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 9789
+    },
+    {
+      "epoch": 0.0979,
+      "grad_norm": 0.7604365348815918,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 9790
+    },
+    {
+      "epoch": 0.09791,
+      "grad_norm": 0.6184024810791016,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 9791
+    },
+    {
+      "epoch": 0.09792,
+      "grad_norm": 0.7284930348396301,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 9792
+    },
+    {
+      "epoch": 0.09793,
+      "grad_norm": 0.6939471364021301,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 9793
+    },
+    {
+      "epoch": 0.09794,
+      "grad_norm": 0.7011038661003113,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 9794
+    },
+    {
+      "epoch": 0.09795,
+      "grad_norm": 0.6286042332649231,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 9795
+    },
+    {
+      "epoch": 0.09796,
+      "grad_norm": 0.5014172792434692,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 9796
+    },
+    {
+      "epoch": 0.09797,
+      "grad_norm": 0.5449438691139221,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 9797
+    },
+    {
+      "epoch": 0.09798,
+      "grad_norm": 0.6074647903442383,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 9798
+    },
+    {
+      "epoch": 0.09799,
+      "grad_norm": 0.6542043685913086,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 9799
+    },
+    {
+      "epoch": 0.098,
+      "grad_norm": 0.785072386264801,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 9800
+    },
+    {
+      "epoch": 0.09801,
+      "grad_norm": 0.7558362483978271,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 9801
+    },
+    {
+      "epoch": 0.09802,
+      "grad_norm": 0.7080808281898499,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 9802
+    },
+    {
+      "epoch": 0.09803,
+      "grad_norm": 0.9028113484382629,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 9803
+    },
+    {
+      "epoch": 0.09804,
+      "grad_norm": 1.1181584596633911,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 9804
+    },
+    {
+      "epoch": 0.09805,
+      "grad_norm": 1.098592758178711,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 9805
+    },
+    {
+      "epoch": 0.09806,
+      "grad_norm": 0.807129442691803,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 9806
+    },
+    {
+      "epoch": 0.09807,
+      "grad_norm": 0.7035567164421082,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 9807
+    },
+    {
+      "epoch": 0.09808,
+      "grad_norm": 0.7286992073059082,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 9808
+    },
+    {
+      "epoch": 0.09809,
+      "grad_norm": 0.7307597994804382,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 9809
+    },
+    {
+      "epoch": 0.0981,
+      "grad_norm": 0.721434473991394,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 9810
+    },
+    {
+      "epoch": 0.09811,
+      "grad_norm": 0.6393311023712158,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 9811
+    },
+    {
+      "epoch": 0.09812,
+      "grad_norm": 0.6354530453681946,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 9812
+    },
+    {
+      "epoch": 0.09813,
+      "grad_norm": 0.7901116609573364,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 9813
+    },
+    {
+      "epoch": 0.09814,
+      "grad_norm": 0.7675938010215759,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 9814
+    },
+    {
+      "epoch": 0.09815,
+      "grad_norm": 0.6112906336784363,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 9815
+    },
+    {
+      "epoch": 0.09816,
+      "grad_norm": 0.7082177400588989,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 9816
+    },
+    {
+      "epoch": 0.09817,
+      "grad_norm": 0.7757717370986938,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 9817
+    },
+    {
+      "epoch": 0.09818,
+      "grad_norm": 0.9383963346481323,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 9818
+    },
+    {
+      "epoch": 0.09819,
+      "grad_norm": 1.1323498487472534,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 9819
+    },
+    {
+      "epoch": 0.0982,
+      "grad_norm": 0.9455949068069458,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 9820
+    },
+    {
+      "epoch": 0.09821,
+      "grad_norm": 0.7970534563064575,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 9821
+    },
+    {
+      "epoch": 0.09822,
+      "grad_norm": 0.7729085683822632,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 9822
+    },
+    {
+      "epoch": 0.09823,
+      "grad_norm": 0.7592610716819763,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 9823
+    },
+    {
+      "epoch": 0.09824,
+      "grad_norm": 0.8357574343681335,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 9824
+    },
+    {
+      "epoch": 0.09825,
+      "grad_norm": 0.9557722210884094,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 9825
+    },
+    {
+      "epoch": 0.09826,
+      "grad_norm": 1.0662164688110352,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 9826
+    },
+    {
+      "epoch": 0.09827,
+      "grad_norm": 1.1117784976959229,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 9827
+    },
+    {
+      "epoch": 0.09828,
+      "grad_norm": 0.6650376319885254,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 9828
+    },
+    {
+      "epoch": 0.09829,
+      "grad_norm": 0.6073980331420898,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 9829
+    },
+    {
+      "epoch": 0.0983,
+      "grad_norm": 0.6785967350006104,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 9830
+    },
+    {
+      "epoch": 0.09831,
+      "grad_norm": 0.8099671006202698,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 9831
+    },
+    {
+      "epoch": 0.09832,
+      "grad_norm": 0.8946943283081055,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 9832
+    },
+    {
+      "epoch": 0.09833,
+      "grad_norm": 0.9274506568908691,
+      "learning_rate": 0.003,
+      "loss": 4.058,
+      "step": 9833
+    },
+    {
+      "epoch": 0.09834,
+      "grad_norm": 0.7214897274971008,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 9834
+    },
+    {
+      "epoch": 0.09835,
+      "grad_norm": 0.5876917839050293,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 9835
+    },
+    {
+      "epoch": 0.09836,
+      "grad_norm": 0.585267186164856,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 9836
+    },
+    {
+      "epoch": 0.09837,
+      "grad_norm": 0.6745166778564453,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 9837
+    },
+    {
+      "epoch": 0.09838,
+      "grad_norm": 0.7141098380088806,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 9838
+    },
+    {
+      "epoch": 0.09839,
+      "grad_norm": 0.7113993167877197,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 9839
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.7230168581008911,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 9840
+    },
+    {
+      "epoch": 0.09841,
+      "grad_norm": 0.6224821209907532,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 9841
+    },
+    {
+      "epoch": 0.09842,
+      "grad_norm": 0.6236535906791687,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 9842
+    },
+    {
+      "epoch": 0.09843,
+      "grad_norm": 0.5319640040397644,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 9843
+    },
+    {
+      "epoch": 0.09844,
+      "grad_norm": 0.5817659497261047,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 9844
+    },
+    {
+      "epoch": 0.09845,
+      "grad_norm": 0.6393766403198242,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 9845
+    },
+    {
+      "epoch": 0.09846,
+      "grad_norm": 0.6582833528518677,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 9846
+    },
+    {
+      "epoch": 0.09847,
+      "grad_norm": 0.6721013188362122,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 9847
+    },
+    {
+      "epoch": 0.09848,
+      "grad_norm": 0.6297866702079773,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 9848
+    },
+    {
+      "epoch": 0.09849,
+      "grad_norm": 0.7541521787643433,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 9849
+    },
+    {
+      "epoch": 0.0985,
+      "grad_norm": 0.9827596545219421,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 9850
+    },
+    {
+      "epoch": 0.09851,
+      "grad_norm": 1.3056297302246094,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 9851
+    },
+    {
+      "epoch": 0.09852,
+      "grad_norm": 0.595446765422821,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 9852
+    },
+    {
+      "epoch": 0.09853,
+      "grad_norm": 0.6285281181335449,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 9853
+    },
+    {
+      "epoch": 0.09854,
+      "grad_norm": 0.7268086671829224,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 9854
+    },
+    {
+      "epoch": 0.09855,
+      "grad_norm": 0.7416141033172607,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 9855
+    },
+    {
+      "epoch": 0.09856,
+      "grad_norm": 0.8398358225822449,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 9856
+    },
+    {
+      "epoch": 0.09857,
+      "grad_norm": 0.87022465467453,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 9857
+    },
+    {
+      "epoch": 0.09858,
+      "grad_norm": 0.9862394332885742,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 9858
+    },
+    {
+      "epoch": 0.09859,
+      "grad_norm": 1.0218671560287476,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 9859
+    },
+    {
+      "epoch": 0.0986,
+      "grad_norm": 0.9830066561698914,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 9860
+    },
+    {
+      "epoch": 0.09861,
+      "grad_norm": 0.9263790845870972,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 9861
+    },
+    {
+      "epoch": 0.09862,
+      "grad_norm": 0.9523604512214661,
+      "learning_rate": 0.003,
+      "loss": 4.0644,
+      "step": 9862
+    },
+    {
+      "epoch": 0.09863,
+      "grad_norm": 0.992346465587616,
+      "learning_rate": 0.003,
+      "loss": 4.0726,
+      "step": 9863
+    },
+    {
+      "epoch": 0.09864,
+      "grad_norm": 1.2398070096969604,
+      "learning_rate": 0.003,
+      "loss": 4.0871,
+      "step": 9864
+    },
+    {
+      "epoch": 0.09865,
+      "grad_norm": 0.8953294157981873,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 9865
+    },
+    {
+      "epoch": 0.09866,
+      "grad_norm": 0.9769341349601746,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 9866
+    },
+    {
+      "epoch": 0.09867,
+      "grad_norm": 1.0070929527282715,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 9867
+    },
+    {
+      "epoch": 0.09868,
+      "grad_norm": 0.9202451705932617,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 9868
+    },
+    {
+      "epoch": 0.09869,
+      "grad_norm": 1.0466346740722656,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 9869
+    },
+    {
+      "epoch": 0.0987,
+      "grad_norm": 1.0406720638275146,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 9870
+    },
+    {
+      "epoch": 0.09871,
+      "grad_norm": 0.9774965047836304,
+      "learning_rate": 0.003,
+      "loss": 4.1126,
+      "step": 9871
+    },
+    {
+      "epoch": 0.09872,
+      "grad_norm": 1.0072287321090698,
+      "learning_rate": 0.003,
+      "loss": 4.0701,
+      "step": 9872
+    },
+    {
+      "epoch": 0.09873,
+      "grad_norm": 1.0737255811691284,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 9873
+    },
+    {
+      "epoch": 0.09874,
+      "grad_norm": 0.970801591873169,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 9874
+    },
+    {
+      "epoch": 0.09875,
+      "grad_norm": 1.119269847869873,
+      "learning_rate": 0.003,
+      "loss": 4.0635,
+      "step": 9875
+    },
+    {
+      "epoch": 0.09876,
+      "grad_norm": 0.9882863759994507,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 9876
+    },
+    {
+      "epoch": 0.09877,
+      "grad_norm": 0.8598633408546448,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 9877
+    },
+    {
+      "epoch": 0.09878,
+      "grad_norm": 0.8976407647132874,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 9878
+    },
+    {
+      "epoch": 0.09879,
+      "grad_norm": 1.0029267072677612,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 9879
+    },
+    {
+      "epoch": 0.0988,
+      "grad_norm": 1.0229839086532593,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 9880
+    },
+    {
+      "epoch": 0.09881,
+      "grad_norm": 1.1409919261932373,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 9881
+    },
+    {
+      "epoch": 0.09882,
+      "grad_norm": 0.9878159165382385,
+      "learning_rate": 0.003,
+      "loss": 4.0707,
+      "step": 9882
+    },
+    {
+      "epoch": 0.09883,
+      "grad_norm": 0.8589847087860107,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 9883
+    },
+    {
+      "epoch": 0.09884,
+      "grad_norm": 0.645190417766571,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 9884
+    },
+    {
+      "epoch": 0.09885,
+      "grad_norm": 0.5982037782669067,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 9885
+    },
+    {
+      "epoch": 0.09886,
+      "grad_norm": 0.5794589519500732,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 9886
+    },
+    {
+      "epoch": 0.09887,
+      "grad_norm": 0.622081995010376,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 9887
+    },
+    {
+      "epoch": 0.09888,
+      "grad_norm": 0.6355760097503662,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 9888
+    },
+    {
+      "epoch": 0.09889,
+      "grad_norm": 0.5906744003295898,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 9889
+    },
+    {
+      "epoch": 0.0989,
+      "grad_norm": 0.7298018336296082,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 9890
+    },
+    {
+      "epoch": 0.09891,
+      "grad_norm": 0.7977643609046936,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 9891
+    },
+    {
+      "epoch": 0.09892,
+      "grad_norm": 0.7684615254402161,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 9892
+    },
+    {
+      "epoch": 0.09893,
+      "grad_norm": 0.6070899963378906,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 9893
+    },
+    {
+      "epoch": 0.09894,
+      "grad_norm": 0.49417591094970703,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 9894
+    },
+    {
+      "epoch": 0.09895,
+      "grad_norm": 0.4215652644634247,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 9895
+    },
+    {
+      "epoch": 0.09896,
+      "grad_norm": 0.3874669373035431,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 9896
+    },
+    {
+      "epoch": 0.09897,
+      "grad_norm": 0.4470568895339966,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 9897
+    },
+    {
+      "epoch": 0.09898,
+      "grad_norm": 0.4585300087928772,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 9898
+    },
+    {
+      "epoch": 0.09899,
+      "grad_norm": 0.40234246850013733,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 9899
+    },
+    {
+      "epoch": 0.099,
+      "grad_norm": 0.4411037564277649,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 9900
+    },
+    {
+      "epoch": 0.09901,
+      "grad_norm": 0.4925287663936615,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 9901
+    },
+    {
+      "epoch": 0.09902,
+      "grad_norm": 0.5623918771743774,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 9902
+    },
+    {
+      "epoch": 0.09903,
+      "grad_norm": 0.6239189505577087,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 9903
+    },
+    {
+      "epoch": 0.09904,
+      "grad_norm": 0.6974774599075317,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 9904
+    },
+    {
+      "epoch": 0.09905,
+      "grad_norm": 0.847841739654541,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 9905
+    },
+    {
+      "epoch": 0.09906,
+      "grad_norm": 0.9315091967582703,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 9906
+    },
+    {
+      "epoch": 0.09907,
+      "grad_norm": 0.9649443626403809,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 9907
+    },
+    {
+      "epoch": 0.09908,
+      "grad_norm": 1.1252580881118774,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 9908
+    },
+    {
+      "epoch": 0.09909,
+      "grad_norm": 0.7856508493423462,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 9909
+    },
+    {
+      "epoch": 0.0991,
+      "grad_norm": 0.6007184386253357,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 9910
+    },
+    {
+      "epoch": 0.09911,
+      "grad_norm": 0.5230900645256042,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 9911
+    },
+    {
+      "epoch": 0.09912,
+      "grad_norm": 0.6029075384140015,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 9912
+    },
+    {
+      "epoch": 0.09913,
+      "grad_norm": 0.6787557601928711,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 9913
+    },
+    {
+      "epoch": 0.09914,
+      "grad_norm": 0.7057299017906189,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 9914
+    },
+    {
+      "epoch": 0.09915,
+      "grad_norm": 0.8509723544120789,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 9915
+    },
+    {
+      "epoch": 0.09916,
+      "grad_norm": 1.0318037271499634,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 9916
+    },
+    {
+      "epoch": 0.09917,
+      "grad_norm": 0.9847546219825745,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 9917
+    },
+    {
+      "epoch": 0.09918,
+      "grad_norm": 0.8432486653327942,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 9918
+    },
+    {
+      "epoch": 0.09919,
+      "grad_norm": 0.7586849331855774,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 9919
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.7176387906074524,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 9920
+    },
+    {
+      "epoch": 0.09921,
+      "grad_norm": 0.6509994864463806,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 9921
+    },
+    {
+      "epoch": 0.09922,
+      "grad_norm": 0.7130661606788635,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 9922
+    },
+    {
+      "epoch": 0.09923,
+      "grad_norm": 0.9413552284240723,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 9923
+    },
+    {
+      "epoch": 0.09924,
+      "grad_norm": 1.1508615016937256,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 9924
+    },
+    {
+      "epoch": 0.09925,
+      "grad_norm": 0.6644554138183594,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 9925
+    },
+    {
+      "epoch": 0.09926,
+      "grad_norm": 0.5794418454170227,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 9926
+    },
+    {
+      "epoch": 0.09927,
+      "grad_norm": 0.5938196778297424,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 9927
+    },
+    {
+      "epoch": 0.09928,
+      "grad_norm": 0.5763348937034607,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 9928
+    },
+    {
+      "epoch": 0.09929,
+      "grad_norm": 0.6028631329536438,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 9929
+    },
+    {
+      "epoch": 0.0993,
+      "grad_norm": 0.6171523928642273,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 9930
+    },
+    {
+      "epoch": 0.09931,
+      "grad_norm": 0.7361319065093994,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 9931
+    },
+    {
+      "epoch": 0.09932,
+      "grad_norm": 0.7786489725112915,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 9932
+    },
+    {
+      "epoch": 0.09933,
+      "grad_norm": 0.7424275279045105,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 9933
+    },
+    {
+      "epoch": 0.09934,
+      "grad_norm": 0.6433824300765991,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 9934
+    },
+    {
+      "epoch": 0.09935,
+      "grad_norm": 0.6526421308517456,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 9935
+    },
+    {
+      "epoch": 0.09936,
+      "grad_norm": 0.5947155952453613,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 9936
+    },
+    {
+      "epoch": 0.09937,
+      "grad_norm": 0.6477751731872559,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 9937
+    },
+    {
+      "epoch": 0.09938,
+      "grad_norm": 0.845565676689148,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 9938
+    },
+    {
+      "epoch": 0.09939,
+      "grad_norm": 1.120976448059082,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 9939
+    },
+    {
+      "epoch": 0.0994,
+      "grad_norm": 0.9667741656303406,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 9940
+    },
+    {
+      "epoch": 0.09941,
+      "grad_norm": 0.7664325833320618,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 9941
+    },
+    {
+      "epoch": 0.09942,
+      "grad_norm": 0.6430301666259766,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 9942
+    },
+    {
+      "epoch": 0.09943,
+      "grad_norm": 0.6047747135162354,
+      "learning_rate": 0.003,
+      "loss": 3.9678,
+      "step": 9943
+    },
+    {
+      "epoch": 0.09944,
+      "grad_norm": 0.582067608833313,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 9944
+    },
+    {
+      "epoch": 0.09945,
+      "grad_norm": 0.8082442283630371,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 9945
+    },
+    {
+      "epoch": 0.09946,
+      "grad_norm": 1.1517560482025146,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 9946
+    },
+    {
+      "epoch": 0.09947,
+      "grad_norm": 0.9363013505935669,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 9947
+    },
+    {
+      "epoch": 0.09948,
+      "grad_norm": 0.7373224496841431,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 9948
+    },
+    {
+      "epoch": 0.09949,
+      "grad_norm": 0.6157240867614746,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 9949
+    },
+    {
+      "epoch": 0.0995,
+      "grad_norm": 0.6930040121078491,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 9950
+    },
+    {
+      "epoch": 0.09951,
+      "grad_norm": 0.7911564111709595,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 9951
+    },
+    {
+      "epoch": 0.09952,
+      "grad_norm": 0.8492519855499268,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 9952
+    },
+    {
+      "epoch": 0.09953,
+      "grad_norm": 0.8654104471206665,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 9953
+    },
+    {
+      "epoch": 0.09954,
+      "grad_norm": 0.9337427020072937,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 9954
+    },
+    {
+      "epoch": 0.09955,
+      "grad_norm": 1.0466641187667847,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 9955
+    },
+    {
+      "epoch": 0.09956,
+      "grad_norm": 0.9967005848884583,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 9956
+    },
+    {
+      "epoch": 0.09957,
+      "grad_norm": 1.014816164970398,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 9957
+    },
+    {
+      "epoch": 0.09958,
+      "grad_norm": 0.9045199155807495,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 9958
+    },
+    {
+      "epoch": 0.09959,
+      "grad_norm": 0.9327095746994019,
+      "learning_rate": 0.003,
+      "loss": 4.06,
+      "step": 9959
+    },
+    {
+      "epoch": 0.0996,
+      "grad_norm": 0.9663153886795044,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 9960
+    },
+    {
+      "epoch": 0.09961,
+      "grad_norm": 0.983515739440918,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 9961
+    },
+    {
+      "epoch": 0.09962,
+      "grad_norm": 1.0190386772155762,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 9962
+    },
+    {
+      "epoch": 0.09963,
+      "grad_norm": 0.8583796620368958,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 9963
+    },
+    {
+      "epoch": 0.09964,
+      "grad_norm": 0.7809299826622009,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 9964
+    },
+    {
+      "epoch": 0.09965,
+      "grad_norm": 0.8016514778137207,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 9965
+    },
+    {
+      "epoch": 0.09966,
+      "grad_norm": 0.743135929107666,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 9966
+    },
+    {
+      "epoch": 0.09967,
+      "grad_norm": 0.7582171559333801,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 9967
+    },
+    {
+      "epoch": 0.09968,
+      "grad_norm": 0.6683290004730225,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 9968
+    },
+    {
+      "epoch": 0.09969,
+      "grad_norm": 0.6653891205787659,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 9969
+    },
+    {
+      "epoch": 0.0997,
+      "grad_norm": 0.6902685761451721,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 9970
+    },
+    {
+      "epoch": 0.09971,
+      "grad_norm": 1.0166124105453491,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 9971
+    },
+    {
+      "epoch": 0.09972,
+      "grad_norm": 1.0039961338043213,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 9972
+    },
+    {
+      "epoch": 0.09973,
+      "grad_norm": 0.9374507665634155,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 9973
+    },
+    {
+      "epoch": 0.09974,
+      "grad_norm": 0.8050880432128906,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 9974
+    },
+    {
+      "epoch": 0.09975,
+      "grad_norm": 0.6863667964935303,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 9975
+    },
+    {
+      "epoch": 0.09976,
+      "grad_norm": 0.5954990983009338,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 9976
+    },
+    {
+      "epoch": 0.09977,
+      "grad_norm": 0.7258602976799011,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 9977
+    },
+    {
+      "epoch": 0.09978,
+      "grad_norm": 0.8950538039207458,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 9978
+    },
+    {
+      "epoch": 0.09979,
+      "grad_norm": 1.0293006896972656,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 9979
+    },
+    {
+      "epoch": 0.0998,
+      "grad_norm": 0.9026445746421814,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 9980
+    },
+    {
+      "epoch": 0.09981,
+      "grad_norm": 0.7071680426597595,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 9981
+    },
+    {
+      "epoch": 0.09982,
+      "grad_norm": 0.5843554735183716,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 9982
+    },
+    {
+      "epoch": 0.09983,
+      "grad_norm": 0.585106611251831,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 9983
+    },
+    {
+      "epoch": 0.09984,
+      "grad_norm": 0.600675106048584,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 9984
+    },
+    {
+      "epoch": 0.09985,
+      "grad_norm": 0.6129768490791321,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 9985
+    },
+    {
+      "epoch": 0.09986,
+      "grad_norm": 0.6755698323249817,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 9986
+    },
+    {
+      "epoch": 0.09987,
+      "grad_norm": 0.7202430367469788,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 9987
+    },
+    {
+      "epoch": 0.09988,
+      "grad_norm": 0.8828611969947815,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 9988
+    },
+    {
+      "epoch": 0.09989,
+      "grad_norm": 0.9760780930519104,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 9989
+    },
+    {
+      "epoch": 0.0999,
+      "grad_norm": 0.8992119431495667,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 9990
+    },
+    {
+      "epoch": 0.09991,
+      "grad_norm": 0.730566680431366,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 9991
+    },
+    {
+      "epoch": 0.09992,
+      "grad_norm": 0.6713852882385254,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 9992
+    },
+    {
+      "epoch": 0.09993,
+      "grad_norm": 0.6310202479362488,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 9993
+    },
+    {
+      "epoch": 0.09994,
+      "grad_norm": 0.5760572552680969,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 9994
+    },
+    {
+      "epoch": 0.09995,
+      "grad_norm": 0.5813117027282715,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 9995
+    },
+    {
+      "epoch": 0.09996,
+      "grad_norm": 0.6562214493751526,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 9996
+    },
+    {
+      "epoch": 0.09997,
+      "grad_norm": 0.6452088952064514,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 9997
+    },
+    {
+      "epoch": 0.09998,
+      "grad_norm": 0.6939143538475037,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 9998
+    },
+    {
+      "epoch": 0.09999,
+      "grad_norm": 0.8035266399383545,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 9999
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.8756668567657471,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 10000
+    },
+    {
+      "epoch": 0.10001,
+      "grad_norm": 0.8760703802108765,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 10001
+    },
+    {
+      "epoch": 0.10002,
+      "grad_norm": 0.7529221177101135,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 10002
+    },
+    {
+      "epoch": 0.10003,
+      "grad_norm": 0.7163582444190979,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 10003
+    },
+    {
+      "epoch": 0.10004,
+      "grad_norm": 0.8936163783073425,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 10004
+    },
+    {
+      "epoch": 0.10005,
+      "grad_norm": 1.0073555707931519,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 10005
+    },
+    {
+      "epoch": 0.10006,
+      "grad_norm": 0.917222797870636,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 10006
+    },
+    {
+      "epoch": 0.10007,
+      "grad_norm": 0.8444756269454956,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 10007
+    },
+    {
+      "epoch": 0.10008,
+      "grad_norm": 0.7267651557922363,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 10008
+    },
+    {
+      "epoch": 0.10009,
+      "grad_norm": 0.7675696015357971,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 10009
+    },
+    {
+      "epoch": 0.1001,
+      "grad_norm": 0.7582693696022034,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 10010
+    },
+    {
+      "epoch": 0.10011,
+      "grad_norm": 0.6846451163291931,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 10011
+    },
+    {
+      "epoch": 0.10012,
+      "grad_norm": 0.729497492313385,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 10012
+    },
+    {
+      "epoch": 0.10013,
+      "grad_norm": 0.856055498123169,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 10013
+    },
+    {
+      "epoch": 0.10014,
+      "grad_norm": 0.9490603804588318,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 10014
+    },
+    {
+      "epoch": 0.10015,
+      "grad_norm": 0.8758609890937805,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 10015
+    },
+    {
+      "epoch": 0.10016,
+      "grad_norm": 0.8611948490142822,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 10016
+    },
+    {
+      "epoch": 0.10017,
+      "grad_norm": 0.7740161418914795,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 10017
+    },
+    {
+      "epoch": 0.10018,
+      "grad_norm": 0.6666895151138306,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 10018
+    },
+    {
+      "epoch": 0.10019,
+      "grad_norm": 0.6955499053001404,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 10019
+    },
+    {
+      "epoch": 0.1002,
+      "grad_norm": 0.6729419827461243,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 10020
+    },
+    {
+      "epoch": 0.10021,
+      "grad_norm": 0.7229282855987549,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 10021
+    },
+    {
+      "epoch": 0.10022,
+      "grad_norm": 0.8583958745002747,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 10022
+    },
+    {
+      "epoch": 0.10023,
+      "grad_norm": 0.8976644277572632,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 10023
+    },
+    {
+      "epoch": 0.10024,
+      "grad_norm": 0.8202301859855652,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 10024
+    },
+    {
+      "epoch": 0.10025,
+      "grad_norm": 0.7559810280799866,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 10025
+    },
+    {
+      "epoch": 0.10026,
+      "grad_norm": 0.7426246404647827,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 10026
+    },
+    {
+      "epoch": 0.10027,
+      "grad_norm": 0.7772095203399658,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 10027
+    },
+    {
+      "epoch": 0.10028,
+      "grad_norm": 0.7090165615081787,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 10028
+    },
+    {
+      "epoch": 0.10029,
+      "grad_norm": 0.751837432384491,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 10029
+    },
+    {
+      "epoch": 0.1003,
+      "grad_norm": 0.7565678358078003,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 10030
+    },
+    {
+      "epoch": 0.10031,
+      "grad_norm": 0.723259687423706,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 10031
+    },
+    {
+      "epoch": 0.10032,
+      "grad_norm": 0.6929975748062134,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 10032
+    },
+    {
+      "epoch": 0.10033,
+      "grad_norm": 0.7323466539382935,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 10033
+    },
+    {
+      "epoch": 0.10034,
+      "grad_norm": 0.7538563013076782,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 10034
+    },
+    {
+      "epoch": 0.10035,
+      "grad_norm": 0.7490745782852173,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 10035
+    },
+    {
+      "epoch": 0.10036,
+      "grad_norm": 0.7792319655418396,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 10036
+    },
+    {
+      "epoch": 0.10037,
+      "grad_norm": 0.82288658618927,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 10037
+    },
+    {
+      "epoch": 0.10038,
+      "grad_norm": 0.8214668035507202,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 10038
+    },
+    {
+      "epoch": 0.10039,
+      "grad_norm": 0.8127548098564148,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 10039
+    },
+    {
+      "epoch": 0.1004,
+      "grad_norm": 0.9639657139778137,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 10040
+    },
+    {
+      "epoch": 0.10041,
+      "grad_norm": 1.0831257104873657,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 10041
+    },
+    {
+      "epoch": 0.10042,
+      "grad_norm": 0.8846466541290283,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 10042
+    },
+    {
+      "epoch": 0.10043,
+      "grad_norm": 0.8289374709129333,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 10043
+    },
+    {
+      "epoch": 0.10044,
+      "grad_norm": 0.7353184223175049,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 10044
+    },
+    {
+      "epoch": 0.10045,
+      "grad_norm": 0.7047366499900818,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 10045
+    },
+    {
+      "epoch": 0.10046,
+      "grad_norm": 0.6355120539665222,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 10046
+    },
+    {
+      "epoch": 0.10047,
+      "grad_norm": 0.6405349969863892,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 10047
+    },
+    {
+      "epoch": 0.10048,
+      "grad_norm": 0.5647373795509338,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 10048
+    },
+    {
+      "epoch": 0.10049,
+      "grad_norm": 0.5304549336433411,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 10049
+    },
+    {
+      "epoch": 0.1005,
+      "grad_norm": 0.5612930059432983,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 10050
+    },
+    {
+      "epoch": 0.10051,
+      "grad_norm": 0.6478097438812256,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 10051
+    },
+    {
+      "epoch": 0.10052,
+      "grad_norm": 0.7477694153785706,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 10052
+    },
+    {
+      "epoch": 0.10053,
+      "grad_norm": 0.8523183465003967,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 10053
+    },
+    {
+      "epoch": 0.10054,
+      "grad_norm": 0.9100677371025085,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 10054
+    },
+    {
+      "epoch": 0.10055,
+      "grad_norm": 0.969436764717102,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 10055
+    },
+    {
+      "epoch": 0.10056,
+      "grad_norm": 0.8593199849128723,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 10056
+    },
+    {
+      "epoch": 0.10057,
+      "grad_norm": 0.7369247078895569,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 10057
+    },
+    {
+      "epoch": 0.10058,
+      "grad_norm": 0.6502119898796082,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 10058
+    },
+    {
+      "epoch": 0.10059,
+      "grad_norm": 0.6754664182662964,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 10059
+    },
+    {
+      "epoch": 0.1006,
+      "grad_norm": 0.7896672487258911,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 10060
+    },
+    {
+      "epoch": 0.10061,
+      "grad_norm": 0.7812275886535645,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 10061
+    },
+    {
+      "epoch": 0.10062,
+      "grad_norm": 0.7402509450912476,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 10062
+    },
+    {
+      "epoch": 0.10063,
+      "grad_norm": 0.7005326747894287,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 10063
+    },
+    {
+      "epoch": 0.10064,
+      "grad_norm": 0.6713504791259766,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 10064
+    },
+    {
+      "epoch": 0.10065,
+      "grad_norm": 0.6878002285957336,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 10065
+    },
+    {
+      "epoch": 0.10066,
+      "grad_norm": 0.7443453669548035,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 10066
+    },
+    {
+      "epoch": 0.10067,
+      "grad_norm": 0.8456590175628662,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 10067
+    },
+    {
+      "epoch": 0.10068,
+      "grad_norm": 1.0721101760864258,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 10068
+    },
+    {
+      "epoch": 0.10069,
+      "grad_norm": 0.9349250197410583,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 10069
+    },
+    {
+      "epoch": 0.1007,
+      "grad_norm": 0.7791523933410645,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 10070
+    },
+    {
+      "epoch": 0.10071,
+      "grad_norm": 0.643773078918457,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 10071
+    },
+    {
+      "epoch": 0.10072,
+      "grad_norm": 0.5889289379119873,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10072
+    },
+    {
+      "epoch": 0.10073,
+      "grad_norm": 0.6941524147987366,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 10073
+    },
+    {
+      "epoch": 0.10074,
+      "grad_norm": 0.812353253364563,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 10074
+    },
+    {
+      "epoch": 0.10075,
+      "grad_norm": 0.848740816116333,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 10075
+    },
+    {
+      "epoch": 0.10076,
+      "grad_norm": 0.8380178213119507,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 10076
+    },
+    {
+      "epoch": 0.10077,
+      "grad_norm": 0.8449537754058838,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 10077
+    },
+    {
+      "epoch": 0.10078,
+      "grad_norm": 0.8786484599113464,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 10078
+    },
+    {
+      "epoch": 0.10079,
+      "grad_norm": 0.7593846321105957,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 10079
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.7023867964744568,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 10080
+    },
+    {
+      "epoch": 0.10081,
+      "grad_norm": 0.7081405520439148,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 10081
+    },
+    {
+      "epoch": 0.10082,
+      "grad_norm": 0.6782942414283752,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 10082
+    },
+    {
+      "epoch": 0.10083,
+      "grad_norm": 0.7835003137588501,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 10083
+    },
+    {
+      "epoch": 0.10084,
+      "grad_norm": 1.1654982566833496,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 10084
+    },
+    {
+      "epoch": 0.10085,
+      "grad_norm": 1.077035665512085,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 10085
+    },
+    {
+      "epoch": 0.10086,
+      "grad_norm": 0.7526758313179016,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 10086
+    },
+    {
+      "epoch": 0.10087,
+      "grad_norm": 0.572903037071228,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 10087
+    },
+    {
+      "epoch": 0.10088,
+      "grad_norm": 0.5392580032348633,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 10088
+    },
+    {
+      "epoch": 0.10089,
+      "grad_norm": 0.5970590710639954,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 10089
+    },
+    {
+      "epoch": 0.1009,
+      "grad_norm": 0.6526892185211182,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 10090
+    },
+    {
+      "epoch": 0.10091,
+      "grad_norm": 0.7266455888748169,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 10091
+    },
+    {
+      "epoch": 0.10092,
+      "grad_norm": 0.7619020342826843,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 10092
+    },
+    {
+      "epoch": 0.10093,
+      "grad_norm": 0.7967933416366577,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 10093
+    },
+    {
+      "epoch": 0.10094,
+      "grad_norm": 0.8304819464683533,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 10094
+    },
+    {
+      "epoch": 0.10095,
+      "grad_norm": 0.7637600898742676,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 10095
+    },
+    {
+      "epoch": 0.10096,
+      "grad_norm": 0.6716521382331848,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 10096
+    },
+    {
+      "epoch": 0.10097,
+      "grad_norm": 0.7301881909370422,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 10097
+    },
+    {
+      "epoch": 0.10098,
+      "grad_norm": 0.8763124942779541,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 10098
+    },
+    {
+      "epoch": 0.10099,
+      "grad_norm": 0.9246903657913208,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 10099
+    },
+    {
+      "epoch": 0.101,
+      "grad_norm": 0.852162778377533,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10100
+    },
+    {
+      "epoch": 0.10101,
+      "grad_norm": 0.7550850510597229,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 10101
+    },
+    {
+      "epoch": 0.10102,
+      "grad_norm": 0.7212772369384766,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 10102
+    },
+    {
+      "epoch": 0.10103,
+      "grad_norm": 0.749382734298706,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 10103
+    },
+    {
+      "epoch": 0.10104,
+      "grad_norm": 0.8882433772087097,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 10104
+    },
+    {
+      "epoch": 0.10105,
+      "grad_norm": 1.2675986289978027,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 10105
+    },
+    {
+      "epoch": 0.10106,
+      "grad_norm": 0.8278036117553711,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 10106
+    },
+    {
+      "epoch": 0.10107,
+      "grad_norm": 0.7198725342750549,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 10107
+    },
+    {
+      "epoch": 0.10108,
+      "grad_norm": 0.6387250423431396,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 10108
+    },
+    {
+      "epoch": 0.10109,
+      "grad_norm": 0.5777958631515503,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 10109
+    },
+    {
+      "epoch": 0.1011,
+      "grad_norm": 0.6204786896705627,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 10110
+    },
+    {
+      "epoch": 0.10111,
+      "grad_norm": 0.5600338578224182,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 10111
+    },
+    {
+      "epoch": 0.10112,
+      "grad_norm": 0.6853824853897095,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 10112
+    },
+    {
+      "epoch": 0.10113,
+      "grad_norm": 0.8152238130569458,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 10113
+    },
+    {
+      "epoch": 0.10114,
+      "grad_norm": 0.9304043054580688,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 10114
+    },
+    {
+      "epoch": 0.10115,
+      "grad_norm": 1.061358094215393,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 10115
+    },
+    {
+      "epoch": 0.10116,
+      "grad_norm": 0.9930733442306519,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 10116
+    },
+    {
+      "epoch": 0.10117,
+      "grad_norm": 0.8263692855834961,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 10117
+    },
+    {
+      "epoch": 0.10118,
+      "grad_norm": 0.6010589599609375,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 10118
+    },
+    {
+      "epoch": 0.10119,
+      "grad_norm": 0.7774295210838318,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 10119
+    },
+    {
+      "epoch": 0.1012,
+      "grad_norm": 0.9463669061660767,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 10120
+    },
+    {
+      "epoch": 0.10121,
+      "grad_norm": 1.0663201808929443,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10121
+    },
+    {
+      "epoch": 0.10122,
+      "grad_norm": 0.7437269687652588,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 10122
+    },
+    {
+      "epoch": 0.10123,
+      "grad_norm": 0.5977786779403687,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 10123
+    },
+    {
+      "epoch": 0.10124,
+      "grad_norm": 0.8191568851470947,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 10124
+    },
+    {
+      "epoch": 0.10125,
+      "grad_norm": 0.847891628742218,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10125
+    },
+    {
+      "epoch": 0.10126,
+      "grad_norm": 0.9732335805892944,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 10126
+    },
+    {
+      "epoch": 0.10127,
+      "grad_norm": 1.0408259630203247,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 10127
+    },
+    {
+      "epoch": 0.10128,
+      "grad_norm": 0.8466975092887878,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 10128
+    },
+    {
+      "epoch": 0.10129,
+      "grad_norm": 0.8226129412651062,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 10129
+    },
+    {
+      "epoch": 0.1013,
+      "grad_norm": 0.7009112238883972,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 10130
+    },
+    {
+      "epoch": 0.10131,
+      "grad_norm": 0.6498497724533081,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 10131
+    },
+    {
+      "epoch": 0.10132,
+      "grad_norm": 0.718295693397522,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10132
+    },
+    {
+      "epoch": 0.10133,
+      "grad_norm": 0.8029374480247498,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 10133
+    },
+    {
+      "epoch": 0.10134,
+      "grad_norm": 0.7989256381988525,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 10134
+    },
+    {
+      "epoch": 0.10135,
+      "grad_norm": 0.7786409854888916,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 10135
+    },
+    {
+      "epoch": 0.10136,
+      "grad_norm": 0.8297443985939026,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 10136
+    },
+    {
+      "epoch": 0.10137,
+      "grad_norm": 0.8420387506484985,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 10137
+    },
+    {
+      "epoch": 0.10138,
+      "grad_norm": 0.7902016043663025,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10138
+    },
+    {
+      "epoch": 0.10139,
+      "grad_norm": 0.6987302899360657,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 10139
+    },
+    {
+      "epoch": 0.1014,
+      "grad_norm": 0.7255557179450989,
+      "learning_rate": 0.003,
+      "loss": 4.0576,
+      "step": 10140
+    },
+    {
+      "epoch": 0.10141,
+      "grad_norm": 0.8124567866325378,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 10141
+    },
+    {
+      "epoch": 0.10142,
+      "grad_norm": 0.8950157761573792,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 10142
+    },
+    {
+      "epoch": 0.10143,
+      "grad_norm": 1.0962830781936646,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 10143
+    },
+    {
+      "epoch": 0.10144,
+      "grad_norm": 0.9343271255493164,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 10144
+    },
+    {
+      "epoch": 0.10145,
+      "grad_norm": 0.8869456052780151,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 10145
+    },
+    {
+      "epoch": 0.10146,
+      "grad_norm": 0.875791609287262,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 10146
+    },
+    {
+      "epoch": 0.10147,
+      "grad_norm": 0.7010773420333862,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 10147
+    },
+    {
+      "epoch": 0.10148,
+      "grad_norm": 0.6326318383216858,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 10148
+    },
+    {
+      "epoch": 0.10149,
+      "grad_norm": 0.6735743284225464,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 10149
+    },
+    {
+      "epoch": 0.1015,
+      "grad_norm": 0.6270667910575867,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 10150
+    },
+    {
+      "epoch": 0.10151,
+      "grad_norm": 0.7168855667114258,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 10151
+    },
+    {
+      "epoch": 0.10152,
+      "grad_norm": 0.7000669836997986,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 10152
+    },
+    {
+      "epoch": 0.10153,
+      "grad_norm": 0.7381427884101868,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 10153
+    },
+    {
+      "epoch": 0.10154,
+      "grad_norm": 0.7226046919822693,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 10154
+    },
+    {
+      "epoch": 0.10155,
+      "grad_norm": 0.653364896774292,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 10155
+    },
+    {
+      "epoch": 0.10156,
+      "grad_norm": 0.7581577301025391,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 10156
+    },
+    {
+      "epoch": 0.10157,
+      "grad_norm": 0.7567413449287415,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 10157
+    },
+    {
+      "epoch": 0.10158,
+      "grad_norm": 0.6570749282836914,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 10158
+    },
+    {
+      "epoch": 0.10159,
+      "grad_norm": 0.8515116572380066,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 10159
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.830341637134552,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 10160
+    },
+    {
+      "epoch": 0.10161,
+      "grad_norm": 0.6846141219139099,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 10161
+    },
+    {
+      "epoch": 0.10162,
+      "grad_norm": 0.7712964415550232,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 10162
+    },
+    {
+      "epoch": 0.10163,
+      "grad_norm": 0.8644002079963684,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 10163
+    },
+    {
+      "epoch": 0.10164,
+      "grad_norm": 1.1422466039657593,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 10164
+    },
+    {
+      "epoch": 0.10165,
+      "grad_norm": 0.9871522188186646,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 10165
+    },
+    {
+      "epoch": 0.10166,
+      "grad_norm": 1.0399481058120728,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 10166
+    },
+    {
+      "epoch": 0.10167,
+      "grad_norm": 0.9927687644958496,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 10167
+    },
+    {
+      "epoch": 0.10168,
+      "grad_norm": 0.9502347111701965,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 10168
+    },
+    {
+      "epoch": 0.10169,
+      "grad_norm": 0.784534215927124,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 10169
+    },
+    {
+      "epoch": 0.1017,
+      "grad_norm": 0.6386174559593201,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 10170
+    },
+    {
+      "epoch": 0.10171,
+      "grad_norm": 0.5796541571617126,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 10171
+    },
+    {
+      "epoch": 0.10172,
+      "grad_norm": 0.5448047518730164,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 10172
+    },
+    {
+      "epoch": 0.10173,
+      "grad_norm": 0.5247647166252136,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 10173
+    },
+    {
+      "epoch": 0.10174,
+      "grad_norm": 0.6259518265724182,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 10174
+    },
+    {
+      "epoch": 0.10175,
+      "grad_norm": 0.6033033132553101,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 10175
+    },
+    {
+      "epoch": 0.10176,
+      "grad_norm": 0.5712244510650635,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 10176
+    },
+    {
+      "epoch": 0.10177,
+      "grad_norm": 0.515041172504425,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 10177
+    },
+    {
+      "epoch": 0.10178,
+      "grad_norm": 0.45743808150291443,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 10178
+    },
+    {
+      "epoch": 0.10179,
+      "grad_norm": 0.42596837878227234,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 10179
+    },
+    {
+      "epoch": 0.1018,
+      "grad_norm": 0.47992241382598877,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 10180
+    },
+    {
+      "epoch": 0.10181,
+      "grad_norm": 0.5535048246383667,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 10181
+    },
+    {
+      "epoch": 0.10182,
+      "grad_norm": 0.7175001502037048,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 10182
+    },
+    {
+      "epoch": 0.10183,
+      "grad_norm": 0.9698479175567627,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 10183
+    },
+    {
+      "epoch": 0.10184,
+      "grad_norm": 1.2325149774551392,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 10184
+    },
+    {
+      "epoch": 0.10185,
+      "grad_norm": 0.6251413226127625,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 10185
+    },
+    {
+      "epoch": 0.10186,
+      "grad_norm": 0.6279639005661011,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 10186
+    },
+    {
+      "epoch": 0.10187,
+      "grad_norm": 0.7553200721740723,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 10187
+    },
+    {
+      "epoch": 0.10188,
+      "grad_norm": 0.9480890035629272,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 10188
+    },
+    {
+      "epoch": 0.10189,
+      "grad_norm": 1.0561100244522095,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 10189
+    },
+    {
+      "epoch": 0.1019,
+      "grad_norm": 0.9566508531570435,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 10190
+    },
+    {
+      "epoch": 0.10191,
+      "grad_norm": 0.9126545786857605,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 10191
+    },
+    {
+      "epoch": 0.10192,
+      "grad_norm": 0.8582307696342468,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 10192
+    },
+    {
+      "epoch": 0.10193,
+      "grad_norm": 0.859767735004425,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 10193
+    },
+    {
+      "epoch": 0.10194,
+      "grad_norm": 0.9116681218147278,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 10194
+    },
+    {
+      "epoch": 0.10195,
+      "grad_norm": 0.8132261037826538,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 10195
+    },
+    {
+      "epoch": 0.10196,
+      "grad_norm": 0.7543628215789795,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 10196
+    },
+    {
+      "epoch": 0.10197,
+      "grad_norm": 0.7262082099914551,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 10197
+    },
+    {
+      "epoch": 0.10198,
+      "grad_norm": 0.6579393744468689,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 10198
+    },
+    {
+      "epoch": 0.10199,
+      "grad_norm": 0.5839563608169556,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 10199
+    },
+    {
+      "epoch": 0.102,
+      "grad_norm": 0.6411651968955994,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 10200
+    },
+    {
+      "epoch": 0.10201,
+      "grad_norm": 0.8397372364997864,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 10201
+    },
+    {
+      "epoch": 0.10202,
+      "grad_norm": 1.038834810256958,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 10202
+    },
+    {
+      "epoch": 0.10203,
+      "grad_norm": 1.2775077819824219,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 10203
+    },
+    {
+      "epoch": 0.10204,
+      "grad_norm": 0.778706431388855,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 10204
+    },
+    {
+      "epoch": 0.10205,
+      "grad_norm": 0.7379663586616516,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 10205
+    },
+    {
+      "epoch": 0.10206,
+      "grad_norm": 0.7955054640769958,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 10206
+    },
+    {
+      "epoch": 0.10207,
+      "grad_norm": 0.8948112726211548,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10207
+    },
+    {
+      "epoch": 0.10208,
+      "grad_norm": 1.1121752262115479,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 10208
+    },
+    {
+      "epoch": 0.10209,
+      "grad_norm": 1.0435898303985596,
+      "learning_rate": 0.003,
+      "loss": 4.0521,
+      "step": 10209
+    },
+    {
+      "epoch": 0.1021,
+      "grad_norm": 0.8086763024330139,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 10210
+    },
+    {
+      "epoch": 0.10211,
+      "grad_norm": 0.796384334564209,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 10211
+    },
+    {
+      "epoch": 0.10212,
+      "grad_norm": 0.8457871079444885,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 10212
+    },
+    {
+      "epoch": 0.10213,
+      "grad_norm": 0.8126521110534668,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 10213
+    },
+    {
+      "epoch": 0.10214,
+      "grad_norm": 0.7107754945755005,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 10214
+    },
+    {
+      "epoch": 0.10215,
+      "grad_norm": 0.6939681172370911,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 10215
+    },
+    {
+      "epoch": 0.10216,
+      "grad_norm": 0.7338607907295227,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 10216
+    },
+    {
+      "epoch": 0.10217,
+      "grad_norm": 0.8102380037307739,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 10217
+    },
+    {
+      "epoch": 0.10218,
+      "grad_norm": 0.8121943473815918,
+      "learning_rate": 0.003,
+      "loss": 4.059,
+      "step": 10218
+    },
+    {
+      "epoch": 0.10219,
+      "grad_norm": 0.7863835692405701,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 10219
+    },
+    {
+      "epoch": 0.1022,
+      "grad_norm": 0.6928033232688904,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 10220
+    },
+    {
+      "epoch": 0.10221,
+      "grad_norm": 0.675073504447937,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 10221
+    },
+    {
+      "epoch": 0.10222,
+      "grad_norm": 0.5772913098335266,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 10222
+    },
+    {
+      "epoch": 0.10223,
+      "grad_norm": 0.5414987802505493,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10223
+    },
+    {
+      "epoch": 0.10224,
+      "grad_norm": 0.5829790830612183,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 10224
+    },
+    {
+      "epoch": 0.10225,
+      "grad_norm": 0.638757586479187,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 10225
+    },
+    {
+      "epoch": 0.10226,
+      "grad_norm": 0.6315435767173767,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 10226
+    },
+    {
+      "epoch": 0.10227,
+      "grad_norm": 0.7391036152839661,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 10227
+    },
+    {
+      "epoch": 0.10228,
+      "grad_norm": 0.8362815976142883,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 10228
+    },
+    {
+      "epoch": 0.10229,
+      "grad_norm": 1.0286784172058105,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 10229
+    },
+    {
+      "epoch": 0.1023,
+      "grad_norm": 1.009440541267395,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 10230
+    },
+    {
+      "epoch": 0.10231,
+      "grad_norm": 0.8905285000801086,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 10231
+    },
+    {
+      "epoch": 0.10232,
+      "grad_norm": 0.921527624130249,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 10232
+    },
+    {
+      "epoch": 0.10233,
+      "grad_norm": 0.8988191485404968,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 10233
+    },
+    {
+      "epoch": 0.10234,
+      "grad_norm": 0.7774051427841187,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 10234
+    },
+    {
+      "epoch": 0.10235,
+      "grad_norm": 0.7421836256980896,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 10235
+    },
+    {
+      "epoch": 0.10236,
+      "grad_norm": 0.7218093872070312,
+      "learning_rate": 0.003,
+      "loss": 4.0667,
+      "step": 10236
+    },
+    {
+      "epoch": 0.10237,
+      "grad_norm": 0.6867312788963318,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 10237
+    },
+    {
+      "epoch": 0.10238,
+      "grad_norm": 0.6688364148139954,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 10238
+    },
+    {
+      "epoch": 0.10239,
+      "grad_norm": 0.6108981370925903,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 10239
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.6995348930358887,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 10240
+    },
+    {
+      "epoch": 0.10241,
+      "grad_norm": 0.8642280101776123,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 10241
+    },
+    {
+      "epoch": 0.10242,
+      "grad_norm": 1.1544699668884277,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 10242
+    },
+    {
+      "epoch": 0.10243,
+      "grad_norm": 0.9920125007629395,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 10243
+    },
+    {
+      "epoch": 0.10244,
+      "grad_norm": 0.9115767478942871,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 10244
+    },
+    {
+      "epoch": 0.10245,
+      "grad_norm": 0.7791694402694702,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 10245
+    },
+    {
+      "epoch": 0.10246,
+      "grad_norm": 0.7047367691993713,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 10246
+    },
+    {
+      "epoch": 0.10247,
+      "grad_norm": 0.6797937154769897,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 10247
+    },
+    {
+      "epoch": 0.10248,
+      "grad_norm": 0.7237415313720703,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 10248
+    },
+    {
+      "epoch": 0.10249,
+      "grad_norm": 0.8145940899848938,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 10249
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.7724632024765015,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 10250
+    },
+    {
+      "epoch": 0.10251,
+      "grad_norm": 0.7524970769882202,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 10251
+    },
+    {
+      "epoch": 0.10252,
+      "grad_norm": 0.8881193399429321,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 10252
+    },
+    {
+      "epoch": 0.10253,
+      "grad_norm": 1.0016326904296875,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 10253
+    },
+    {
+      "epoch": 0.10254,
+      "grad_norm": 1.0284184217453003,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 10254
+    },
+    {
+      "epoch": 0.10255,
+      "grad_norm": 0.7833191156387329,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 10255
+    },
+    {
+      "epoch": 0.10256,
+      "grad_norm": 0.6149386167526245,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 10256
+    },
+    {
+      "epoch": 0.10257,
+      "grad_norm": 0.5318538546562195,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 10257
+    },
+    {
+      "epoch": 0.10258,
+      "grad_norm": 0.6054723262786865,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 10258
+    },
+    {
+      "epoch": 0.10259,
+      "grad_norm": 0.690203845500946,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 10259
+    },
+    {
+      "epoch": 0.1026,
+      "grad_norm": 0.6764129400253296,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 10260
+    },
+    {
+      "epoch": 0.10261,
+      "grad_norm": 0.6417571902275085,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 10261
+    },
+    {
+      "epoch": 0.10262,
+      "grad_norm": 0.6889247894287109,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 10262
+    },
+    {
+      "epoch": 0.10263,
+      "grad_norm": 0.8735169172286987,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 10263
+    },
+    {
+      "epoch": 0.10264,
+      "grad_norm": 1.0683711767196655,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 10264
+    },
+    {
+      "epoch": 0.10265,
+      "grad_norm": 1.0963785648345947,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 10265
+    },
+    {
+      "epoch": 0.10266,
+      "grad_norm": 0.9746417999267578,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 10266
+    },
+    {
+      "epoch": 0.10267,
+      "grad_norm": 0.9337226152420044,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 10267
+    },
+    {
+      "epoch": 0.10268,
+      "grad_norm": 0.8803682327270508,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 10268
+    },
+    {
+      "epoch": 0.10269,
+      "grad_norm": 0.8062136769294739,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 10269
+    },
+    {
+      "epoch": 0.1027,
+      "grad_norm": 0.6083574891090393,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 10270
+    },
+    {
+      "epoch": 0.10271,
+      "grad_norm": 0.5812376141548157,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 10271
+    },
+    {
+      "epoch": 0.10272,
+      "grad_norm": 0.5521930456161499,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 10272
+    },
+    {
+      "epoch": 0.10273,
+      "grad_norm": 0.528708815574646,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 10273
+    },
+    {
+      "epoch": 0.10274,
+      "grad_norm": 0.5105339288711548,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 10274
+    },
+    {
+      "epoch": 0.10275,
+      "grad_norm": 0.465678870677948,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 10275
+    },
+    {
+      "epoch": 0.10276,
+      "grad_norm": 0.49017584323883057,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 10276
+    },
+    {
+      "epoch": 0.10277,
+      "grad_norm": 0.5855206251144409,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 10277
+    },
+    {
+      "epoch": 0.10278,
+      "grad_norm": 0.5988587141036987,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 10278
+    },
+    {
+      "epoch": 0.10279,
+      "grad_norm": 0.625211238861084,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 10279
+    },
+    {
+      "epoch": 0.1028,
+      "grad_norm": 0.803665041923523,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 10280
+    },
+    {
+      "epoch": 0.10281,
+      "grad_norm": 0.9801792502403259,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 10281
+    },
+    {
+      "epoch": 0.10282,
+      "grad_norm": 1.018873691558838,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 10282
+    },
+    {
+      "epoch": 0.10283,
+      "grad_norm": 0.8730368614196777,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 10283
+    },
+    {
+      "epoch": 0.10284,
+      "grad_norm": 0.7729176878929138,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 10284
+    },
+    {
+      "epoch": 0.10285,
+      "grad_norm": 0.7743112444877625,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 10285
+    },
+    {
+      "epoch": 0.10286,
+      "grad_norm": 0.7332449555397034,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 10286
+    },
+    {
+      "epoch": 0.10287,
+      "grad_norm": 0.9000879526138306,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 10287
+    },
+    {
+      "epoch": 0.10288,
+      "grad_norm": 0.9760184288024902,
+      "learning_rate": 0.003,
+      "loss": 4.0675,
+      "step": 10288
+    },
+    {
+      "epoch": 0.10289,
+      "grad_norm": 1.027477741241455,
+      "learning_rate": 0.003,
+      "loss": 4.0572,
+      "step": 10289
+    },
+    {
+      "epoch": 0.1029,
+      "grad_norm": 0.8924638032913208,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 10290
+    },
+    {
+      "epoch": 0.10291,
+      "grad_norm": 0.7968407869338989,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10291
+    },
+    {
+      "epoch": 0.10292,
+      "grad_norm": 0.8795375227928162,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 10292
+    },
+    {
+      "epoch": 0.10293,
+      "grad_norm": 0.9991517663002014,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 10293
+    },
+    {
+      "epoch": 0.10294,
+      "grad_norm": 0.9769951701164246,
+      "learning_rate": 0.003,
+      "loss": 4.0687,
+      "step": 10294
+    },
+    {
+      "epoch": 0.10295,
+      "grad_norm": 0.9654301404953003,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 10295
+    },
+    {
+      "epoch": 0.10296,
+      "grad_norm": 1.0473653078079224,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 10296
+    },
+    {
+      "epoch": 0.10297,
+      "grad_norm": 0.9287710785865784,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 10297
+    },
+    {
+      "epoch": 0.10298,
+      "grad_norm": 0.8924084305763245,
+      "learning_rate": 0.003,
+      "loss": 4.0785,
+      "step": 10298
+    },
+    {
+      "epoch": 0.10299,
+      "grad_norm": 0.9555318355560303,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 10299
+    },
+    {
+      "epoch": 0.103,
+      "grad_norm": 1.0484460592269897,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 10300
+    },
+    {
+      "epoch": 0.10301,
+      "grad_norm": 0.8988922238349915,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 10301
+    },
+    {
+      "epoch": 0.10302,
+      "grad_norm": 0.7982579469680786,
+      "learning_rate": 0.003,
+      "loss": 4.0639,
+      "step": 10302
+    },
+    {
+      "epoch": 0.10303,
+      "grad_norm": 0.8110583424568176,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 10303
+    },
+    {
+      "epoch": 0.10304,
+      "grad_norm": 0.8419620394706726,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 10304
+    },
+    {
+      "epoch": 0.10305,
+      "grad_norm": 0.9462392926216125,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 10305
+    },
+    {
+      "epoch": 0.10306,
+      "grad_norm": 1.0094115734100342,
+      "learning_rate": 0.003,
+      "loss": 4.0623,
+      "step": 10306
+    },
+    {
+      "epoch": 0.10307,
+      "grad_norm": 0.9687174558639526,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 10307
+    },
+    {
+      "epoch": 0.10308,
+      "grad_norm": 0.9006624221801758,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 10308
+    },
+    {
+      "epoch": 0.10309,
+      "grad_norm": 0.8222994208335876,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 10309
+    },
+    {
+      "epoch": 0.1031,
+      "grad_norm": 0.8331215381622314,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 10310
+    },
+    {
+      "epoch": 0.10311,
+      "grad_norm": 0.6884284019470215,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 10311
+    },
+    {
+      "epoch": 0.10312,
+      "grad_norm": 0.6938837766647339,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 10312
+    },
+    {
+      "epoch": 0.10313,
+      "grad_norm": 0.659064769744873,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 10313
+    },
+    {
+      "epoch": 0.10314,
+      "grad_norm": 0.6628589630126953,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 10314
+    },
+    {
+      "epoch": 0.10315,
+      "grad_norm": 0.7294396758079529,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 10315
+    },
+    {
+      "epoch": 0.10316,
+      "grad_norm": 0.6614591479301453,
+      "learning_rate": 0.003,
+      "loss": 4.0657,
+      "step": 10316
+    },
+    {
+      "epoch": 0.10317,
+      "grad_norm": 0.684831440448761,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 10317
+    },
+    {
+      "epoch": 0.10318,
+      "grad_norm": 0.7045228481292725,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 10318
+    },
+    {
+      "epoch": 0.10319,
+      "grad_norm": 0.7830377817153931,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 10319
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.9162354469299316,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 10320
+    },
+    {
+      "epoch": 0.10321,
+      "grad_norm": 0.9490805864334106,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 10321
+    },
+    {
+      "epoch": 0.10322,
+      "grad_norm": 0.6477469205856323,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 10322
+    },
+    {
+      "epoch": 0.10323,
+      "grad_norm": 0.5870311856269836,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 10323
+    },
+    {
+      "epoch": 0.10324,
+      "grad_norm": 0.5911048054695129,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 10324
+    },
+    {
+      "epoch": 0.10325,
+      "grad_norm": 0.6378191709518433,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10325
+    },
+    {
+      "epoch": 0.10326,
+      "grad_norm": 0.6543301343917847,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 10326
+    },
+    {
+      "epoch": 0.10327,
+      "grad_norm": 0.6948620080947876,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 10327
+    },
+    {
+      "epoch": 0.10328,
+      "grad_norm": 0.7429428100585938,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 10328
+    },
+    {
+      "epoch": 0.10329,
+      "grad_norm": 0.7913004159927368,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 10329
+    },
+    {
+      "epoch": 0.1033,
+      "grad_norm": 0.8257701992988586,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 10330
+    },
+    {
+      "epoch": 0.10331,
+      "grad_norm": 0.8367582559585571,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 10331
+    },
+    {
+      "epoch": 0.10332,
+      "grad_norm": 0.7799370884895325,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 10332
+    },
+    {
+      "epoch": 0.10333,
+      "grad_norm": 0.7020469307899475,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 10333
+    },
+    {
+      "epoch": 0.10334,
+      "grad_norm": 0.6871790289878845,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 10334
+    },
+    {
+      "epoch": 0.10335,
+      "grad_norm": 0.583629846572876,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 10335
+    },
+    {
+      "epoch": 0.10336,
+      "grad_norm": 0.6028733849525452,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 10336
+    },
+    {
+      "epoch": 0.10337,
+      "grad_norm": 0.7084194421768188,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 10337
+    },
+    {
+      "epoch": 0.10338,
+      "grad_norm": 0.7569435834884644,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 10338
+    },
+    {
+      "epoch": 0.10339,
+      "grad_norm": 0.6454418301582336,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 10339
+    },
+    {
+      "epoch": 0.1034,
+      "grad_norm": 0.6388258337974548,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 10340
+    },
+    {
+      "epoch": 0.10341,
+      "grad_norm": 0.6123440861701965,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 10341
+    },
+    {
+      "epoch": 0.10342,
+      "grad_norm": 0.600523054599762,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 10342
+    },
+    {
+      "epoch": 0.10343,
+      "grad_norm": 0.7250362634658813,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 10343
+    },
+    {
+      "epoch": 0.10344,
+      "grad_norm": 0.8328801393508911,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 10344
+    },
+    {
+      "epoch": 0.10345,
+      "grad_norm": 0.9125838875770569,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 10345
+    },
+    {
+      "epoch": 0.10346,
+      "grad_norm": 0.9645897746086121,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 10346
+    },
+    {
+      "epoch": 0.10347,
+      "grad_norm": 0.9420467019081116,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 10347
+    },
+    {
+      "epoch": 0.10348,
+      "grad_norm": 0.9969478249549866,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 10348
+    },
+    {
+      "epoch": 0.10349,
+      "grad_norm": 1.1243056058883667,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 10349
+    },
+    {
+      "epoch": 0.1035,
+      "grad_norm": 1.056504726409912,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 10350
+    },
+    {
+      "epoch": 0.10351,
+      "grad_norm": 1.0015921592712402,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 10351
+    },
+    {
+      "epoch": 0.10352,
+      "grad_norm": 1.054189682006836,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 10352
+    },
+    {
+      "epoch": 0.10353,
+      "grad_norm": 0.9281718730926514,
+      "learning_rate": 0.003,
+      "loss": 4.0732,
+      "step": 10353
+    },
+    {
+      "epoch": 0.10354,
+      "grad_norm": 0.8637033700942993,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 10354
+    },
+    {
+      "epoch": 0.10355,
+      "grad_norm": 0.6888679265975952,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 10355
+    },
+    {
+      "epoch": 0.10356,
+      "grad_norm": 0.5851486921310425,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 10356
+    },
+    {
+      "epoch": 0.10357,
+      "grad_norm": 0.5784017443656921,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 10357
+    },
+    {
+      "epoch": 0.10358,
+      "grad_norm": 0.569531261920929,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 10358
+    },
+    {
+      "epoch": 0.10359,
+      "grad_norm": 0.5688461065292358,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 10359
+    },
+    {
+      "epoch": 0.1036,
+      "grad_norm": 0.6449005007743835,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 10360
+    },
+    {
+      "epoch": 0.10361,
+      "grad_norm": 0.7200685739517212,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 10361
+    },
+    {
+      "epoch": 0.10362,
+      "grad_norm": 0.7981273531913757,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 10362
+    },
+    {
+      "epoch": 0.10363,
+      "grad_norm": 0.8022664785385132,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 10363
+    },
+    {
+      "epoch": 0.10364,
+      "grad_norm": 0.8001551628112793,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 10364
+    },
+    {
+      "epoch": 0.10365,
+      "grad_norm": 0.8692436814308167,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 10365
+    },
+    {
+      "epoch": 0.10366,
+      "grad_norm": 0.8658043146133423,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 10366
+    },
+    {
+      "epoch": 0.10367,
+      "grad_norm": 0.8140656352043152,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 10367
+    },
+    {
+      "epoch": 0.10368,
+      "grad_norm": 0.9589678049087524,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 10368
+    },
+    {
+      "epoch": 0.10369,
+      "grad_norm": 0.9657092094421387,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 10369
+    },
+    {
+      "epoch": 0.1037,
+      "grad_norm": 0.9821603298187256,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 10370
+    },
+    {
+      "epoch": 0.10371,
+      "grad_norm": 1.0179591178894043,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 10371
+    },
+    {
+      "epoch": 0.10372,
+      "grad_norm": 1.0082340240478516,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 10372
+    },
+    {
+      "epoch": 0.10373,
+      "grad_norm": 1.0298233032226562,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 10373
+    },
+    {
+      "epoch": 0.10374,
+      "grad_norm": 0.8384741544723511,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 10374
+    },
+    {
+      "epoch": 0.10375,
+      "grad_norm": 0.7988961338996887,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 10375
+    },
+    {
+      "epoch": 0.10376,
+      "grad_norm": 0.9472295641899109,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 10376
+    },
+    {
+      "epoch": 0.10377,
+      "grad_norm": 1.0347899198532104,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 10377
+    },
+    {
+      "epoch": 0.10378,
+      "grad_norm": 1.1437108516693115,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 10378
+    },
+    {
+      "epoch": 0.10379,
+      "grad_norm": 0.7097597718238831,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 10379
+    },
+    {
+      "epoch": 0.1038,
+      "grad_norm": 0.6190682649612427,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 10380
+    },
+    {
+      "epoch": 0.10381,
+      "grad_norm": 0.6848435401916504,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 10381
+    },
+    {
+      "epoch": 0.10382,
+      "grad_norm": 0.8380157947540283,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 10382
+    },
+    {
+      "epoch": 0.10383,
+      "grad_norm": 0.9357398748397827,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 10383
+    },
+    {
+      "epoch": 0.10384,
+      "grad_norm": 0.8331278562545776,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 10384
+    },
+    {
+      "epoch": 0.10385,
+      "grad_norm": 0.7992599010467529,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10385
+    },
+    {
+      "epoch": 0.10386,
+      "grad_norm": 0.7941585779190063,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 10386
+    },
+    {
+      "epoch": 0.10387,
+      "grad_norm": 0.7473763227462769,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 10387
+    },
+    {
+      "epoch": 0.10388,
+      "grad_norm": 0.6912757158279419,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 10388
+    },
+    {
+      "epoch": 0.10389,
+      "grad_norm": 0.6043686866760254,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 10389
+    },
+    {
+      "epoch": 0.1039,
+      "grad_norm": 0.593113124370575,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 10390
+    },
+    {
+      "epoch": 0.10391,
+      "grad_norm": 0.5677477717399597,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 10391
+    },
+    {
+      "epoch": 0.10392,
+      "grad_norm": 0.5792213082313538,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 10392
+    },
+    {
+      "epoch": 0.10393,
+      "grad_norm": 0.6402269005775452,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 10393
+    },
+    {
+      "epoch": 0.10394,
+      "grad_norm": 0.6701036691665649,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 10394
+    },
+    {
+      "epoch": 0.10395,
+      "grad_norm": 0.7115254402160645,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 10395
+    },
+    {
+      "epoch": 0.10396,
+      "grad_norm": 0.7768694162368774,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 10396
+    },
+    {
+      "epoch": 0.10397,
+      "grad_norm": 0.7983322739601135,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 10397
+    },
+    {
+      "epoch": 0.10398,
+      "grad_norm": 0.8996577262878418,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 10398
+    },
+    {
+      "epoch": 0.10399,
+      "grad_norm": 0.9261710047721863,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 10399
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.8924296498298645,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 10400
+    },
+    {
+      "epoch": 0.10401,
+      "grad_norm": 0.6983252763748169,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 10401
+    },
+    {
+      "epoch": 0.10402,
+      "grad_norm": 0.6650741100311279,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 10402
+    },
+    {
+      "epoch": 0.10403,
+      "grad_norm": 0.7275328636169434,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 10403
+    },
+    {
+      "epoch": 0.10404,
+      "grad_norm": 0.7682779431343079,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 10404
+    },
+    {
+      "epoch": 0.10405,
+      "grad_norm": 0.6787878274917603,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 10405
+    },
+    {
+      "epoch": 0.10406,
+      "grad_norm": 0.8612582087516785,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 10406
+    },
+    {
+      "epoch": 0.10407,
+      "grad_norm": 0.9241533279418945,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 10407
+    },
+    {
+      "epoch": 0.10408,
+      "grad_norm": 0.9687588810920715,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 10408
+    },
+    {
+      "epoch": 0.10409,
+      "grad_norm": 1.0829697847366333,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 10409
+    },
+    {
+      "epoch": 0.1041,
+      "grad_norm": 0.7547574043273926,
+      "learning_rate": 0.003,
+      "loss": 4.0577,
+      "step": 10410
+    },
+    {
+      "epoch": 0.10411,
+      "grad_norm": 0.6516535878181458,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 10411
+    },
+    {
+      "epoch": 0.10412,
+      "grad_norm": 0.732597291469574,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 10412
+    },
+    {
+      "epoch": 0.10413,
+      "grad_norm": 0.6244125366210938,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 10413
+    },
+    {
+      "epoch": 0.10414,
+      "grad_norm": 0.6561394929885864,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 10414
+    },
+    {
+      "epoch": 0.10415,
+      "grad_norm": 0.7480183839797974,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 10415
+    },
+    {
+      "epoch": 0.10416,
+      "grad_norm": 0.7821226716041565,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 10416
+    },
+    {
+      "epoch": 0.10417,
+      "grad_norm": 0.8230247497558594,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 10417
+    },
+    {
+      "epoch": 0.10418,
+      "grad_norm": 0.9134896993637085,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 10418
+    },
+    {
+      "epoch": 0.10419,
+      "grad_norm": 0.976138710975647,
+      "learning_rate": 0.003,
+      "loss": 4.0774,
+      "step": 10419
+    },
+    {
+      "epoch": 0.1042,
+      "grad_norm": 0.9060264229774475,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 10420
+    },
+    {
+      "epoch": 0.10421,
+      "grad_norm": 0.8544201850891113,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 10421
+    },
+    {
+      "epoch": 0.10422,
+      "grad_norm": 0.7737321257591248,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 10422
+    },
+    {
+      "epoch": 0.10423,
+      "grad_norm": 0.7796673774719238,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 10423
+    },
+    {
+      "epoch": 0.10424,
+      "grad_norm": 0.9727042317390442,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 10424
+    },
+    {
+      "epoch": 0.10425,
+      "grad_norm": 1.0700771808624268,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 10425
+    },
+    {
+      "epoch": 0.10426,
+      "grad_norm": 1.0908294916152954,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 10426
+    },
+    {
+      "epoch": 0.10427,
+      "grad_norm": 0.800527811050415,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 10427
+    },
+    {
+      "epoch": 0.10428,
+      "grad_norm": 0.6922642588615417,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 10428
+    },
+    {
+      "epoch": 0.10429,
+      "grad_norm": 0.7603942155838013,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 10429
+    },
+    {
+      "epoch": 0.1043,
+      "grad_norm": 0.9487042427062988,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 10430
+    },
+    {
+      "epoch": 0.10431,
+      "grad_norm": 0.9603556394577026,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 10431
+    },
+    {
+      "epoch": 0.10432,
+      "grad_norm": 0.7391037344932556,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 10432
+    },
+    {
+      "epoch": 0.10433,
+      "grad_norm": 0.5578556060791016,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 10433
+    },
+    {
+      "epoch": 0.10434,
+      "grad_norm": 0.7265412211418152,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 10434
+    },
+    {
+      "epoch": 0.10435,
+      "grad_norm": 0.8804828524589539,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 10435
+    },
+    {
+      "epoch": 0.10436,
+      "grad_norm": 0.7199718952178955,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 10436
+    },
+    {
+      "epoch": 0.10437,
+      "grad_norm": 0.6212375164031982,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 10437
+    },
+    {
+      "epoch": 0.10438,
+      "grad_norm": 0.621557354927063,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 10438
+    },
+    {
+      "epoch": 0.10439,
+      "grad_norm": 0.6311930418014526,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 10439
+    },
+    {
+      "epoch": 0.1044,
+      "grad_norm": 0.6799618601799011,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 10440
+    },
+    {
+      "epoch": 0.10441,
+      "grad_norm": 0.6392710208892822,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 10441
+    },
+    {
+      "epoch": 0.10442,
+      "grad_norm": 0.5309514403343201,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 10442
+    },
+    {
+      "epoch": 0.10443,
+      "grad_norm": 0.46626293659210205,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 10443
+    },
+    {
+      "epoch": 0.10444,
+      "grad_norm": 0.5864386558532715,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 10444
+    },
+    {
+      "epoch": 0.10445,
+      "grad_norm": 0.6950222849845886,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 10445
+    },
+    {
+      "epoch": 0.10446,
+      "grad_norm": 0.7721315622329712,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 10446
+    },
+    {
+      "epoch": 0.10447,
+      "grad_norm": 0.8094618320465088,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 10447
+    },
+    {
+      "epoch": 0.10448,
+      "grad_norm": 0.7723525762557983,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 10448
+    },
+    {
+      "epoch": 0.10449,
+      "grad_norm": 0.6535624265670776,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 10449
+    },
+    {
+      "epoch": 0.1045,
+      "grad_norm": 0.615446925163269,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 10450
+    },
+    {
+      "epoch": 0.10451,
+      "grad_norm": 0.8183608055114746,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 10451
+    },
+    {
+      "epoch": 0.10452,
+      "grad_norm": 0.9758654832839966,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 10452
+    },
+    {
+      "epoch": 0.10453,
+      "grad_norm": 1.1344112157821655,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 10453
+    },
+    {
+      "epoch": 0.10454,
+      "grad_norm": 0.7159709930419922,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 10454
+    },
+    {
+      "epoch": 0.10455,
+      "grad_norm": 0.6631528735160828,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 10455
+    },
+    {
+      "epoch": 0.10456,
+      "grad_norm": 0.747515857219696,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10456
+    },
+    {
+      "epoch": 0.10457,
+      "grad_norm": 0.7781107425689697,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 10457
+    },
+    {
+      "epoch": 0.10458,
+      "grad_norm": 0.9523149728775024,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 10458
+    },
+    {
+      "epoch": 0.10459,
+      "grad_norm": 1.1178628206253052,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 10459
+    },
+    {
+      "epoch": 0.1046,
+      "grad_norm": 0.6914238929748535,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 10460
+    },
+    {
+      "epoch": 0.10461,
+      "grad_norm": 0.611783504486084,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 10461
+    },
+    {
+      "epoch": 0.10462,
+      "grad_norm": 0.7709699869155884,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 10462
+    },
+    {
+      "epoch": 0.10463,
+      "grad_norm": 0.6816902160644531,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 10463
+    },
+    {
+      "epoch": 0.10464,
+      "grad_norm": 0.6006267070770264,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 10464
+    },
+    {
+      "epoch": 0.10465,
+      "grad_norm": 0.5709377527236938,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 10465
+    },
+    {
+      "epoch": 0.10466,
+      "grad_norm": 0.5002264976501465,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 10466
+    },
+    {
+      "epoch": 0.10467,
+      "grad_norm": 0.5063186287879944,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 10467
+    },
+    {
+      "epoch": 0.10468,
+      "grad_norm": 0.594136118888855,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 10468
+    },
+    {
+      "epoch": 0.10469,
+      "grad_norm": 0.5744364261627197,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 10469
+    },
+    {
+      "epoch": 0.1047,
+      "grad_norm": 0.6634460687637329,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 10470
+    },
+    {
+      "epoch": 0.10471,
+      "grad_norm": 0.7383639812469482,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 10471
+    },
+    {
+      "epoch": 0.10472,
+      "grad_norm": 0.9079957604408264,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 10472
+    },
+    {
+      "epoch": 0.10473,
+      "grad_norm": 1.1386297941207886,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 10473
+    },
+    {
+      "epoch": 0.10474,
+      "grad_norm": 0.7866982817649841,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 10474
+    },
+    {
+      "epoch": 0.10475,
+      "grad_norm": 0.7195823192596436,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 10475
+    },
+    {
+      "epoch": 0.10476,
+      "grad_norm": 0.7635900974273682,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 10476
+    },
+    {
+      "epoch": 0.10477,
+      "grad_norm": 0.7521525621414185,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 10477
+    },
+    {
+      "epoch": 0.10478,
+      "grad_norm": 0.8314422369003296,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 10478
+    },
+    {
+      "epoch": 0.10479,
+      "grad_norm": 0.9330824613571167,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 10479
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.9943487644195557,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 10480
+    },
+    {
+      "epoch": 0.10481,
+      "grad_norm": 1.000788927078247,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 10481
+    },
+    {
+      "epoch": 0.10482,
+      "grad_norm": 0.9992671608924866,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 10482
+    },
+    {
+      "epoch": 0.10483,
+      "grad_norm": 0.9347127676010132,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 10483
+    },
+    {
+      "epoch": 0.10484,
+      "grad_norm": 0.9052039384841919,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 10484
+    },
+    {
+      "epoch": 0.10485,
+      "grad_norm": 0.9239107966423035,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 10485
+    },
+    {
+      "epoch": 0.10486,
+      "grad_norm": 0.996957004070282,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 10486
+    },
+    {
+      "epoch": 0.10487,
+      "grad_norm": 0.9323636889457703,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 10487
+    },
+    {
+      "epoch": 0.10488,
+      "grad_norm": 0.9457101821899414,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 10488
+    },
+    {
+      "epoch": 0.10489,
+      "grad_norm": 0.8397849202156067,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 10489
+    },
+    {
+      "epoch": 0.1049,
+      "grad_norm": 0.7614753842353821,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 10490
+    },
+    {
+      "epoch": 0.10491,
+      "grad_norm": 0.7990318536758423,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 10491
+    },
+    {
+      "epoch": 0.10492,
+      "grad_norm": 0.7685605883598328,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 10492
+    },
+    {
+      "epoch": 0.10493,
+      "grad_norm": 0.8393089771270752,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 10493
+    },
+    {
+      "epoch": 0.10494,
+      "grad_norm": 0.7939583659172058,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 10494
+    },
+    {
+      "epoch": 0.10495,
+      "grad_norm": 0.6594287753105164,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10495
+    },
+    {
+      "epoch": 0.10496,
+      "grad_norm": 0.6255642175674438,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 10496
+    },
+    {
+      "epoch": 0.10497,
+      "grad_norm": 0.5651243925094604,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 10497
+    },
+    {
+      "epoch": 0.10498,
+      "grad_norm": 0.4910907447338104,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 10498
+    },
+    {
+      "epoch": 0.10499,
+      "grad_norm": 0.5701528191566467,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 10499
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.6363165378570557,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 10500
+    },
+    {
+      "epoch": 0.10501,
+      "grad_norm": 0.687296450138092,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 10501
+    },
+    {
+      "epoch": 0.10502,
+      "grad_norm": 0.9270140528678894,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 10502
+    },
+    {
+      "epoch": 0.10503,
+      "grad_norm": 1.2313417196273804,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 10503
+    },
+    {
+      "epoch": 0.10504,
+      "grad_norm": 0.6528260707855225,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 10504
+    },
+    {
+      "epoch": 0.10505,
+      "grad_norm": 0.5283305644989014,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 10505
+    },
+    {
+      "epoch": 0.10506,
+      "grad_norm": 0.5796220898628235,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 10506
+    },
+    {
+      "epoch": 0.10507,
+      "grad_norm": 0.623137891292572,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 10507
+    },
+    {
+      "epoch": 0.10508,
+      "grad_norm": 0.6215649843215942,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 10508
+    },
+    {
+      "epoch": 0.10509,
+      "grad_norm": 0.6104245781898499,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 10509
+    },
+    {
+      "epoch": 0.1051,
+      "grad_norm": 0.6872464418411255,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 10510
+    },
+    {
+      "epoch": 0.10511,
+      "grad_norm": 0.7551093101501465,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 10511
+    },
+    {
+      "epoch": 0.10512,
+      "grad_norm": 0.8498075008392334,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 10512
+    },
+    {
+      "epoch": 0.10513,
+      "grad_norm": 0.832697331905365,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 10513
+    },
+    {
+      "epoch": 0.10514,
+      "grad_norm": 0.7095122337341309,
+      "learning_rate": 0.003,
+      "loss": 3.9761,
+      "step": 10514
+    },
+    {
+      "epoch": 0.10515,
+      "grad_norm": 0.6831356883049011,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 10515
+    },
+    {
+      "epoch": 0.10516,
+      "grad_norm": 0.7304404377937317,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 10516
+    },
+    {
+      "epoch": 0.10517,
+      "grad_norm": 0.7606459856033325,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 10517
+    },
+    {
+      "epoch": 0.10518,
+      "grad_norm": 0.7579407095909119,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 10518
+    },
+    {
+      "epoch": 0.10519,
+      "grad_norm": 0.7492026090621948,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 10519
+    },
+    {
+      "epoch": 0.1052,
+      "grad_norm": 0.7874739766120911,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 10520
+    },
+    {
+      "epoch": 0.10521,
+      "grad_norm": 0.7780115008354187,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 10521
+    },
+    {
+      "epoch": 0.10522,
+      "grad_norm": 0.774425745010376,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 10522
+    },
+    {
+      "epoch": 0.10523,
+      "grad_norm": 0.8482930064201355,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 10523
+    },
+    {
+      "epoch": 0.10524,
+      "grad_norm": 1.0661581754684448,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 10524
+    },
+    {
+      "epoch": 0.10525,
+      "grad_norm": 1.2727166414260864,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 10525
+    },
+    {
+      "epoch": 0.10526,
+      "grad_norm": 0.8416107296943665,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 10526
+    },
+    {
+      "epoch": 0.10527,
+      "grad_norm": 0.7589594721794128,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 10527
+    },
+    {
+      "epoch": 0.10528,
+      "grad_norm": 0.7027614116668701,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 10528
+    },
+    {
+      "epoch": 0.10529,
+      "grad_norm": 0.7037528157234192,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 10529
+    },
+    {
+      "epoch": 0.1053,
+      "grad_norm": 0.6253616213798523,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 10530
+    },
+    {
+      "epoch": 0.10531,
+      "grad_norm": 0.7041178941726685,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 10531
+    },
+    {
+      "epoch": 0.10532,
+      "grad_norm": 0.7070190906524658,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 10532
+    },
+    {
+      "epoch": 0.10533,
+      "grad_norm": 0.8403984308242798,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 10533
+    },
+    {
+      "epoch": 0.10534,
+      "grad_norm": 0.8914600014686584,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 10534
+    },
+    {
+      "epoch": 0.10535,
+      "grad_norm": 0.8591518998146057,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 10535
+    },
+    {
+      "epoch": 0.10536,
+      "grad_norm": 0.8553505539894104,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 10536
+    },
+    {
+      "epoch": 0.10537,
+      "grad_norm": 0.9386612772941589,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 10537
+    },
+    {
+      "epoch": 0.10538,
+      "grad_norm": 1.1708186864852905,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 10538
+    },
+    {
+      "epoch": 0.10539,
+      "grad_norm": 0.7604055404663086,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 10539
+    },
+    {
+      "epoch": 0.1054,
+      "grad_norm": 0.6964765787124634,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 10540
+    },
+    {
+      "epoch": 0.10541,
+      "grad_norm": 0.7417082190513611,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 10541
+    },
+    {
+      "epoch": 0.10542,
+      "grad_norm": 0.783746600151062,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 10542
+    },
+    {
+      "epoch": 0.10543,
+      "grad_norm": 0.8471524119377136,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 10543
+    },
+    {
+      "epoch": 0.10544,
+      "grad_norm": 0.8709812760353088,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 10544
+    },
+    {
+      "epoch": 0.10545,
+      "grad_norm": 0.8089157938957214,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 10545
+    },
+    {
+      "epoch": 0.10546,
+      "grad_norm": 0.8024004101753235,
+      "learning_rate": 0.003,
+      "loss": 4.0621,
+      "step": 10546
+    },
+    {
+      "epoch": 0.10547,
+      "grad_norm": 0.771805465221405,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 10547
+    },
+    {
+      "epoch": 0.10548,
+      "grad_norm": 0.7884362936019897,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 10548
+    },
+    {
+      "epoch": 0.10549,
+      "grad_norm": 0.8107030987739563,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 10549
+    },
+    {
+      "epoch": 0.1055,
+      "grad_norm": 0.7356095314025879,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 10550
+    },
+    {
+      "epoch": 0.10551,
+      "grad_norm": 0.7435442805290222,
+      "learning_rate": 0.003,
+      "loss": 4.0516,
+      "step": 10551
+    },
+    {
+      "epoch": 0.10552,
+      "grad_norm": 0.6796212792396545,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 10552
+    },
+    {
+      "epoch": 0.10553,
+      "grad_norm": 0.6590840816497803,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 10553
+    },
+    {
+      "epoch": 0.10554,
+      "grad_norm": 0.6578130722045898,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 10554
+    },
+    {
+      "epoch": 0.10555,
+      "grad_norm": 0.854569673538208,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 10555
+    },
+    {
+      "epoch": 0.10556,
+      "grad_norm": 1.234406590461731,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 10556
+    },
+    {
+      "epoch": 0.10557,
+      "grad_norm": 0.9004664421081543,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 10557
+    },
+    {
+      "epoch": 0.10558,
+      "grad_norm": 0.8317961692810059,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 10558
+    },
+    {
+      "epoch": 0.10559,
+      "grad_norm": 0.7906206250190735,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 10559
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.7413365244865417,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 10560
+    },
+    {
+      "epoch": 0.10561,
+      "grad_norm": 0.6539586782455444,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 10561
+    },
+    {
+      "epoch": 0.10562,
+      "grad_norm": 0.5383809804916382,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 10562
+    },
+    {
+      "epoch": 0.10563,
+      "grad_norm": 0.529626190662384,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 10563
+    },
+    {
+      "epoch": 0.10564,
+      "grad_norm": 0.557378351688385,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 10564
+    },
+    {
+      "epoch": 0.10565,
+      "grad_norm": 0.5710329413414001,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 10565
+    },
+    {
+      "epoch": 0.10566,
+      "grad_norm": 0.7581164240837097,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 10566
+    },
+    {
+      "epoch": 0.10567,
+      "grad_norm": 0.9188153147697449,
+      "learning_rate": 0.003,
+      "loss": 4.0532,
+      "step": 10567
+    },
+    {
+      "epoch": 0.10568,
+      "grad_norm": 1.2158697843551636,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 10568
+    },
+    {
+      "epoch": 0.10569,
+      "grad_norm": 0.799419105052948,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 10569
+    },
+    {
+      "epoch": 0.1057,
+      "grad_norm": 0.8842980265617371,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 10570
+    },
+    {
+      "epoch": 0.10571,
+      "grad_norm": 1.089672565460205,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 10571
+    },
+    {
+      "epoch": 0.10572,
+      "grad_norm": 0.9158157110214233,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 10572
+    },
+    {
+      "epoch": 0.10573,
+      "grad_norm": 1.0048152208328247,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 10573
+    },
+    {
+      "epoch": 0.10574,
+      "grad_norm": 0.9467569589614868,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 10574
+    },
+    {
+      "epoch": 0.10575,
+      "grad_norm": 0.9308494329452515,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 10575
+    },
+    {
+      "epoch": 0.10576,
+      "grad_norm": 0.8847599029541016,
+      "learning_rate": 0.003,
+      "loss": 4.0627,
+      "step": 10576
+    },
+    {
+      "epoch": 0.10577,
+      "grad_norm": 0.8620772957801819,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 10577
+    },
+    {
+      "epoch": 0.10578,
+      "grad_norm": 0.9054552316665649,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 10578
+    },
+    {
+      "epoch": 0.10579,
+      "grad_norm": 0.9128457903862,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 10579
+    },
+    {
+      "epoch": 0.1058,
+      "grad_norm": 0.734600305557251,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 10580
+    },
+    {
+      "epoch": 0.10581,
+      "grad_norm": 0.7643128037452698,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 10581
+    },
+    {
+      "epoch": 0.10582,
+      "grad_norm": 0.6332104802131653,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 10582
+    },
+    {
+      "epoch": 0.10583,
+      "grad_norm": 0.5396142601966858,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 10583
+    },
+    {
+      "epoch": 0.10584,
+      "grad_norm": 0.5539082884788513,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 10584
+    },
+    {
+      "epoch": 0.10585,
+      "grad_norm": 0.5713188648223877,
+      "learning_rate": 0.003,
+      "loss": 4.0688,
+      "step": 10585
+    },
+    {
+      "epoch": 0.10586,
+      "grad_norm": 0.6109192371368408,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 10586
+    },
+    {
+      "epoch": 0.10587,
+      "grad_norm": 0.7975706458091736,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 10587
+    },
+    {
+      "epoch": 0.10588,
+      "grad_norm": 1.0050781965255737,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10588
+    },
+    {
+      "epoch": 0.10589,
+      "grad_norm": 1.3392750024795532,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 10589
+    },
+    {
+      "epoch": 0.1059,
+      "grad_norm": 0.63167804479599,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 10590
+    },
+    {
+      "epoch": 0.10591,
+      "grad_norm": 0.6843758821487427,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 10591
+    },
+    {
+      "epoch": 0.10592,
+      "grad_norm": 0.7719295620918274,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 10592
+    },
+    {
+      "epoch": 0.10593,
+      "grad_norm": 0.9197074174880981,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 10593
+    },
+    {
+      "epoch": 0.10594,
+      "grad_norm": 1.031716227531433,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 10594
+    },
+    {
+      "epoch": 0.10595,
+      "grad_norm": 0.7711551785469055,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 10595
+    },
+    {
+      "epoch": 0.10596,
+      "grad_norm": 0.6337047815322876,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 10596
+    },
+    {
+      "epoch": 0.10597,
+      "grad_norm": 0.5785505175590515,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 10597
+    },
+    {
+      "epoch": 0.10598,
+      "grad_norm": 0.5775535702705383,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 10598
+    },
+    {
+      "epoch": 0.10599,
+      "grad_norm": 0.5415910482406616,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 10599
+    },
+    {
+      "epoch": 0.106,
+      "grad_norm": 0.5284204483032227,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 10600
+    },
+    {
+      "epoch": 0.10601,
+      "grad_norm": 0.485622763633728,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 10601
+    },
+    {
+      "epoch": 0.10602,
+      "grad_norm": 0.5149875283241272,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 10602
+    },
+    {
+      "epoch": 0.10603,
+      "grad_norm": 0.6656335592269897,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 10603
+    },
+    {
+      "epoch": 0.10604,
+      "grad_norm": 0.8385307788848877,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 10604
+    },
+    {
+      "epoch": 0.10605,
+      "grad_norm": 1.0076050758361816,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 10605
+    },
+    {
+      "epoch": 0.10606,
+      "grad_norm": 1.0600577592849731,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 10606
+    },
+    {
+      "epoch": 0.10607,
+      "grad_norm": 0.7597001791000366,
+      "learning_rate": 0.003,
+      "loss": 4.0547,
+      "step": 10607
+    },
+    {
+      "epoch": 0.10608,
+      "grad_norm": 0.6262304186820984,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 10608
+    },
+    {
+      "epoch": 0.10609,
+      "grad_norm": 0.6129266619682312,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 10609
+    },
+    {
+      "epoch": 0.1061,
+      "grad_norm": 0.6446408033370972,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 10610
+    },
+    {
+      "epoch": 0.10611,
+      "grad_norm": 0.5875228047370911,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 10611
+    },
+    {
+      "epoch": 0.10612,
+      "grad_norm": 0.5728915929794312,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 10612
+    },
+    {
+      "epoch": 0.10613,
+      "grad_norm": 0.6378050446510315,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 10613
+    },
+    {
+      "epoch": 0.10614,
+      "grad_norm": 0.6682537198066711,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 10614
+    },
+    {
+      "epoch": 0.10615,
+      "grad_norm": 0.6920739412307739,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 10615
+    },
+    {
+      "epoch": 0.10616,
+      "grad_norm": 0.7054862380027771,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 10616
+    },
+    {
+      "epoch": 0.10617,
+      "grad_norm": 0.8225484490394592,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 10617
+    },
+    {
+      "epoch": 0.10618,
+      "grad_norm": 0.8290918469429016,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 10618
+    },
+    {
+      "epoch": 0.10619,
+      "grad_norm": 0.9801556468009949,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 10619
+    },
+    {
+      "epoch": 0.1062,
+      "grad_norm": 1.2758433818817139,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 10620
+    },
+    {
+      "epoch": 0.10621,
+      "grad_norm": 0.7299548387527466,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 10621
+    },
+    {
+      "epoch": 0.10622,
+      "grad_norm": 0.5834721326828003,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 10622
+    },
+    {
+      "epoch": 0.10623,
+      "grad_norm": 0.5982388257980347,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 10623
+    },
+    {
+      "epoch": 0.10624,
+      "grad_norm": 0.5697194933891296,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 10624
+    },
+    {
+      "epoch": 0.10625,
+      "grad_norm": 0.5805581212043762,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 10625
+    },
+    {
+      "epoch": 0.10626,
+      "grad_norm": 0.6096617579460144,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 10626
+    },
+    {
+      "epoch": 0.10627,
+      "grad_norm": 0.7148505449295044,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 10627
+    },
+    {
+      "epoch": 0.10628,
+      "grad_norm": 0.9222334623336792,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 10628
+    },
+    {
+      "epoch": 0.10629,
+      "grad_norm": 1.1694058179855347,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 10629
+    },
+    {
+      "epoch": 0.1063,
+      "grad_norm": 1.0497586727142334,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 10630
+    },
+    {
+      "epoch": 0.10631,
+      "grad_norm": 0.8866661190986633,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 10631
+    },
+    {
+      "epoch": 0.10632,
+      "grad_norm": 0.7329339981079102,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 10632
+    },
+    {
+      "epoch": 0.10633,
+      "grad_norm": 0.7718518972396851,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 10633
+    },
+    {
+      "epoch": 0.10634,
+      "grad_norm": 0.7611184120178223,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 10634
+    },
+    {
+      "epoch": 0.10635,
+      "grad_norm": 0.7974138855934143,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10635
+    },
+    {
+      "epoch": 0.10636,
+      "grad_norm": 0.6915683150291443,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 10636
+    },
+    {
+      "epoch": 0.10637,
+      "grad_norm": 0.6084825396537781,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 10637
+    },
+    {
+      "epoch": 0.10638,
+      "grad_norm": 0.611084520816803,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 10638
+    },
+    {
+      "epoch": 0.10639,
+      "grad_norm": 0.6165266633033752,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 10639
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.6598978042602539,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 10640
+    },
+    {
+      "epoch": 0.10641,
+      "grad_norm": 0.7608402967453003,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 10641
+    },
+    {
+      "epoch": 0.10642,
+      "grad_norm": 0.7880569100379944,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 10642
+    },
+    {
+      "epoch": 0.10643,
+      "grad_norm": 0.8869519829750061,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 10643
+    },
+    {
+      "epoch": 0.10644,
+      "grad_norm": 1.1587555408477783,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 10644
+    },
+    {
+      "epoch": 0.10645,
+      "grad_norm": 0.9315388202667236,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 10645
+    },
+    {
+      "epoch": 0.10646,
+      "grad_norm": 0.7919732928276062,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 10646
+    },
+    {
+      "epoch": 0.10647,
+      "grad_norm": 0.812877357006073,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 10647
+    },
+    {
+      "epoch": 0.10648,
+      "grad_norm": 0.9086081981658936,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 10648
+    },
+    {
+      "epoch": 0.10649,
+      "grad_norm": 0.9742897152900696,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 10649
+    },
+    {
+      "epoch": 0.1065,
+      "grad_norm": 0.9627885818481445,
+      "learning_rate": 0.003,
+      "loss": 4.0735,
+      "step": 10650
+    },
+    {
+      "epoch": 0.10651,
+      "grad_norm": 0.99714595079422,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 10651
+    },
+    {
+      "epoch": 0.10652,
+      "grad_norm": 0.9961947202682495,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 10652
+    },
+    {
+      "epoch": 0.10653,
+      "grad_norm": 0.9269364476203918,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 10653
+    },
+    {
+      "epoch": 0.10654,
+      "grad_norm": 0.8528988361358643,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 10654
+    },
+    {
+      "epoch": 0.10655,
+      "grad_norm": 0.9138254523277283,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 10655
+    },
+    {
+      "epoch": 0.10656,
+      "grad_norm": 1.1768444776535034,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 10656
+    },
+    {
+      "epoch": 0.10657,
+      "grad_norm": 0.8890489935874939,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10657
+    },
+    {
+      "epoch": 0.10658,
+      "grad_norm": 0.961041271686554,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 10658
+    },
+    {
+      "epoch": 0.10659,
+      "grad_norm": 1.1152033805847168,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 10659
+    },
+    {
+      "epoch": 0.1066,
+      "grad_norm": 0.9879071712493896,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 10660
+    },
+    {
+      "epoch": 0.10661,
+      "grad_norm": 0.9049113392829895,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 10661
+    },
+    {
+      "epoch": 0.10662,
+      "grad_norm": 0.9326181411743164,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10662
+    },
+    {
+      "epoch": 0.10663,
+      "grad_norm": 0.8698120713233948,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 10663
+    },
+    {
+      "epoch": 0.10664,
+      "grad_norm": 0.8739762902259827,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 10664
+    },
+    {
+      "epoch": 0.10665,
+      "grad_norm": 0.9346017241477966,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 10665
+    },
+    {
+      "epoch": 0.10666,
+      "grad_norm": 0.8549253940582275,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 10666
+    },
+    {
+      "epoch": 0.10667,
+      "grad_norm": 0.8709663152694702,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 10667
+    },
+    {
+      "epoch": 0.10668,
+      "grad_norm": 1.018907070159912,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 10668
+    },
+    {
+      "epoch": 0.10669,
+      "grad_norm": 1.186867594718933,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 10669
+    },
+    {
+      "epoch": 0.1067,
+      "grad_norm": 0.7143198251724243,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 10670
+    },
+    {
+      "epoch": 0.10671,
+      "grad_norm": 0.7373873591423035,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 10671
+    },
+    {
+      "epoch": 0.10672,
+      "grad_norm": 0.8877958059310913,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 10672
+    },
+    {
+      "epoch": 0.10673,
+      "grad_norm": 0.8976678848266602,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 10673
+    },
+    {
+      "epoch": 0.10674,
+      "grad_norm": 0.809621274471283,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 10674
+    },
+    {
+      "epoch": 0.10675,
+      "grad_norm": 0.6401908993721008,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 10675
+    },
+    {
+      "epoch": 0.10676,
+      "grad_norm": 0.6899451017379761,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 10676
+    },
+    {
+      "epoch": 0.10677,
+      "grad_norm": 0.47715431451797485,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 10677
+    },
+    {
+      "epoch": 0.10678,
+      "grad_norm": 0.5788038372993469,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 10678
+    },
+    {
+      "epoch": 0.10679,
+      "grad_norm": 0.5781792998313904,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 10679
+    },
+    {
+      "epoch": 0.1068,
+      "grad_norm": 0.5460559725761414,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 10680
+    },
+    {
+      "epoch": 0.10681,
+      "grad_norm": 0.531968891620636,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 10681
+    },
+    {
+      "epoch": 0.10682,
+      "grad_norm": 0.5341914892196655,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 10682
+    },
+    {
+      "epoch": 0.10683,
+      "grad_norm": 0.6349906921386719,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 10683
+    },
+    {
+      "epoch": 0.10684,
+      "grad_norm": 0.6948190927505493,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 10684
+    },
+    {
+      "epoch": 0.10685,
+      "grad_norm": 0.7251952886581421,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 10685
+    },
+    {
+      "epoch": 0.10686,
+      "grad_norm": 0.7874016761779785,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 10686
+    },
+    {
+      "epoch": 0.10687,
+      "grad_norm": 0.8624654412269592,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 10687
+    },
+    {
+      "epoch": 0.10688,
+      "grad_norm": 0.8007801175117493,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 10688
+    },
+    {
+      "epoch": 0.10689,
+      "grad_norm": 0.8008077144622803,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 10689
+    },
+    {
+      "epoch": 0.1069,
+      "grad_norm": 0.7206010818481445,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 10690
+    },
+    {
+      "epoch": 0.10691,
+      "grad_norm": 0.8642178177833557,
+      "learning_rate": 0.003,
+      "loss": 4.0624,
+      "step": 10691
+    },
+    {
+      "epoch": 0.10692,
+      "grad_norm": 1.0821588039398193,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 10692
+    },
+    {
+      "epoch": 0.10693,
+      "grad_norm": 1.2005447149276733,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 10693
+    },
+    {
+      "epoch": 0.10694,
+      "grad_norm": 0.9520975351333618,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 10694
+    },
+    {
+      "epoch": 0.10695,
+      "grad_norm": 0.7902743220329285,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 10695
+    },
+    {
+      "epoch": 0.10696,
+      "grad_norm": 0.6577290892601013,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 10696
+    },
+    {
+      "epoch": 0.10697,
+      "grad_norm": 0.677960991859436,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 10697
+    },
+    {
+      "epoch": 0.10698,
+      "grad_norm": 0.6639751195907593,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 10698
+    },
+    {
+      "epoch": 0.10699,
+      "grad_norm": 0.6529691815376282,
+      "learning_rate": 0.003,
+      "loss": 4.056,
+      "step": 10699
+    },
+    {
+      "epoch": 0.107,
+      "grad_norm": 0.6431300640106201,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 10700
+    },
+    {
+      "epoch": 0.10701,
+      "grad_norm": 0.5310377478599548,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 10701
+    },
+    {
+      "epoch": 0.10702,
+      "grad_norm": 0.6834575533866882,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 10702
+    },
+    {
+      "epoch": 0.10703,
+      "grad_norm": 0.8570904731750488,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 10703
+    },
+    {
+      "epoch": 0.10704,
+      "grad_norm": 0.9088355898857117,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 10704
+    },
+    {
+      "epoch": 0.10705,
+      "grad_norm": 0.9915702939033508,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 10705
+    },
+    {
+      "epoch": 0.10706,
+      "grad_norm": 0.8690432906150818,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 10706
+    },
+    {
+      "epoch": 0.10707,
+      "grad_norm": 0.7811831831932068,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 10707
+    },
+    {
+      "epoch": 0.10708,
+      "grad_norm": 0.7660472989082336,
+      "learning_rate": 0.003,
+      "loss": 4.0414,
+      "step": 10708
+    },
+    {
+      "epoch": 0.10709,
+      "grad_norm": 0.7577682733535767,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 10709
+    },
+    {
+      "epoch": 0.1071,
+      "grad_norm": 0.688403308391571,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 10710
+    },
+    {
+      "epoch": 0.10711,
+      "grad_norm": 0.5480352640151978,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 10711
+    },
+    {
+      "epoch": 0.10712,
+      "grad_norm": 0.6282423734664917,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 10712
+    },
+    {
+      "epoch": 0.10713,
+      "grad_norm": 0.6788033246994019,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 10713
+    },
+    {
+      "epoch": 0.10714,
+      "grad_norm": 0.713354229927063,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 10714
+    },
+    {
+      "epoch": 0.10715,
+      "grad_norm": 0.7679527997970581,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 10715
+    },
+    {
+      "epoch": 0.10716,
+      "grad_norm": 0.9965485334396362,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 10716
+    },
+    {
+      "epoch": 0.10717,
+      "grad_norm": 1.2599714994430542,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 10717
+    },
+    {
+      "epoch": 0.10718,
+      "grad_norm": 0.6802042126655579,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 10718
+    },
+    {
+      "epoch": 0.10719,
+      "grad_norm": 0.5995518565177917,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 10719
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.6362910270690918,
+      "learning_rate": 0.003,
+      "loss": 4.0586,
+      "step": 10720
+    },
+    {
+      "epoch": 0.10721,
+      "grad_norm": 0.6041771173477173,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 10721
+    },
+    {
+      "epoch": 0.10722,
+      "grad_norm": 0.7603263258934021,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 10722
+    },
+    {
+      "epoch": 0.10723,
+      "grad_norm": 0.8332676291465759,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 10723
+    },
+    {
+      "epoch": 0.10724,
+      "grad_norm": 0.9279446601867676,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 10724
+    },
+    {
+      "epoch": 0.10725,
+      "grad_norm": 1.125430703163147,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 10725
+    },
+    {
+      "epoch": 0.10726,
+      "grad_norm": 1.0059179067611694,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 10726
+    },
+    {
+      "epoch": 0.10727,
+      "grad_norm": 1.1123380661010742,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 10727
+    },
+    {
+      "epoch": 0.10728,
+      "grad_norm": 0.9447655081748962,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 10728
+    },
+    {
+      "epoch": 0.10729,
+      "grad_norm": 0.9554311633110046,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 10729
+    },
+    {
+      "epoch": 0.1073,
+      "grad_norm": 1.0092507600784302,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 10730
+    },
+    {
+      "epoch": 0.10731,
+      "grad_norm": 1.0152878761291504,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 10731
+    },
+    {
+      "epoch": 0.10732,
+      "grad_norm": 0.9422562718391418,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 10732
+    },
+    {
+      "epoch": 0.10733,
+      "grad_norm": 0.9458036422729492,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 10733
+    },
+    {
+      "epoch": 0.10734,
+      "grad_norm": 0.9883454442024231,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 10734
+    },
+    {
+      "epoch": 0.10735,
+      "grad_norm": 0.8949477672576904,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 10735
+    },
+    {
+      "epoch": 0.10736,
+      "grad_norm": 0.7392562627792358,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 10736
+    },
+    {
+      "epoch": 0.10737,
+      "grad_norm": 0.7315913438796997,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 10737
+    },
+    {
+      "epoch": 0.10738,
+      "grad_norm": 0.7146841883659363,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 10738
+    },
+    {
+      "epoch": 0.10739,
+      "grad_norm": 0.7426468729972839,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 10739
+    },
+    {
+      "epoch": 0.1074,
+      "grad_norm": 0.5836867690086365,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 10740
+    },
+    {
+      "epoch": 0.10741,
+      "grad_norm": 0.5937772989273071,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 10741
+    },
+    {
+      "epoch": 0.10742,
+      "grad_norm": 0.694475531578064,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 10742
+    },
+    {
+      "epoch": 0.10743,
+      "grad_norm": 0.6357514262199402,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 10743
+    },
+    {
+      "epoch": 0.10744,
+      "grad_norm": 0.5116270184516907,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 10744
+    },
+    {
+      "epoch": 0.10745,
+      "grad_norm": 0.5230345726013184,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 10745
+    },
+    {
+      "epoch": 0.10746,
+      "grad_norm": 0.5541887283325195,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 10746
+    },
+    {
+      "epoch": 0.10747,
+      "grad_norm": 0.5771496891975403,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 10747
+    },
+    {
+      "epoch": 0.10748,
+      "grad_norm": 0.6870571970939636,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 10748
+    },
+    {
+      "epoch": 0.10749,
+      "grad_norm": 0.8241485357284546,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 10749
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.9226048588752747,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 10750
+    },
+    {
+      "epoch": 0.10751,
+      "grad_norm": 0.9833307266235352,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 10751
+    },
+    {
+      "epoch": 0.10752,
+      "grad_norm": 1.0102068185806274,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 10752
+    },
+    {
+      "epoch": 0.10753,
+      "grad_norm": 0.9321718811988831,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 10753
+    },
+    {
+      "epoch": 0.10754,
+      "grad_norm": 0.8044393062591553,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 10754
+    },
+    {
+      "epoch": 0.10755,
+      "grad_norm": 0.8157967925071716,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 10755
+    },
+    {
+      "epoch": 0.10756,
+      "grad_norm": 0.9599362015724182,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 10756
+    },
+    {
+      "epoch": 0.10757,
+      "grad_norm": 0.8057109117507935,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 10757
+    },
+    {
+      "epoch": 0.10758,
+      "grad_norm": 0.84975266456604,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 10758
+    },
+    {
+      "epoch": 0.10759,
+      "grad_norm": 0.7976199388504028,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 10759
+    },
+    {
+      "epoch": 0.1076,
+      "grad_norm": 0.8996418714523315,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 10760
+    },
+    {
+      "epoch": 0.10761,
+      "grad_norm": 0.952308714389801,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 10761
+    },
+    {
+      "epoch": 0.10762,
+      "grad_norm": 1.00287663936615,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 10762
+    },
+    {
+      "epoch": 0.10763,
+      "grad_norm": 0.9806409478187561,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 10763
+    },
+    {
+      "epoch": 0.10764,
+      "grad_norm": 0.7794227004051208,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 10764
+    },
+    {
+      "epoch": 0.10765,
+      "grad_norm": 0.7748934030532837,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 10765
+    },
+    {
+      "epoch": 0.10766,
+      "grad_norm": 0.8416560888290405,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 10766
+    },
+    {
+      "epoch": 0.10767,
+      "grad_norm": 0.8533070087432861,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 10767
+    },
+    {
+      "epoch": 0.10768,
+      "grad_norm": 0.8723945617675781,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 10768
+    },
+    {
+      "epoch": 0.10769,
+      "grad_norm": 0.8659939765930176,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 10769
+    },
+    {
+      "epoch": 0.1077,
+      "grad_norm": 0.8741641640663147,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 10770
+    },
+    {
+      "epoch": 0.10771,
+      "grad_norm": 0.8021601438522339,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 10771
+    },
+    {
+      "epoch": 0.10772,
+      "grad_norm": 0.6761582493782043,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 10772
+    },
+    {
+      "epoch": 0.10773,
+      "grad_norm": 0.584344744682312,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 10773
+    },
+    {
+      "epoch": 0.10774,
+      "grad_norm": 0.6257133483886719,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 10774
+    },
+    {
+      "epoch": 0.10775,
+      "grad_norm": 0.6287021040916443,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10775
+    },
+    {
+      "epoch": 0.10776,
+      "grad_norm": 0.6459553837776184,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 10776
+    },
+    {
+      "epoch": 0.10777,
+      "grad_norm": 0.6836524605751038,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 10777
+    },
+    {
+      "epoch": 0.10778,
+      "grad_norm": 0.7361724972724915,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 10778
+    },
+    {
+      "epoch": 0.10779,
+      "grad_norm": 0.8304905295372009,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 10779
+    },
+    {
+      "epoch": 0.1078,
+      "grad_norm": 0.856549084186554,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 10780
+    },
+    {
+      "epoch": 0.10781,
+      "grad_norm": 0.942664623260498,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 10781
+    },
+    {
+      "epoch": 0.10782,
+      "grad_norm": 0.9167718887329102,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 10782
+    },
+    {
+      "epoch": 0.10783,
+      "grad_norm": 0.7978794574737549,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 10783
+    },
+    {
+      "epoch": 0.10784,
+      "grad_norm": 0.7039570212364197,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 10784
+    },
+    {
+      "epoch": 0.10785,
+      "grad_norm": 0.6894760727882385,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 10785
+    },
+    {
+      "epoch": 0.10786,
+      "grad_norm": 0.7555667757987976,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 10786
+    },
+    {
+      "epoch": 0.10787,
+      "grad_norm": 0.7514839768409729,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 10787
+    },
+    {
+      "epoch": 0.10788,
+      "grad_norm": 0.6158230900764465,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 10788
+    },
+    {
+      "epoch": 0.10789,
+      "grad_norm": 0.5220729112625122,
+      "learning_rate": 0.003,
+      "loss": 3.9715,
+      "step": 10789
+    },
+    {
+      "epoch": 0.1079,
+      "grad_norm": 0.4973139762878418,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 10790
+    },
+    {
+      "epoch": 0.10791,
+      "grad_norm": 0.5057622790336609,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 10791
+    },
+    {
+      "epoch": 0.10792,
+      "grad_norm": 0.459989458322525,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 10792
+    },
+    {
+      "epoch": 0.10793,
+      "grad_norm": 0.46567296981811523,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 10793
+    },
+    {
+      "epoch": 0.10794,
+      "grad_norm": 0.5016858577728271,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 10794
+    },
+    {
+      "epoch": 0.10795,
+      "grad_norm": 0.5655898451805115,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 10795
+    },
+    {
+      "epoch": 0.10796,
+      "grad_norm": 0.7223862409591675,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 10796
+    },
+    {
+      "epoch": 0.10797,
+      "grad_norm": 0.9427323937416077,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 10797
+    },
+    {
+      "epoch": 0.10798,
+      "grad_norm": 1.170189619064331,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 10798
+    },
+    {
+      "epoch": 0.10799,
+      "grad_norm": 0.9903934597969055,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 10799
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.9310336709022522,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 10800
+    },
+    {
+      "epoch": 0.10801,
+      "grad_norm": 0.8072385787963867,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 10801
+    },
+    {
+      "epoch": 0.10802,
+      "grad_norm": 0.802480936050415,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 10802
+    },
+    {
+      "epoch": 0.10803,
+      "grad_norm": 0.8250213861465454,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 10803
+    },
+    {
+      "epoch": 0.10804,
+      "grad_norm": 0.9733414649963379,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 10804
+    },
+    {
+      "epoch": 0.10805,
+      "grad_norm": 0.8959097266197205,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 10805
+    },
+    {
+      "epoch": 0.10806,
+      "grad_norm": 0.8214556574821472,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 10806
+    },
+    {
+      "epoch": 0.10807,
+      "grad_norm": 0.8318617343902588,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 10807
+    },
+    {
+      "epoch": 0.10808,
+      "grad_norm": 0.8079638481140137,
+      "learning_rate": 0.003,
+      "loss": 4.1108,
+      "step": 10808
+    },
+    {
+      "epoch": 0.10809,
+      "grad_norm": 0.7618487477302551,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 10809
+    },
+    {
+      "epoch": 0.1081,
+      "grad_norm": 0.8443053960800171,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 10810
+    },
+    {
+      "epoch": 0.10811,
+      "grad_norm": 0.8693640232086182,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 10811
+    },
+    {
+      "epoch": 0.10812,
+      "grad_norm": 0.802323579788208,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 10812
+    },
+    {
+      "epoch": 0.10813,
+      "grad_norm": 0.8824979662895203,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 10813
+    },
+    {
+      "epoch": 0.10814,
+      "grad_norm": 1.0556713342666626,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 10814
+    },
+    {
+      "epoch": 0.10815,
+      "grad_norm": 1.0019479990005493,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 10815
+    },
+    {
+      "epoch": 0.10816,
+      "grad_norm": 1.1382358074188232,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 10816
+    },
+    {
+      "epoch": 0.10817,
+      "grad_norm": 0.7958524227142334,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 10817
+    },
+    {
+      "epoch": 0.10818,
+      "grad_norm": 0.6067748665809631,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 10818
+    },
+    {
+      "epoch": 0.10819,
+      "grad_norm": 0.6149799227714539,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 10819
+    },
+    {
+      "epoch": 0.1082,
+      "grad_norm": 0.6444442272186279,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 10820
+    },
+    {
+      "epoch": 0.10821,
+      "grad_norm": 0.7835311889648438,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 10821
+    },
+    {
+      "epoch": 0.10822,
+      "grad_norm": 0.9068350195884705,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 10822
+    },
+    {
+      "epoch": 0.10823,
+      "grad_norm": 0.9789934754371643,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 10823
+    },
+    {
+      "epoch": 0.10824,
+      "grad_norm": 0.9751971364021301,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 10824
+    },
+    {
+      "epoch": 0.10825,
+      "grad_norm": 0.7828039526939392,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 10825
+    },
+    {
+      "epoch": 0.10826,
+      "grad_norm": 0.6331886649131775,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 10826
+    },
+    {
+      "epoch": 0.10827,
+      "grad_norm": 0.6150903105735779,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 10827
+    },
+    {
+      "epoch": 0.10828,
+      "grad_norm": 0.5984993577003479,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 10828
+    },
+    {
+      "epoch": 0.10829,
+      "grad_norm": 0.7864992022514343,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 10829
+    },
+    {
+      "epoch": 0.1083,
+      "grad_norm": 0.9425749778747559,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 10830
+    },
+    {
+      "epoch": 0.10831,
+      "grad_norm": 1.058080792427063,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 10831
+    },
+    {
+      "epoch": 0.10832,
+      "grad_norm": 0.7845868468284607,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 10832
+    },
+    {
+      "epoch": 0.10833,
+      "grad_norm": 0.6163697242736816,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 10833
+    },
+    {
+      "epoch": 0.10834,
+      "grad_norm": 0.6821464896202087,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 10834
+    },
+    {
+      "epoch": 0.10835,
+      "grad_norm": 0.6944262981414795,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 10835
+    },
+    {
+      "epoch": 0.10836,
+      "grad_norm": 0.6780229806900024,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 10836
+    },
+    {
+      "epoch": 0.10837,
+      "grad_norm": 0.703480064868927,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 10837
+    },
+    {
+      "epoch": 0.10838,
+      "grad_norm": 0.8821530938148499,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 10838
+    },
+    {
+      "epoch": 0.10839,
+      "grad_norm": 1.2261230945587158,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 10839
+    },
+    {
+      "epoch": 0.1084,
+      "grad_norm": 0.6746938228607178,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 10840
+    },
+    {
+      "epoch": 0.10841,
+      "grad_norm": 0.5455398559570312,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 10841
+    },
+    {
+      "epoch": 0.10842,
+      "grad_norm": 0.5697435736656189,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 10842
+    },
+    {
+      "epoch": 0.10843,
+      "grad_norm": 0.557073175907135,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 10843
+    },
+    {
+      "epoch": 0.10844,
+      "grad_norm": 0.5535928606987,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 10844
+    },
+    {
+      "epoch": 0.10845,
+      "grad_norm": 0.6609616875648499,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 10845
+    },
+    {
+      "epoch": 0.10846,
+      "grad_norm": 0.7913950085639954,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 10846
+    },
+    {
+      "epoch": 0.10847,
+      "grad_norm": 0.8395732045173645,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 10847
+    },
+    {
+      "epoch": 0.10848,
+      "grad_norm": 0.7630530595779419,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 10848
+    },
+    {
+      "epoch": 0.10849,
+      "grad_norm": 0.7382761836051941,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 10849
+    },
+    {
+      "epoch": 0.1085,
+      "grad_norm": 0.7676743865013123,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 10850
+    },
+    {
+      "epoch": 0.10851,
+      "grad_norm": 0.7738227248191833,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 10851
+    },
+    {
+      "epoch": 0.10852,
+      "grad_norm": 0.727118968963623,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 10852
+    },
+    {
+      "epoch": 0.10853,
+      "grad_norm": 0.5983321070671082,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 10853
+    },
+    {
+      "epoch": 0.10854,
+      "grad_norm": 0.5814149379730225,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 10854
+    },
+    {
+      "epoch": 0.10855,
+      "grad_norm": 0.7128687500953674,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 10855
+    },
+    {
+      "epoch": 0.10856,
+      "grad_norm": 0.8423973321914673,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 10856
+    },
+    {
+      "epoch": 0.10857,
+      "grad_norm": 1.0030944347381592,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 10857
+    },
+    {
+      "epoch": 0.10858,
+      "grad_norm": 1.2414257526397705,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 10858
+    },
+    {
+      "epoch": 0.10859,
+      "grad_norm": 0.8019863963127136,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 10859
+    },
+    {
+      "epoch": 0.1086,
+      "grad_norm": 0.7122812867164612,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 10860
+    },
+    {
+      "epoch": 0.10861,
+      "grad_norm": 0.6793990135192871,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 10861
+    },
+    {
+      "epoch": 0.10862,
+      "grad_norm": 0.7667950987815857,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 10862
+    },
+    {
+      "epoch": 0.10863,
+      "grad_norm": 0.7574916481971741,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 10863
+    },
+    {
+      "epoch": 0.10864,
+      "grad_norm": 0.7576637864112854,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 10864
+    },
+    {
+      "epoch": 0.10865,
+      "grad_norm": 0.7982226014137268,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 10865
+    },
+    {
+      "epoch": 0.10866,
+      "grad_norm": 0.831477165222168,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 10866
+    },
+    {
+      "epoch": 0.10867,
+      "grad_norm": 0.8126018643379211,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 10867
+    },
+    {
+      "epoch": 0.10868,
+      "grad_norm": 0.7804986834526062,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 10868
+    },
+    {
+      "epoch": 0.10869,
+      "grad_norm": 0.8123776912689209,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 10869
+    },
+    {
+      "epoch": 0.1087,
+      "grad_norm": 0.9683001041412354,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 10870
+    },
+    {
+      "epoch": 0.10871,
+      "grad_norm": 1.090463399887085,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 10871
+    },
+    {
+      "epoch": 0.10872,
+      "grad_norm": 0.8968812823295593,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 10872
+    },
+    {
+      "epoch": 0.10873,
+      "grad_norm": 0.8717519640922546,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 10873
+    },
+    {
+      "epoch": 0.10874,
+      "grad_norm": 0.7509618997573853,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 10874
+    },
+    {
+      "epoch": 0.10875,
+      "grad_norm": 0.6578442454338074,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 10875
+    },
+    {
+      "epoch": 0.10876,
+      "grad_norm": 0.6749911308288574,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 10876
+    },
+    {
+      "epoch": 0.10877,
+      "grad_norm": 0.7552892565727234,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 10877
+    },
+    {
+      "epoch": 0.10878,
+      "grad_norm": 0.9866586923599243,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 10878
+    },
+    {
+      "epoch": 0.10879,
+      "grad_norm": 1.1038225889205933,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 10879
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.9424269795417786,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 10880
+    },
+    {
+      "epoch": 0.10881,
+      "grad_norm": 0.931976318359375,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 10881
+    },
+    {
+      "epoch": 0.10882,
+      "grad_norm": 0.9206244349479675,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 10882
+    },
+    {
+      "epoch": 0.10883,
+      "grad_norm": 0.8365570306777954,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 10883
+    },
+    {
+      "epoch": 0.10884,
+      "grad_norm": 0.8806115388870239,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 10884
+    },
+    {
+      "epoch": 0.10885,
+      "grad_norm": 0.71012282371521,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 10885
+    },
+    {
+      "epoch": 0.10886,
+      "grad_norm": 0.6836910247802734,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 10886
+    },
+    {
+      "epoch": 0.10887,
+      "grad_norm": 0.590783417224884,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 10887
+    },
+    {
+      "epoch": 0.10888,
+      "grad_norm": 0.6044482588768005,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 10888
+    },
+    {
+      "epoch": 0.10889,
+      "grad_norm": 0.5061213970184326,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 10889
+    },
+    {
+      "epoch": 0.1089,
+      "grad_norm": 0.4831678867340088,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 10890
+    },
+    {
+      "epoch": 0.10891,
+      "grad_norm": 0.5567334294319153,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 10891
+    },
+    {
+      "epoch": 0.10892,
+      "grad_norm": 0.7139768004417419,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 10892
+    },
+    {
+      "epoch": 0.10893,
+      "grad_norm": 0.939973771572113,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 10893
+    },
+    {
+      "epoch": 0.10894,
+      "grad_norm": 1.1920320987701416,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 10894
+    },
+    {
+      "epoch": 0.10895,
+      "grad_norm": 0.6929920315742493,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 10895
+    },
+    {
+      "epoch": 0.10896,
+      "grad_norm": 0.6842626333236694,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 10896
+    },
+    {
+      "epoch": 0.10897,
+      "grad_norm": 0.7854790091514587,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 10897
+    },
+    {
+      "epoch": 0.10898,
+      "grad_norm": 0.9311167597770691,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 10898
+    },
+    {
+      "epoch": 0.10899,
+      "grad_norm": 1.1679928302764893,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 10899
+    },
+    {
+      "epoch": 0.109,
+      "grad_norm": 0.8089187145233154,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 10900
+    },
+    {
+      "epoch": 0.10901,
+      "grad_norm": 0.6758765578269958,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 10901
+    },
+    {
+      "epoch": 0.10902,
+      "grad_norm": 0.7646622061729431,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 10902
+    },
+    {
+      "epoch": 0.10903,
+      "grad_norm": 0.7202122807502747,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 10903
+    },
+    {
+      "epoch": 0.10904,
+      "grad_norm": 0.7561991810798645,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 10904
+    },
+    {
+      "epoch": 0.10905,
+      "grad_norm": 0.5963756442070007,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 10905
+    },
+    {
+      "epoch": 0.10906,
+      "grad_norm": 0.5371752977371216,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 10906
+    },
+    {
+      "epoch": 0.10907,
+      "grad_norm": 0.5870004892349243,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 10907
+    },
+    {
+      "epoch": 0.10908,
+      "grad_norm": 0.6057379841804504,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 10908
+    },
+    {
+      "epoch": 0.10909,
+      "grad_norm": 0.593961238861084,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 10909
+    },
+    {
+      "epoch": 0.1091,
+      "grad_norm": 0.6798774600028992,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 10910
+    },
+    {
+      "epoch": 0.10911,
+      "grad_norm": 0.8016900420188904,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 10911
+    },
+    {
+      "epoch": 0.10912,
+      "grad_norm": 0.8332940340042114,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 10912
+    },
+    {
+      "epoch": 0.10913,
+      "grad_norm": 0.9600593447685242,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 10913
+    },
+    {
+      "epoch": 0.10914,
+      "grad_norm": 1.1089653968811035,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 10914
+    },
+    {
+      "epoch": 0.10915,
+      "grad_norm": 0.9480727314949036,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 10915
+    },
+    {
+      "epoch": 0.10916,
+      "grad_norm": 0.894866406917572,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 10916
+    },
+    {
+      "epoch": 0.10917,
+      "grad_norm": 0.8441586494445801,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 10917
+    },
+    {
+      "epoch": 0.10918,
+      "grad_norm": 0.9097299575805664,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 10918
+    },
+    {
+      "epoch": 0.10919,
+      "grad_norm": 0.9293150305747986,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 10919
+    },
+    {
+      "epoch": 0.1092,
+      "grad_norm": 1.0343148708343506,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 10920
+    },
+    {
+      "epoch": 0.10921,
+      "grad_norm": 1.063411831855774,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 10921
+    },
+    {
+      "epoch": 0.10922,
+      "grad_norm": 0.8918190598487854,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 10922
+    },
+    {
+      "epoch": 0.10923,
+      "grad_norm": 0.7885593771934509,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 10923
+    },
+    {
+      "epoch": 0.10924,
+      "grad_norm": 0.6758106350898743,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 10924
+    },
+    {
+      "epoch": 0.10925,
+      "grad_norm": 0.73740553855896,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 10925
+    },
+    {
+      "epoch": 0.10926,
+      "grad_norm": 0.7766514420509338,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 10926
+    },
+    {
+      "epoch": 0.10927,
+      "grad_norm": 0.8272937536239624,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 10927
+    },
+    {
+      "epoch": 0.10928,
+      "grad_norm": 0.8774792551994324,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 10928
+    },
+    {
+      "epoch": 0.10929,
+      "grad_norm": 1.0164790153503418,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 10929
+    },
+    {
+      "epoch": 0.1093,
+      "grad_norm": 1.0911238193511963,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 10930
+    },
+    {
+      "epoch": 0.10931,
+      "grad_norm": 0.8725080490112305,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 10931
+    },
+    {
+      "epoch": 0.10932,
+      "grad_norm": 0.6738268733024597,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 10932
+    },
+    {
+      "epoch": 0.10933,
+      "grad_norm": 0.6225630640983582,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 10933
+    },
+    {
+      "epoch": 0.10934,
+      "grad_norm": 0.7357386946678162,
+      "learning_rate": 0.003,
+      "loss": 4.0647,
+      "step": 10934
+    },
+    {
+      "epoch": 0.10935,
+      "grad_norm": 0.7856181859970093,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 10935
+    },
+    {
+      "epoch": 0.10936,
+      "grad_norm": 0.6953492164611816,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 10936
+    },
+    {
+      "epoch": 0.10937,
+      "grad_norm": 0.6342230439186096,
+      "learning_rate": 0.003,
+      "loss": 4.0457,
+      "step": 10937
+    },
+    {
+      "epoch": 0.10938,
+      "grad_norm": 0.8185663223266602,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 10938
+    },
+    {
+      "epoch": 0.10939,
+      "grad_norm": 0.9318892955780029,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 10939
+    },
+    {
+      "epoch": 0.1094,
+      "grad_norm": 0.8709768056869507,
+      "learning_rate": 0.003,
+      "loss": 4.0591,
+      "step": 10940
+    },
+    {
+      "epoch": 0.10941,
+      "grad_norm": 1.0035088062286377,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 10941
+    },
+    {
+      "epoch": 0.10942,
+      "grad_norm": 0.9826143383979797,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 10942
+    },
+    {
+      "epoch": 0.10943,
+      "grad_norm": 0.8794414401054382,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 10943
+    },
+    {
+      "epoch": 0.10944,
+      "grad_norm": 0.8242260217666626,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 10944
+    },
+    {
+      "epoch": 0.10945,
+      "grad_norm": 0.7647247314453125,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 10945
+    },
+    {
+      "epoch": 0.10946,
+      "grad_norm": 0.7977977991104126,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 10946
+    },
+    {
+      "epoch": 0.10947,
+      "grad_norm": 0.7217475175857544,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 10947
+    },
+    {
+      "epoch": 0.10948,
+      "grad_norm": 0.68278568983078,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 10948
+    },
+    {
+      "epoch": 0.10949,
+      "grad_norm": 0.6589925289154053,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 10949
+    },
+    {
+      "epoch": 0.1095,
+      "grad_norm": 0.733100414276123,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 10950
+    },
+    {
+      "epoch": 0.10951,
+      "grad_norm": 0.9500106573104858,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 10951
+    },
+    {
+      "epoch": 0.10952,
+      "grad_norm": 1.2109297513961792,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 10952
+    },
+    {
+      "epoch": 0.10953,
+      "grad_norm": 0.8722065091133118,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 10953
+    },
+    {
+      "epoch": 0.10954,
+      "grad_norm": 0.7249215841293335,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 10954
+    },
+    {
+      "epoch": 0.10955,
+      "grad_norm": 0.6058104038238525,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 10955
+    },
+    {
+      "epoch": 0.10956,
+      "grad_norm": 0.6156740784645081,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 10956
+    },
+    {
+      "epoch": 0.10957,
+      "grad_norm": 0.6245015859603882,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 10957
+    },
+    {
+      "epoch": 0.10958,
+      "grad_norm": 0.740548849105835,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 10958
+    },
+    {
+      "epoch": 0.10959,
+      "grad_norm": 1.0325299501419067,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 10959
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 1.0633299350738525,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 10960
+    },
+    {
+      "epoch": 0.10961,
+      "grad_norm": 0.7050127983093262,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 10961
+    },
+    {
+      "epoch": 0.10962,
+      "grad_norm": 0.6579815149307251,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 10962
+    },
+    {
+      "epoch": 0.10963,
+      "grad_norm": 0.8424527645111084,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 10963
+    },
+    {
+      "epoch": 0.10964,
+      "grad_norm": 0.9569191336631775,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 10964
+    },
+    {
+      "epoch": 0.10965,
+      "grad_norm": 1.1351628303527832,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 10965
+    },
+    {
+      "epoch": 0.10966,
+      "grad_norm": 0.7953414916992188,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 10966
+    },
+    {
+      "epoch": 0.10967,
+      "grad_norm": 0.6877783536911011,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 10967
+    },
+    {
+      "epoch": 0.10968,
+      "grad_norm": 0.7406366467475891,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 10968
+    },
+    {
+      "epoch": 0.10969,
+      "grad_norm": 0.7145115733146667,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 10969
+    },
+    {
+      "epoch": 0.1097,
+      "grad_norm": 0.6158110499382019,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 10970
+    },
+    {
+      "epoch": 0.10971,
+      "grad_norm": 0.5443043112754822,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 10971
+    },
+    {
+      "epoch": 0.10972,
+      "grad_norm": 0.542644202709198,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 10972
+    },
+    {
+      "epoch": 0.10973,
+      "grad_norm": 0.6166226863861084,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 10973
+    },
+    {
+      "epoch": 0.10974,
+      "grad_norm": 0.6657989025115967,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 10974
+    },
+    {
+      "epoch": 0.10975,
+      "grad_norm": 0.7187350988388062,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 10975
+    },
+    {
+      "epoch": 0.10976,
+      "grad_norm": 0.7222906351089478,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 10976
+    },
+    {
+      "epoch": 0.10977,
+      "grad_norm": 0.8421234488487244,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 10977
+    },
+    {
+      "epoch": 0.10978,
+      "grad_norm": 1.1006399393081665,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 10978
+    },
+    {
+      "epoch": 0.10979,
+      "grad_norm": 0.8633862733840942,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 10979
+    },
+    {
+      "epoch": 0.1098,
+      "grad_norm": 0.6699802875518799,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 10980
+    },
+    {
+      "epoch": 0.10981,
+      "grad_norm": 0.6813558340072632,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 10981
+    },
+    {
+      "epoch": 0.10982,
+      "grad_norm": 0.9310502409934998,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 10982
+    },
+    {
+      "epoch": 0.10983,
+      "grad_norm": 1.1476507186889648,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 10983
+    },
+    {
+      "epoch": 0.10984,
+      "grad_norm": 0.706564724445343,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 10984
+    },
+    {
+      "epoch": 0.10985,
+      "grad_norm": 0.584486722946167,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 10985
+    },
+    {
+      "epoch": 0.10986,
+      "grad_norm": 0.6373727917671204,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 10986
+    },
+    {
+      "epoch": 0.10987,
+      "grad_norm": 0.715704083442688,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 10987
+    },
+    {
+      "epoch": 0.10988,
+      "grad_norm": 0.8571540713310242,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 10988
+    },
+    {
+      "epoch": 0.10989,
+      "grad_norm": 0.8616876602172852,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 10989
+    },
+    {
+      "epoch": 0.1099,
+      "grad_norm": 0.7467997670173645,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 10990
+    },
+    {
+      "epoch": 0.10991,
+      "grad_norm": 0.7104257941246033,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 10991
+    },
+    {
+      "epoch": 0.10992,
+      "grad_norm": 0.8819006085395813,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 10992
+    },
+    {
+      "epoch": 0.10993,
+      "grad_norm": 1.1179277896881104,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 10993
+    },
+    {
+      "epoch": 0.10994,
+      "grad_norm": 0.7531784772872925,
+      "learning_rate": 0.003,
+      "loss": 4.0607,
+      "step": 10994
+    },
+    {
+      "epoch": 0.10995,
+      "grad_norm": 0.6037430763244629,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 10995
+    },
+    {
+      "epoch": 0.10996,
+      "grad_norm": 0.5394481420516968,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 10996
+    },
+    {
+      "epoch": 0.10997,
+      "grad_norm": 0.4965377449989319,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 10997
+    },
+    {
+      "epoch": 0.10998,
+      "grad_norm": 0.4652479290962219,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 10998
+    },
+    {
+      "epoch": 0.10999,
+      "grad_norm": 0.47271811962127686,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 10999
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5026476383209229,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 11000
+    },
+    {
+      "epoch": 0.11001,
+      "grad_norm": 0.600214958190918,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 11001
+    },
+    {
+      "epoch": 0.11002,
+      "grad_norm": 0.7847859859466553,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 11002
+    },
+    {
+      "epoch": 0.11003,
+      "grad_norm": 1.0072622299194336,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 11003
+    },
+    {
+      "epoch": 0.11004,
+      "grad_norm": 1.0832382440567017,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 11004
+    },
+    {
+      "epoch": 0.11005,
+      "grad_norm": 0.8995561003684998,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 11005
+    },
+    {
+      "epoch": 0.11006,
+      "grad_norm": 0.940889298915863,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 11006
+    },
+    {
+      "epoch": 0.11007,
+      "grad_norm": 0.9138043522834778,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 11007
+    },
+    {
+      "epoch": 0.11008,
+      "grad_norm": 0.8481074571609497,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 11008
+    },
+    {
+      "epoch": 0.11009,
+      "grad_norm": 0.9567734599113464,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 11009
+    },
+    {
+      "epoch": 0.1101,
+      "grad_norm": 1.285965919494629,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 11010
+    },
+    {
+      "epoch": 0.11011,
+      "grad_norm": 0.8564926981925964,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 11011
+    },
+    {
+      "epoch": 0.11012,
+      "grad_norm": 0.8530118465423584,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 11012
+    },
+    {
+      "epoch": 0.11013,
+      "grad_norm": 0.8939397931098938,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 11013
+    },
+    {
+      "epoch": 0.11014,
+      "grad_norm": 0.8238596320152283,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 11014
+    },
+    {
+      "epoch": 0.11015,
+      "grad_norm": 0.8376443386077881,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 11015
+    },
+    {
+      "epoch": 0.11016,
+      "grad_norm": 0.7546934485435486,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 11016
+    },
+    {
+      "epoch": 0.11017,
+      "grad_norm": 0.7620981931686401,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 11017
+    },
+    {
+      "epoch": 0.11018,
+      "grad_norm": 0.7299874424934387,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 11018
+    },
+    {
+      "epoch": 0.11019,
+      "grad_norm": 0.6225110292434692,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 11019
+    },
+    {
+      "epoch": 0.1102,
+      "grad_norm": 0.6360118389129639,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 11020
+    },
+    {
+      "epoch": 0.11021,
+      "grad_norm": 0.6085506081581116,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 11021
+    },
+    {
+      "epoch": 0.11022,
+      "grad_norm": 0.6367476582527161,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 11022
+    },
+    {
+      "epoch": 0.11023,
+      "grad_norm": 0.7030669450759888,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 11023
+    },
+    {
+      "epoch": 0.11024,
+      "grad_norm": 0.7498410940170288,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 11024
+    },
+    {
+      "epoch": 0.11025,
+      "grad_norm": 0.8611958622932434,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 11025
+    },
+    {
+      "epoch": 0.11026,
+      "grad_norm": 0.8228955268859863,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 11026
+    },
+    {
+      "epoch": 0.11027,
+      "grad_norm": 0.6890997290611267,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 11027
+    },
+    {
+      "epoch": 0.11028,
+      "grad_norm": 0.7558037638664246,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 11028
+    },
+    {
+      "epoch": 0.11029,
+      "grad_norm": 0.8738684058189392,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 11029
+    },
+    {
+      "epoch": 0.1103,
+      "grad_norm": 0.8827449083328247,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 11030
+    },
+    {
+      "epoch": 0.11031,
+      "grad_norm": 0.7657271027565002,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 11031
+    },
+    {
+      "epoch": 0.11032,
+      "grad_norm": 0.7138214111328125,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 11032
+    },
+    {
+      "epoch": 0.11033,
+      "grad_norm": 0.7493338584899902,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 11033
+    },
+    {
+      "epoch": 0.11034,
+      "grad_norm": 0.7448801398277283,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 11034
+    },
+    {
+      "epoch": 0.11035,
+      "grad_norm": 0.66679447889328,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 11035
+    },
+    {
+      "epoch": 0.11036,
+      "grad_norm": 0.6494250297546387,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 11036
+    },
+    {
+      "epoch": 0.11037,
+      "grad_norm": 0.700835108757019,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 11037
+    },
+    {
+      "epoch": 0.11038,
+      "grad_norm": 0.7485676407814026,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 11038
+    },
+    {
+      "epoch": 0.11039,
+      "grad_norm": 0.6627346277236938,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 11039
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.6443310379981995,
+      "learning_rate": 0.003,
+      "loss": 4.0513,
+      "step": 11040
+    },
+    {
+      "epoch": 0.11041,
+      "grad_norm": 0.6941895484924316,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 11041
+    },
+    {
+      "epoch": 0.11042,
+      "grad_norm": 0.7193214893341064,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 11042
+    },
+    {
+      "epoch": 0.11043,
+      "grad_norm": 0.8162452578544617,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 11043
+    },
+    {
+      "epoch": 0.11044,
+      "grad_norm": 0.9481807351112366,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 11044
+    },
+    {
+      "epoch": 0.11045,
+      "grad_norm": 0.7185466885566711,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 11045
+    },
+    {
+      "epoch": 0.11046,
+      "grad_norm": 0.7170248031616211,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 11046
+    },
+    {
+      "epoch": 0.11047,
+      "grad_norm": 0.8825567364692688,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 11047
+    },
+    {
+      "epoch": 0.11048,
+      "grad_norm": 1.2538373470306396,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 11048
+    },
+    {
+      "epoch": 0.11049,
+      "grad_norm": 0.856818675994873,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 11049
+    },
+    {
+      "epoch": 0.1105,
+      "grad_norm": 0.7342973351478577,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 11050
+    },
+    {
+      "epoch": 0.11051,
+      "grad_norm": 0.8246771693229675,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 11051
+    },
+    {
+      "epoch": 0.11052,
+      "grad_norm": 1.0061192512512207,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 11052
+    },
+    {
+      "epoch": 0.11053,
+      "grad_norm": 1.0631734132766724,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 11053
+    },
+    {
+      "epoch": 0.11054,
+      "grad_norm": 1.0199459791183472,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 11054
+    },
+    {
+      "epoch": 0.11055,
+      "grad_norm": 0.9731594920158386,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 11055
+    },
+    {
+      "epoch": 0.11056,
+      "grad_norm": 0.8760973215103149,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 11056
+    },
+    {
+      "epoch": 0.11057,
+      "grad_norm": 0.789323627948761,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 11057
+    },
+    {
+      "epoch": 0.11058,
+      "grad_norm": 0.8253927826881409,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 11058
+    },
+    {
+      "epoch": 0.11059,
+      "grad_norm": 0.766977071762085,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 11059
+    },
+    {
+      "epoch": 0.1106,
+      "grad_norm": 0.7662436366081238,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 11060
+    },
+    {
+      "epoch": 0.11061,
+      "grad_norm": 0.8208341002464294,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 11061
+    },
+    {
+      "epoch": 0.11062,
+      "grad_norm": 0.8970367908477783,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 11062
+    },
+    {
+      "epoch": 0.11063,
+      "grad_norm": 1.0099105834960938,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 11063
+    },
+    {
+      "epoch": 0.11064,
+      "grad_norm": 0.9661181569099426,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 11064
+    },
+    {
+      "epoch": 0.11065,
+      "grad_norm": 1.0960055589675903,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 11065
+    },
+    {
+      "epoch": 0.11066,
+      "grad_norm": 0.9327487945556641,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 11066
+    },
+    {
+      "epoch": 0.11067,
+      "grad_norm": 0.7413168549537659,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 11067
+    },
+    {
+      "epoch": 0.11068,
+      "grad_norm": 0.6693038940429688,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 11068
+    },
+    {
+      "epoch": 0.11069,
+      "grad_norm": 0.6442914605140686,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 11069
+    },
+    {
+      "epoch": 0.1107,
+      "grad_norm": 0.6646853089332581,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 11070
+    },
+    {
+      "epoch": 0.11071,
+      "grad_norm": 0.5942994356155396,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 11071
+    },
+    {
+      "epoch": 0.11072,
+      "grad_norm": 0.547498345375061,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 11072
+    },
+    {
+      "epoch": 0.11073,
+      "grad_norm": 0.5513029098510742,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 11073
+    },
+    {
+      "epoch": 0.11074,
+      "grad_norm": 0.6343658566474915,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 11074
+    },
+    {
+      "epoch": 0.11075,
+      "grad_norm": 0.7820315957069397,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 11075
+    },
+    {
+      "epoch": 0.11076,
+      "grad_norm": 0.9613561630249023,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 11076
+    },
+    {
+      "epoch": 0.11077,
+      "grad_norm": 1.0932581424713135,
+      "learning_rate": 0.003,
+      "loss": 4.0738,
+      "step": 11077
+    },
+    {
+      "epoch": 0.11078,
+      "grad_norm": 0.9842900633811951,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 11078
+    },
+    {
+      "epoch": 0.11079,
+      "grad_norm": 0.8640671968460083,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 11079
+    },
+    {
+      "epoch": 0.1108,
+      "grad_norm": 0.7365666627883911,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 11080
+    },
+    {
+      "epoch": 0.11081,
+      "grad_norm": 0.7927987575531006,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 11081
+    },
+    {
+      "epoch": 0.11082,
+      "grad_norm": 0.8462502360343933,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 11082
+    },
+    {
+      "epoch": 0.11083,
+      "grad_norm": 0.8290241360664368,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 11083
+    },
+    {
+      "epoch": 0.11084,
+      "grad_norm": 0.7888922095298767,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 11084
+    },
+    {
+      "epoch": 0.11085,
+      "grad_norm": 0.6967290639877319,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 11085
+    },
+    {
+      "epoch": 0.11086,
+      "grad_norm": 0.7021198868751526,
+      "learning_rate": 0.003,
+      "loss": 4.0525,
+      "step": 11086
+    },
+    {
+      "epoch": 0.11087,
+      "grad_norm": 0.7120110988616943,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 11087
+    },
+    {
+      "epoch": 0.11088,
+      "grad_norm": 0.6755147576332092,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 11088
+    },
+    {
+      "epoch": 0.11089,
+      "grad_norm": 0.8151680827140808,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 11089
+    },
+    {
+      "epoch": 0.1109,
+      "grad_norm": 1.1533037424087524,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 11090
+    },
+    {
+      "epoch": 0.11091,
+      "grad_norm": 0.9690134525299072,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 11091
+    },
+    {
+      "epoch": 0.11092,
+      "grad_norm": 0.8287317752838135,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 11092
+    },
+    {
+      "epoch": 0.11093,
+      "grad_norm": 0.6802783608436584,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 11093
+    },
+    {
+      "epoch": 0.11094,
+      "grad_norm": 0.5430587530136108,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 11094
+    },
+    {
+      "epoch": 0.11095,
+      "grad_norm": 0.4955750107765198,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 11095
+    },
+    {
+      "epoch": 0.11096,
+      "grad_norm": 0.5845931768417358,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 11096
+    },
+    {
+      "epoch": 0.11097,
+      "grad_norm": 0.7685054540634155,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 11097
+    },
+    {
+      "epoch": 0.11098,
+      "grad_norm": 0.9369683265686035,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 11098
+    },
+    {
+      "epoch": 0.11099,
+      "grad_norm": 1.0887501239776611,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 11099
+    },
+    {
+      "epoch": 0.111,
+      "grad_norm": 0.7608708143234253,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 11100
+    },
+    {
+      "epoch": 0.11101,
+      "grad_norm": 0.6959680318832397,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 11101
+    },
+    {
+      "epoch": 0.11102,
+      "grad_norm": 1.0319355726242065,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11102
+    },
+    {
+      "epoch": 0.11103,
+      "grad_norm": 1.1587380170822144,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 11103
+    },
+    {
+      "epoch": 0.11104,
+      "grad_norm": 0.6529046297073364,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 11104
+    },
+    {
+      "epoch": 0.11105,
+      "grad_norm": 0.6380882263183594,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 11105
+    },
+    {
+      "epoch": 0.11106,
+      "grad_norm": 0.801237940788269,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 11106
+    },
+    {
+      "epoch": 0.11107,
+      "grad_norm": 1.0300419330596924,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 11107
+    },
+    {
+      "epoch": 0.11108,
+      "grad_norm": 0.8984727263450623,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 11108
+    },
+    {
+      "epoch": 0.11109,
+      "grad_norm": 0.7662938237190247,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 11109
+    },
+    {
+      "epoch": 0.1111,
+      "grad_norm": 0.7453399300575256,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 11110
+    },
+    {
+      "epoch": 0.11111,
+      "grad_norm": 0.815127968788147,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 11111
+    },
+    {
+      "epoch": 0.11112,
+      "grad_norm": 0.7698167562484741,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 11112
+    },
+    {
+      "epoch": 0.11113,
+      "grad_norm": 0.6643550992012024,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 11113
+    },
+    {
+      "epoch": 0.11114,
+      "grad_norm": 0.5844383239746094,
+      "learning_rate": 0.003,
+      "loss": 4.056,
+      "step": 11114
+    },
+    {
+      "epoch": 0.11115,
+      "grad_norm": 0.6911113262176514,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 11115
+    },
+    {
+      "epoch": 0.11116,
+      "grad_norm": 0.7149133086204529,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 11116
+    },
+    {
+      "epoch": 0.11117,
+      "grad_norm": 0.6734727025032043,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 11117
+    },
+    {
+      "epoch": 0.11118,
+      "grad_norm": 0.6846103668212891,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 11118
+    },
+    {
+      "epoch": 0.11119,
+      "grad_norm": 0.7427072525024414,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 11119
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.8775559067726135,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 11120
+    },
+    {
+      "epoch": 0.11121,
+      "grad_norm": 1.1779645681381226,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 11121
+    },
+    {
+      "epoch": 0.11122,
+      "grad_norm": 0.8917052149772644,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 11122
+    },
+    {
+      "epoch": 0.11123,
+      "grad_norm": 0.8060603141784668,
+      "learning_rate": 0.003,
+      "loss": 4.046,
+      "step": 11123
+    },
+    {
+      "epoch": 0.11124,
+      "grad_norm": 0.8351340293884277,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 11124
+    },
+    {
+      "epoch": 0.11125,
+      "grad_norm": 0.7737427353858948,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 11125
+    },
+    {
+      "epoch": 0.11126,
+      "grad_norm": 0.7786157131195068,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 11126
+    },
+    {
+      "epoch": 0.11127,
+      "grad_norm": 0.8442794680595398,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 11127
+    },
+    {
+      "epoch": 0.11128,
+      "grad_norm": 0.9874286651611328,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 11128
+    },
+    {
+      "epoch": 0.11129,
+      "grad_norm": 1.0912935733795166,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 11129
+    },
+    {
+      "epoch": 0.1113,
+      "grad_norm": 0.9720292091369629,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 11130
+    },
+    {
+      "epoch": 0.11131,
+      "grad_norm": 0.866678774356842,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 11131
+    },
+    {
+      "epoch": 0.11132,
+      "grad_norm": 0.9276267886161804,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 11132
+    },
+    {
+      "epoch": 0.11133,
+      "grad_norm": 1.0052250623703003,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 11133
+    },
+    {
+      "epoch": 0.11134,
+      "grad_norm": 0.9750459790229797,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 11134
+    },
+    {
+      "epoch": 0.11135,
+      "grad_norm": 1.0204553604125977,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 11135
+    },
+    {
+      "epoch": 0.11136,
+      "grad_norm": 1.1887991428375244,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 11136
+    },
+    {
+      "epoch": 0.11137,
+      "grad_norm": 0.8008458018302917,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 11137
+    },
+    {
+      "epoch": 0.11138,
+      "grad_norm": 0.6591132879257202,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 11138
+    },
+    {
+      "epoch": 0.11139,
+      "grad_norm": 0.5977212190628052,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 11139
+    },
+    {
+      "epoch": 0.1114,
+      "grad_norm": 0.5454531311988831,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 11140
+    },
+    {
+      "epoch": 0.11141,
+      "grad_norm": 0.535681962966919,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 11141
+    },
+    {
+      "epoch": 0.11142,
+      "grad_norm": 0.5493536591529846,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 11142
+    },
+    {
+      "epoch": 0.11143,
+      "grad_norm": 0.6497412323951721,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 11143
+    },
+    {
+      "epoch": 0.11144,
+      "grad_norm": 0.7990849614143372,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 11144
+    },
+    {
+      "epoch": 0.11145,
+      "grad_norm": 0.7726136445999146,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 11145
+    },
+    {
+      "epoch": 0.11146,
+      "grad_norm": 0.6456999182701111,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 11146
+    },
+    {
+      "epoch": 0.11147,
+      "grad_norm": 0.6681917905807495,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 11147
+    },
+    {
+      "epoch": 0.11148,
+      "grad_norm": 0.6563037633895874,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 11148
+    },
+    {
+      "epoch": 0.11149,
+      "grad_norm": 0.6741673350334167,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 11149
+    },
+    {
+      "epoch": 0.1115,
+      "grad_norm": 0.8483068943023682,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 11150
+    },
+    {
+      "epoch": 0.11151,
+      "grad_norm": 1.0420079231262207,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 11151
+    },
+    {
+      "epoch": 0.11152,
+      "grad_norm": 1.0746169090270996,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 11152
+    },
+    {
+      "epoch": 0.11153,
+      "grad_norm": 0.8414022922515869,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 11153
+    },
+    {
+      "epoch": 0.11154,
+      "grad_norm": 0.7460391521453857,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 11154
+    },
+    {
+      "epoch": 0.11155,
+      "grad_norm": 0.8425396084785461,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 11155
+    },
+    {
+      "epoch": 0.11156,
+      "grad_norm": 0.7851555347442627,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 11156
+    },
+    {
+      "epoch": 0.11157,
+      "grad_norm": 0.7829062938690186,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 11157
+    },
+    {
+      "epoch": 0.11158,
+      "grad_norm": 0.8758364915847778,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 11158
+    },
+    {
+      "epoch": 0.11159,
+      "grad_norm": 0.8003804683685303,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 11159
+    },
+    {
+      "epoch": 0.1116,
+      "grad_norm": 0.7548210620880127,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 11160
+    },
+    {
+      "epoch": 0.11161,
+      "grad_norm": 0.7185114026069641,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 11161
+    },
+    {
+      "epoch": 0.11162,
+      "grad_norm": 0.6627770066261292,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 11162
+    },
+    {
+      "epoch": 0.11163,
+      "grad_norm": 0.6781023144721985,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 11163
+    },
+    {
+      "epoch": 0.11164,
+      "grad_norm": 0.7373916506767273,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 11164
+    },
+    {
+      "epoch": 0.11165,
+      "grad_norm": 0.8808543086051941,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 11165
+    },
+    {
+      "epoch": 0.11166,
+      "grad_norm": 0.934785783290863,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 11166
+    },
+    {
+      "epoch": 0.11167,
+      "grad_norm": 0.8570555448532104,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 11167
+    },
+    {
+      "epoch": 0.11168,
+      "grad_norm": 0.8547791242599487,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 11168
+    },
+    {
+      "epoch": 0.11169,
+      "grad_norm": 0.9320052266120911,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 11169
+    },
+    {
+      "epoch": 0.1117,
+      "grad_norm": 1.164999008178711,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 11170
+    },
+    {
+      "epoch": 0.11171,
+      "grad_norm": 0.9276113510131836,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 11171
+    },
+    {
+      "epoch": 0.11172,
+      "grad_norm": 0.8860996961593628,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 11172
+    },
+    {
+      "epoch": 0.11173,
+      "grad_norm": 0.9282903671264648,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 11173
+    },
+    {
+      "epoch": 0.11174,
+      "grad_norm": 0.8367762565612793,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 11174
+    },
+    {
+      "epoch": 0.11175,
+      "grad_norm": 0.7053303122520447,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 11175
+    },
+    {
+      "epoch": 0.11176,
+      "grad_norm": 0.6328437328338623,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 11176
+    },
+    {
+      "epoch": 0.11177,
+      "grad_norm": 0.6176173686981201,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 11177
+    },
+    {
+      "epoch": 0.11178,
+      "grad_norm": 0.6604607105255127,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 11178
+    },
+    {
+      "epoch": 0.11179,
+      "grad_norm": 0.6398141980171204,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 11179
+    },
+    {
+      "epoch": 0.1118,
+      "grad_norm": 0.5881091952323914,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 11180
+    },
+    {
+      "epoch": 0.11181,
+      "grad_norm": 0.5375813841819763,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 11181
+    },
+    {
+      "epoch": 0.11182,
+      "grad_norm": 0.5335777997970581,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 11182
+    },
+    {
+      "epoch": 0.11183,
+      "grad_norm": 0.4691711366176605,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 11183
+    },
+    {
+      "epoch": 0.11184,
+      "grad_norm": 0.524629533290863,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 11184
+    },
+    {
+      "epoch": 0.11185,
+      "grad_norm": 0.6009156703948975,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 11185
+    },
+    {
+      "epoch": 0.11186,
+      "grad_norm": 0.8225870132446289,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 11186
+    },
+    {
+      "epoch": 0.11187,
+      "grad_norm": 1.3781869411468506,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 11187
+    },
+    {
+      "epoch": 0.11188,
+      "grad_norm": 0.8902261257171631,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 11188
+    },
+    {
+      "epoch": 0.11189,
+      "grad_norm": 0.6796999573707581,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 11189
+    },
+    {
+      "epoch": 0.1119,
+      "grad_norm": 0.5967490673065186,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 11190
+    },
+    {
+      "epoch": 0.11191,
+      "grad_norm": 0.7360259890556335,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 11191
+    },
+    {
+      "epoch": 0.11192,
+      "grad_norm": 0.8131880164146423,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 11192
+    },
+    {
+      "epoch": 0.11193,
+      "grad_norm": 0.7584455609321594,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 11193
+    },
+    {
+      "epoch": 0.11194,
+      "grad_norm": 0.6299670934677124,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 11194
+    },
+    {
+      "epoch": 0.11195,
+      "grad_norm": 0.6245558857917786,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 11195
+    },
+    {
+      "epoch": 0.11196,
+      "grad_norm": 0.7212068438529968,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 11196
+    },
+    {
+      "epoch": 0.11197,
+      "grad_norm": 0.7425583600997925,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 11197
+    },
+    {
+      "epoch": 0.11198,
+      "grad_norm": 0.8863121271133423,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 11198
+    },
+    {
+      "epoch": 0.11199,
+      "grad_norm": 0.9826465845108032,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 11199
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.9267696738243103,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 11200
+    },
+    {
+      "epoch": 0.11201,
+      "grad_norm": 0.9320189356803894,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 11201
+    },
+    {
+      "epoch": 0.11202,
+      "grad_norm": 0.9510216116905212,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 11202
+    },
+    {
+      "epoch": 0.11203,
+      "grad_norm": 0.9528765678405762,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 11203
+    },
+    {
+      "epoch": 0.11204,
+      "grad_norm": 1.0288702249526978,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 11204
+    },
+    {
+      "epoch": 0.11205,
+      "grad_norm": 1.0346468687057495,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 11205
+    },
+    {
+      "epoch": 0.11206,
+      "grad_norm": 0.92815101146698,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 11206
+    },
+    {
+      "epoch": 0.11207,
+      "grad_norm": 0.8600759506225586,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11207
+    },
+    {
+      "epoch": 0.11208,
+      "grad_norm": 0.7563555836677551,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 11208
+    },
+    {
+      "epoch": 0.11209,
+      "grad_norm": 0.7098404169082642,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 11209
+    },
+    {
+      "epoch": 0.1121,
+      "grad_norm": 0.8382050395011902,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 11210
+    },
+    {
+      "epoch": 0.11211,
+      "grad_norm": 1.0424740314483643,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 11211
+    },
+    {
+      "epoch": 0.11212,
+      "grad_norm": 1.058436393737793,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 11212
+    },
+    {
+      "epoch": 0.11213,
+      "grad_norm": 0.9474709630012512,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 11213
+    },
+    {
+      "epoch": 0.11214,
+      "grad_norm": 0.9382772445678711,
+      "learning_rate": 0.003,
+      "loss": 4.0691,
+      "step": 11214
+    },
+    {
+      "epoch": 0.11215,
+      "grad_norm": 1.0095430612564087,
+      "learning_rate": 0.003,
+      "loss": 4.0743,
+      "step": 11215
+    },
+    {
+      "epoch": 0.11216,
+      "grad_norm": 0.99806809425354,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 11216
+    },
+    {
+      "epoch": 0.11217,
+      "grad_norm": 0.874643087387085,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 11217
+    },
+    {
+      "epoch": 0.11218,
+      "grad_norm": 0.8845378160476685,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 11218
+    },
+    {
+      "epoch": 0.11219,
+      "grad_norm": 1.0587880611419678,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 11219
+    },
+    {
+      "epoch": 0.1122,
+      "grad_norm": 0.9981693029403687,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 11220
+    },
+    {
+      "epoch": 0.11221,
+      "grad_norm": 0.8588231205940247,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 11221
+    },
+    {
+      "epoch": 0.11222,
+      "grad_norm": 0.67852783203125,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 11222
+    },
+    {
+      "epoch": 0.11223,
+      "grad_norm": 0.6346800923347473,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11223
+    },
+    {
+      "epoch": 0.11224,
+      "grad_norm": 0.5292540192604065,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 11224
+    },
+    {
+      "epoch": 0.11225,
+      "grad_norm": 0.5228301882743835,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 11225
+    },
+    {
+      "epoch": 0.11226,
+      "grad_norm": 0.48205044865608215,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 11226
+    },
+    {
+      "epoch": 0.11227,
+      "grad_norm": 0.47511619329452515,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 11227
+    },
+    {
+      "epoch": 0.11228,
+      "grad_norm": 0.5459652543067932,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 11228
+    },
+    {
+      "epoch": 0.11229,
+      "grad_norm": 0.6039158701896667,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 11229
+    },
+    {
+      "epoch": 0.1123,
+      "grad_norm": 0.5797991156578064,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 11230
+    },
+    {
+      "epoch": 0.11231,
+      "grad_norm": 0.677551805973053,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 11231
+    },
+    {
+      "epoch": 0.11232,
+      "grad_norm": 0.6512748003005981,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 11232
+    },
+    {
+      "epoch": 0.11233,
+      "grad_norm": 0.6309075355529785,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 11233
+    },
+    {
+      "epoch": 0.11234,
+      "grad_norm": 0.5747607350349426,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 11234
+    },
+    {
+      "epoch": 0.11235,
+      "grad_norm": 0.6133723855018616,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 11235
+    },
+    {
+      "epoch": 0.11236,
+      "grad_norm": 0.9459915161132812,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 11236
+    },
+    {
+      "epoch": 0.11237,
+      "grad_norm": 1.2948063611984253,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 11237
+    },
+    {
+      "epoch": 0.11238,
+      "grad_norm": 0.9041596055030823,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 11238
+    },
+    {
+      "epoch": 0.11239,
+      "grad_norm": 0.8141499757766724,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 11239
+    },
+    {
+      "epoch": 0.1124,
+      "grad_norm": 0.8682344555854797,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 11240
+    },
+    {
+      "epoch": 0.11241,
+      "grad_norm": 0.7102754712104797,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 11241
+    },
+    {
+      "epoch": 0.11242,
+      "grad_norm": 0.5986071228981018,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 11242
+    },
+    {
+      "epoch": 0.11243,
+      "grad_norm": 0.6330735087394714,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 11243
+    },
+    {
+      "epoch": 0.11244,
+      "grad_norm": 0.7382640838623047,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 11244
+    },
+    {
+      "epoch": 0.11245,
+      "grad_norm": 0.7702611088752747,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 11245
+    },
+    {
+      "epoch": 0.11246,
+      "grad_norm": 0.9086140394210815,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 11246
+    },
+    {
+      "epoch": 0.11247,
+      "grad_norm": 1.0867910385131836,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 11247
+    },
+    {
+      "epoch": 0.11248,
+      "grad_norm": 0.8973993062973022,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 11248
+    },
+    {
+      "epoch": 0.11249,
+      "grad_norm": 0.741752028465271,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 11249
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.5408896803855896,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 11250
+    },
+    {
+      "epoch": 0.11251,
+      "grad_norm": 0.577225923538208,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 11251
+    },
+    {
+      "epoch": 0.11252,
+      "grad_norm": 0.6334212422370911,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 11252
+    },
+    {
+      "epoch": 0.11253,
+      "grad_norm": 0.6426181793212891,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 11253
+    },
+    {
+      "epoch": 0.11254,
+      "grad_norm": 0.7714022994041443,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 11254
+    },
+    {
+      "epoch": 0.11255,
+      "grad_norm": 0.8345813751220703,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 11255
+    },
+    {
+      "epoch": 0.11256,
+      "grad_norm": 0.8170779943466187,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 11256
+    },
+    {
+      "epoch": 0.11257,
+      "grad_norm": 0.8695367574691772,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11257
+    },
+    {
+      "epoch": 0.11258,
+      "grad_norm": 0.9470844864845276,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 11258
+    },
+    {
+      "epoch": 0.11259,
+      "grad_norm": 1.0294305086135864,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 11259
+    },
+    {
+      "epoch": 0.1126,
+      "grad_norm": 0.9117608666419983,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 11260
+    },
+    {
+      "epoch": 0.11261,
+      "grad_norm": 0.7772311568260193,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 11261
+    },
+    {
+      "epoch": 0.11262,
+      "grad_norm": 0.7702427506446838,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 11262
+    },
+    {
+      "epoch": 0.11263,
+      "grad_norm": 0.7167341709136963,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 11263
+    },
+    {
+      "epoch": 0.11264,
+      "grad_norm": 0.7869431972503662,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 11264
+    },
+    {
+      "epoch": 0.11265,
+      "grad_norm": 0.9567169547080994,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 11265
+    },
+    {
+      "epoch": 0.11266,
+      "grad_norm": 1.15801203250885,
+      "learning_rate": 0.003,
+      "loss": 4.0561,
+      "step": 11266
+    },
+    {
+      "epoch": 0.11267,
+      "grad_norm": 0.9887841939926147,
+      "learning_rate": 0.003,
+      "loss": 4.0579,
+      "step": 11267
+    },
+    {
+      "epoch": 0.11268,
+      "grad_norm": 0.907789945602417,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 11268
+    },
+    {
+      "epoch": 0.11269,
+      "grad_norm": 0.7598255276679993,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 11269
+    },
+    {
+      "epoch": 0.1127,
+      "grad_norm": 0.7950443029403687,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 11270
+    },
+    {
+      "epoch": 0.11271,
+      "grad_norm": 0.9572848081588745,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 11271
+    },
+    {
+      "epoch": 0.11272,
+      "grad_norm": 0.9056628942489624,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 11272
+    },
+    {
+      "epoch": 0.11273,
+      "grad_norm": 1.0475438833236694,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 11273
+    },
+    {
+      "epoch": 0.11274,
+      "grad_norm": 1.0598156452178955,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 11274
+    },
+    {
+      "epoch": 0.11275,
+      "grad_norm": 0.7640100121498108,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 11275
+    },
+    {
+      "epoch": 0.11276,
+      "grad_norm": 0.666291356086731,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 11276
+    },
+    {
+      "epoch": 0.11277,
+      "grad_norm": 0.7809427380561829,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 11277
+    },
+    {
+      "epoch": 0.11278,
+      "grad_norm": 0.7285363078117371,
+      "learning_rate": 0.003,
+      "loss": 4.0477,
+      "step": 11278
+    },
+    {
+      "epoch": 0.11279,
+      "grad_norm": 0.8132727146148682,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 11279
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.9549084901809692,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 11280
+    },
+    {
+      "epoch": 0.11281,
+      "grad_norm": 0.9229263067245483,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 11281
+    },
+    {
+      "epoch": 0.11282,
+      "grad_norm": 0.7626646161079407,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 11282
+    },
+    {
+      "epoch": 0.11283,
+      "grad_norm": 0.6786330342292786,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 11283
+    },
+    {
+      "epoch": 0.11284,
+      "grad_norm": 0.6982824802398682,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 11284
+    },
+    {
+      "epoch": 0.11285,
+      "grad_norm": 0.6288905739784241,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 11285
+    },
+    {
+      "epoch": 0.11286,
+      "grad_norm": 0.5336101651191711,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 11286
+    },
+    {
+      "epoch": 0.11287,
+      "grad_norm": 0.5692437291145325,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 11287
+    },
+    {
+      "epoch": 0.11288,
+      "grad_norm": 0.6209651827812195,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 11288
+    },
+    {
+      "epoch": 0.11289,
+      "grad_norm": 0.7525631785392761,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 11289
+    },
+    {
+      "epoch": 0.1129,
+      "grad_norm": 0.9396098852157593,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 11290
+    },
+    {
+      "epoch": 0.11291,
+      "grad_norm": 1.063955545425415,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 11291
+    },
+    {
+      "epoch": 0.11292,
+      "grad_norm": 0.863498866558075,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 11292
+    },
+    {
+      "epoch": 0.11293,
+      "grad_norm": 0.6577820181846619,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 11293
+    },
+    {
+      "epoch": 0.11294,
+      "grad_norm": 0.5650443434715271,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 11294
+    },
+    {
+      "epoch": 0.11295,
+      "grad_norm": 0.6451413631439209,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 11295
+    },
+    {
+      "epoch": 0.11296,
+      "grad_norm": 0.6615285873413086,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 11296
+    },
+    {
+      "epoch": 0.11297,
+      "grad_norm": 0.6570398807525635,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 11297
+    },
+    {
+      "epoch": 0.11298,
+      "grad_norm": 0.6592798233032227,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 11298
+    },
+    {
+      "epoch": 0.11299,
+      "grad_norm": 0.847843587398529,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 11299
+    },
+    {
+      "epoch": 0.113,
+      "grad_norm": 0.9787973165512085,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 11300
+    },
+    {
+      "epoch": 0.11301,
+      "grad_norm": 1.1148426532745361,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 11301
+    },
+    {
+      "epoch": 0.11302,
+      "grad_norm": 0.8683514595031738,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 11302
+    },
+    {
+      "epoch": 0.11303,
+      "grad_norm": 0.7785764932632446,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 11303
+    },
+    {
+      "epoch": 0.11304,
+      "grad_norm": 0.7863451242446899,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 11304
+    },
+    {
+      "epoch": 0.11305,
+      "grad_norm": 0.8932132720947266,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 11305
+    },
+    {
+      "epoch": 0.11306,
+      "grad_norm": 1.0904179811477661,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 11306
+    },
+    {
+      "epoch": 0.11307,
+      "grad_norm": 0.9348009824752808,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 11307
+    },
+    {
+      "epoch": 0.11308,
+      "grad_norm": 0.8359718918800354,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 11308
+    },
+    {
+      "epoch": 0.11309,
+      "grad_norm": 0.7915058732032776,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 11309
+    },
+    {
+      "epoch": 0.1131,
+      "grad_norm": 0.731500506401062,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 11310
+    },
+    {
+      "epoch": 0.11311,
+      "grad_norm": 0.6767420768737793,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 11311
+    },
+    {
+      "epoch": 0.11312,
+      "grad_norm": 0.7071685194969177,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 11312
+    },
+    {
+      "epoch": 0.11313,
+      "grad_norm": 0.8206322193145752,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 11313
+    },
+    {
+      "epoch": 0.11314,
+      "grad_norm": 0.995917558670044,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 11314
+    },
+    {
+      "epoch": 0.11315,
+      "grad_norm": 1.007768988609314,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 11315
+    },
+    {
+      "epoch": 0.11316,
+      "grad_norm": 0.917212188243866,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 11316
+    },
+    {
+      "epoch": 0.11317,
+      "grad_norm": 0.8176689147949219,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 11317
+    },
+    {
+      "epoch": 0.11318,
+      "grad_norm": 0.790153443813324,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 11318
+    },
+    {
+      "epoch": 0.11319,
+      "grad_norm": 0.77534019947052,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 11319
+    },
+    {
+      "epoch": 0.1132,
+      "grad_norm": 0.7776956558227539,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 11320
+    },
+    {
+      "epoch": 0.11321,
+      "grad_norm": 0.7186883687973022,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 11321
+    },
+    {
+      "epoch": 0.11322,
+      "grad_norm": 0.7190566062927246,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 11322
+    },
+    {
+      "epoch": 0.11323,
+      "grad_norm": 0.7112944722175598,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 11323
+    },
+    {
+      "epoch": 0.11324,
+      "grad_norm": 0.6839632391929626,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 11324
+    },
+    {
+      "epoch": 0.11325,
+      "grad_norm": 0.7430686354637146,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 11325
+    },
+    {
+      "epoch": 0.11326,
+      "grad_norm": 0.8290679454803467,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 11326
+    },
+    {
+      "epoch": 0.11327,
+      "grad_norm": 0.7175843119621277,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 11327
+    },
+    {
+      "epoch": 0.11328,
+      "grad_norm": 0.6538087725639343,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 11328
+    },
+    {
+      "epoch": 0.11329,
+      "grad_norm": 0.5562537908554077,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 11329
+    },
+    {
+      "epoch": 0.1133,
+      "grad_norm": 0.6857356429100037,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 11330
+    },
+    {
+      "epoch": 0.11331,
+      "grad_norm": 0.9902355074882507,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 11331
+    },
+    {
+      "epoch": 0.11332,
+      "grad_norm": 1.1442843675613403,
+      "learning_rate": 0.003,
+      "loss": 4.0466,
+      "step": 11332
+    },
+    {
+      "epoch": 0.11333,
+      "grad_norm": 0.8154362440109253,
+      "learning_rate": 0.003,
+      "loss": 3.9774,
+      "step": 11333
+    },
+    {
+      "epoch": 0.11334,
+      "grad_norm": 0.7913908362388611,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 11334
+    },
+    {
+      "epoch": 0.11335,
+      "grad_norm": 0.9230514764785767,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 11335
+    },
+    {
+      "epoch": 0.11336,
+      "grad_norm": 1.0062015056610107,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 11336
+    },
+    {
+      "epoch": 0.11337,
+      "grad_norm": 0.9774962663650513,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 11337
+    },
+    {
+      "epoch": 0.11338,
+      "grad_norm": 0.8366467356681824,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 11338
+    },
+    {
+      "epoch": 0.11339,
+      "grad_norm": 0.7167420983314514,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 11339
+    },
+    {
+      "epoch": 0.1134,
+      "grad_norm": 0.7209339141845703,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 11340
+    },
+    {
+      "epoch": 0.11341,
+      "grad_norm": 0.7862409353256226,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 11341
+    },
+    {
+      "epoch": 0.11342,
+      "grad_norm": 0.7756770849227905,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 11342
+    },
+    {
+      "epoch": 0.11343,
+      "grad_norm": 0.7953722476959229,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 11343
+    },
+    {
+      "epoch": 0.11344,
+      "grad_norm": 0.8854662179946899,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 11344
+    },
+    {
+      "epoch": 0.11345,
+      "grad_norm": 0.9794262051582336,
+      "learning_rate": 0.003,
+      "loss": 4.0514,
+      "step": 11345
+    },
+    {
+      "epoch": 0.11346,
+      "grad_norm": 0.919201135635376,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 11346
+    },
+    {
+      "epoch": 0.11347,
+      "grad_norm": 0.7543264031410217,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 11347
+    },
+    {
+      "epoch": 0.11348,
+      "grad_norm": 0.7020623087882996,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 11348
+    },
+    {
+      "epoch": 0.11349,
+      "grad_norm": 0.7879665493965149,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 11349
+    },
+    {
+      "epoch": 0.1135,
+      "grad_norm": 0.862865686416626,
+      "learning_rate": 0.003,
+      "loss": 4.0534,
+      "step": 11350
+    },
+    {
+      "epoch": 0.11351,
+      "grad_norm": 0.8149678111076355,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 11351
+    },
+    {
+      "epoch": 0.11352,
+      "grad_norm": 0.7770326137542725,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 11352
+    },
+    {
+      "epoch": 0.11353,
+      "grad_norm": 0.6807413101196289,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 11353
+    },
+    {
+      "epoch": 0.11354,
+      "grad_norm": 0.6837721467018127,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 11354
+    },
+    {
+      "epoch": 0.11355,
+      "grad_norm": 0.6766520142555237,
+      "learning_rate": 0.003,
+      "loss": 4.0583,
+      "step": 11355
+    },
+    {
+      "epoch": 0.11356,
+      "grad_norm": 0.6636706590652466,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 11356
+    },
+    {
+      "epoch": 0.11357,
+      "grad_norm": 0.6780890226364136,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 11357
+    },
+    {
+      "epoch": 0.11358,
+      "grad_norm": 0.6905491948127747,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 11358
+    },
+    {
+      "epoch": 0.11359,
+      "grad_norm": 0.8020058274269104,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 11359
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.8286008238792419,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 11360
+    },
+    {
+      "epoch": 0.11361,
+      "grad_norm": 0.890533447265625,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 11361
+    },
+    {
+      "epoch": 0.11362,
+      "grad_norm": 1.0137696266174316,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 11362
+    },
+    {
+      "epoch": 0.11363,
+      "grad_norm": 1.0358784198760986,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 11363
+    },
+    {
+      "epoch": 0.11364,
+      "grad_norm": 0.8371180295944214,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 11364
+    },
+    {
+      "epoch": 0.11365,
+      "grad_norm": 0.8734986782073975,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 11365
+    },
+    {
+      "epoch": 0.11366,
+      "grad_norm": 0.9228023886680603,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 11366
+    },
+    {
+      "epoch": 0.11367,
+      "grad_norm": 0.9581348896026611,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 11367
+    },
+    {
+      "epoch": 0.11368,
+      "grad_norm": 1.0258618593215942,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 11368
+    },
+    {
+      "epoch": 0.11369,
+      "grad_norm": 1.1487102508544922,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 11369
+    },
+    {
+      "epoch": 0.1137,
+      "grad_norm": 0.7709944844245911,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 11370
+    },
+    {
+      "epoch": 0.11371,
+      "grad_norm": 0.6586733460426331,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 11371
+    },
+    {
+      "epoch": 0.11372,
+      "grad_norm": 0.6725128889083862,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 11372
+    },
+    {
+      "epoch": 0.11373,
+      "grad_norm": 0.7969658970832825,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 11373
+    },
+    {
+      "epoch": 0.11374,
+      "grad_norm": 0.8550029397010803,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 11374
+    },
+    {
+      "epoch": 0.11375,
+      "grad_norm": 0.9954065084457397,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 11375
+    },
+    {
+      "epoch": 0.11376,
+      "grad_norm": 0.9369992017745972,
+      "learning_rate": 0.003,
+      "loss": 4.0696,
+      "step": 11376
+    },
+    {
+      "epoch": 0.11377,
+      "grad_norm": 0.780281662940979,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 11377
+    },
+    {
+      "epoch": 0.11378,
+      "grad_norm": 0.6823753118515015,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 11378
+    },
+    {
+      "epoch": 0.11379,
+      "grad_norm": 0.6865916848182678,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 11379
+    },
+    {
+      "epoch": 0.1138,
+      "grad_norm": 0.6688868403434753,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 11380
+    },
+    {
+      "epoch": 0.11381,
+      "grad_norm": 0.6455453634262085,
+      "learning_rate": 0.003,
+      "loss": 3.9823,
+      "step": 11381
+    },
+    {
+      "epoch": 0.11382,
+      "grad_norm": 0.5758188962936401,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 11382
+    },
+    {
+      "epoch": 0.11383,
+      "grad_norm": 0.4809495508670807,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 11383
+    },
+    {
+      "epoch": 0.11384,
+      "grad_norm": 0.5226956009864807,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 11384
+    },
+    {
+      "epoch": 0.11385,
+      "grad_norm": 0.5507903695106506,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 11385
+    },
+    {
+      "epoch": 0.11386,
+      "grad_norm": 0.703472375869751,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 11386
+    },
+    {
+      "epoch": 0.11387,
+      "grad_norm": 0.8473612666130066,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 11387
+    },
+    {
+      "epoch": 0.11388,
+      "grad_norm": 1.0691862106323242,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 11388
+    },
+    {
+      "epoch": 0.11389,
+      "grad_norm": 1.0493425130844116,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 11389
+    },
+    {
+      "epoch": 0.1139,
+      "grad_norm": 0.8289149403572083,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 11390
+    },
+    {
+      "epoch": 0.11391,
+      "grad_norm": 0.6496636867523193,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 11391
+    },
+    {
+      "epoch": 0.11392,
+      "grad_norm": 0.6588830947875977,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 11392
+    },
+    {
+      "epoch": 0.11393,
+      "grad_norm": 0.8284120559692383,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 11393
+    },
+    {
+      "epoch": 0.11394,
+      "grad_norm": 0.8982833027839661,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 11394
+    },
+    {
+      "epoch": 0.11395,
+      "grad_norm": 0.8549256920814514,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 11395
+    },
+    {
+      "epoch": 0.11396,
+      "grad_norm": 0.7516512870788574,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 11396
+    },
+    {
+      "epoch": 0.11397,
+      "grad_norm": 0.6641618609428406,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 11397
+    },
+    {
+      "epoch": 0.11398,
+      "grad_norm": 0.7321159243583679,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 11398
+    },
+    {
+      "epoch": 0.11399,
+      "grad_norm": 0.7968395352363586,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 11399
+    },
+    {
+      "epoch": 0.114,
+      "grad_norm": 0.7829858660697937,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 11400
+    },
+    {
+      "epoch": 0.11401,
+      "grad_norm": 0.7570399641990662,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 11401
+    },
+    {
+      "epoch": 0.11402,
+      "grad_norm": 0.7252801656723022,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 11402
+    },
+    {
+      "epoch": 0.11403,
+      "grad_norm": 0.6975102424621582,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 11403
+    },
+    {
+      "epoch": 0.11404,
+      "grad_norm": 0.6238744854927063,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 11404
+    },
+    {
+      "epoch": 0.11405,
+      "grad_norm": 0.4995003938674927,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 11405
+    },
+    {
+      "epoch": 0.11406,
+      "grad_norm": 0.5959094762802124,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 11406
+    },
+    {
+      "epoch": 0.11407,
+      "grad_norm": 0.5805252194404602,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 11407
+    },
+    {
+      "epoch": 0.11408,
+      "grad_norm": 0.563025176525116,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 11408
+    },
+    {
+      "epoch": 0.11409,
+      "grad_norm": 0.6606787443161011,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 11409
+    },
+    {
+      "epoch": 0.1141,
+      "grad_norm": 0.8641210198402405,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 11410
+    },
+    {
+      "epoch": 0.11411,
+      "grad_norm": 1.1100224256515503,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 11411
+    },
+    {
+      "epoch": 0.11412,
+      "grad_norm": 0.9309477210044861,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 11412
+    },
+    {
+      "epoch": 0.11413,
+      "grad_norm": 0.7454750537872314,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 11413
+    },
+    {
+      "epoch": 0.11414,
+      "grad_norm": 0.7080267667770386,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 11414
+    },
+    {
+      "epoch": 0.11415,
+      "grad_norm": 0.7802665829658508,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 11415
+    },
+    {
+      "epoch": 0.11416,
+      "grad_norm": 0.8674970269203186,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 11416
+    },
+    {
+      "epoch": 0.11417,
+      "grad_norm": 0.9167194962501526,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 11417
+    },
+    {
+      "epoch": 0.11418,
+      "grad_norm": 0.8333175182342529,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 11418
+    },
+    {
+      "epoch": 0.11419,
+      "grad_norm": 0.7349749207496643,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 11419
+    },
+    {
+      "epoch": 0.1142,
+      "grad_norm": 0.7354753017425537,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 11420
+    },
+    {
+      "epoch": 0.11421,
+      "grad_norm": 0.7623195648193359,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 11421
+    },
+    {
+      "epoch": 0.11422,
+      "grad_norm": 0.8163384199142456,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 11422
+    },
+    {
+      "epoch": 0.11423,
+      "grad_norm": 0.9785248637199402,
+      "learning_rate": 0.003,
+      "loss": 4.0675,
+      "step": 11423
+    },
+    {
+      "epoch": 0.11424,
+      "grad_norm": 1.2677289247512817,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 11424
+    },
+    {
+      "epoch": 0.11425,
+      "grad_norm": 0.9398715496063232,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 11425
+    },
+    {
+      "epoch": 0.11426,
+      "grad_norm": 0.9864420890808105,
+      "learning_rate": 0.003,
+      "loss": 4.0659,
+      "step": 11426
+    },
+    {
+      "epoch": 0.11427,
+      "grad_norm": 0.9389703869819641,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 11427
+    },
+    {
+      "epoch": 0.11428,
+      "grad_norm": 0.884861171245575,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 11428
+    },
+    {
+      "epoch": 0.11429,
+      "grad_norm": 0.678497850894928,
+      "learning_rate": 0.003,
+      "loss": 4.0599,
+      "step": 11429
+    },
+    {
+      "epoch": 0.1143,
+      "grad_norm": 0.9918597340583801,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 11430
+    },
+    {
+      "epoch": 0.11431,
+      "grad_norm": 1.2051234245300293,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 11431
+    },
+    {
+      "epoch": 0.11432,
+      "grad_norm": 0.9006490707397461,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 11432
+    },
+    {
+      "epoch": 0.11433,
+      "grad_norm": 0.862767219543457,
+      "learning_rate": 0.003,
+      "loss": 4.0713,
+      "step": 11433
+    },
+    {
+      "epoch": 0.11434,
+      "grad_norm": 0.7508940100669861,
+      "learning_rate": 0.003,
+      "loss": 4.0616,
+      "step": 11434
+    },
+    {
+      "epoch": 0.11435,
+      "grad_norm": 0.6505155563354492,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 11435
+    },
+    {
+      "epoch": 0.11436,
+      "grad_norm": 0.5951887965202332,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 11436
+    },
+    {
+      "epoch": 0.11437,
+      "grad_norm": 0.6015822291374207,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 11437
+    },
+    {
+      "epoch": 0.11438,
+      "grad_norm": 0.6409400105476379,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 11438
+    },
+    {
+      "epoch": 0.11439,
+      "grad_norm": 0.6430752277374268,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 11439
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.6330476999282837,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 11440
+    },
+    {
+      "epoch": 0.11441,
+      "grad_norm": 0.8735347390174866,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 11441
+    },
+    {
+      "epoch": 0.11442,
+      "grad_norm": 1.0421018600463867,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 11442
+    },
+    {
+      "epoch": 0.11443,
+      "grad_norm": 1.0504906177520752,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 11443
+    },
+    {
+      "epoch": 0.11444,
+      "grad_norm": 0.9561898112297058,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 11444
+    },
+    {
+      "epoch": 0.11445,
+      "grad_norm": 0.893307089805603,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 11445
+    },
+    {
+      "epoch": 0.11446,
+      "grad_norm": 0.8496833443641663,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 11446
+    },
+    {
+      "epoch": 0.11447,
+      "grad_norm": 0.9446256160736084,
+      "learning_rate": 0.003,
+      "loss": 4.0629,
+      "step": 11447
+    },
+    {
+      "epoch": 0.11448,
+      "grad_norm": 1.1525483131408691,
+      "learning_rate": 0.003,
+      "loss": 4.0647,
+      "step": 11448
+    },
+    {
+      "epoch": 0.11449,
+      "grad_norm": 0.8027827739715576,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 11449
+    },
+    {
+      "epoch": 0.1145,
+      "grad_norm": 0.7700136303901672,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 11450
+    },
+    {
+      "epoch": 0.11451,
+      "grad_norm": 0.6967355608940125,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 11451
+    },
+    {
+      "epoch": 0.11452,
+      "grad_norm": 0.6830270886421204,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 11452
+    },
+    {
+      "epoch": 0.11453,
+      "grad_norm": 0.811431348323822,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 11453
+    },
+    {
+      "epoch": 0.11454,
+      "grad_norm": 0.9994477033615112,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 11454
+    },
+    {
+      "epoch": 0.11455,
+      "grad_norm": 1.0607717037200928,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 11455
+    },
+    {
+      "epoch": 0.11456,
+      "grad_norm": 0.7838580012321472,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 11456
+    },
+    {
+      "epoch": 0.11457,
+      "grad_norm": 0.6020787954330444,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 11457
+    },
+    {
+      "epoch": 0.11458,
+      "grad_norm": 0.6495767831802368,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 11458
+    },
+    {
+      "epoch": 0.11459,
+      "grad_norm": 0.6524631381034851,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11459
+    },
+    {
+      "epoch": 0.1146,
+      "grad_norm": 0.6308627724647522,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 11460
+    },
+    {
+      "epoch": 0.11461,
+      "grad_norm": 0.6008997559547424,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 11461
+    },
+    {
+      "epoch": 0.11462,
+      "grad_norm": 0.5560154914855957,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 11462
+    },
+    {
+      "epoch": 0.11463,
+      "grad_norm": 0.5246428847312927,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 11463
+    },
+    {
+      "epoch": 0.11464,
+      "grad_norm": 0.5368339419364929,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 11464
+    },
+    {
+      "epoch": 0.11465,
+      "grad_norm": 0.6533960103988647,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 11465
+    },
+    {
+      "epoch": 0.11466,
+      "grad_norm": 0.7779512405395508,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 11466
+    },
+    {
+      "epoch": 0.11467,
+      "grad_norm": 0.7873508334159851,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 11467
+    },
+    {
+      "epoch": 0.11468,
+      "grad_norm": 0.7781346440315247,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 11468
+    },
+    {
+      "epoch": 0.11469,
+      "grad_norm": 0.8823782801628113,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 11469
+    },
+    {
+      "epoch": 0.1147,
+      "grad_norm": 1.0907844305038452,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 11470
+    },
+    {
+      "epoch": 0.11471,
+      "grad_norm": 1.0541037321090698,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 11471
+    },
+    {
+      "epoch": 0.11472,
+      "grad_norm": 0.9238687753677368,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 11472
+    },
+    {
+      "epoch": 0.11473,
+      "grad_norm": 0.7540009617805481,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 11473
+    },
+    {
+      "epoch": 0.11474,
+      "grad_norm": 0.6382348537445068,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 11474
+    },
+    {
+      "epoch": 0.11475,
+      "grad_norm": 0.7358950972557068,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 11475
+    },
+    {
+      "epoch": 0.11476,
+      "grad_norm": 0.8675046563148499,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 11476
+    },
+    {
+      "epoch": 0.11477,
+      "grad_norm": 1.04617440700531,
+      "learning_rate": 0.003,
+      "loss": 4.056,
+      "step": 11477
+    },
+    {
+      "epoch": 0.11478,
+      "grad_norm": 1.0342538356781006,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 11478
+    },
+    {
+      "epoch": 0.11479,
+      "grad_norm": 0.8536825776100159,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 11479
+    },
+    {
+      "epoch": 0.1148,
+      "grad_norm": 0.8890334367752075,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 11480
+    },
+    {
+      "epoch": 0.11481,
+      "grad_norm": 0.8013483285903931,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 11481
+    },
+    {
+      "epoch": 0.11482,
+      "grad_norm": 0.7241466641426086,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 11482
+    },
+    {
+      "epoch": 0.11483,
+      "grad_norm": 0.671504020690918,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 11483
+    },
+    {
+      "epoch": 0.11484,
+      "grad_norm": 0.6110320687294006,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 11484
+    },
+    {
+      "epoch": 0.11485,
+      "grad_norm": 0.6042850017547607,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 11485
+    },
+    {
+      "epoch": 0.11486,
+      "grad_norm": 0.6030364036560059,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 11486
+    },
+    {
+      "epoch": 0.11487,
+      "grad_norm": 0.641995370388031,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 11487
+    },
+    {
+      "epoch": 0.11488,
+      "grad_norm": 0.787706196308136,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 11488
+    },
+    {
+      "epoch": 0.11489,
+      "grad_norm": 0.9720715284347534,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 11489
+    },
+    {
+      "epoch": 0.1149,
+      "grad_norm": 0.9054409861564636,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 11490
+    },
+    {
+      "epoch": 0.11491,
+      "grad_norm": 0.8776920437812805,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 11491
+    },
+    {
+      "epoch": 0.11492,
+      "grad_norm": 0.90711909532547,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 11492
+    },
+    {
+      "epoch": 0.11493,
+      "grad_norm": 0.8923183083534241,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 11493
+    },
+    {
+      "epoch": 0.11494,
+      "grad_norm": 0.7126554250717163,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 11494
+    },
+    {
+      "epoch": 0.11495,
+      "grad_norm": 0.6723785996437073,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 11495
+    },
+    {
+      "epoch": 0.11496,
+      "grad_norm": 0.7934660911560059,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 11496
+    },
+    {
+      "epoch": 0.11497,
+      "grad_norm": 0.8057811856269836,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 11497
+    },
+    {
+      "epoch": 0.11498,
+      "grad_norm": 0.7440505027770996,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 11498
+    },
+    {
+      "epoch": 0.11499,
+      "grad_norm": 0.7898901700973511,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 11499
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.8212448358535767,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 11500
+    },
+    {
+      "epoch": 0.11501,
+      "grad_norm": 0.8043070435523987,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 11501
+    },
+    {
+      "epoch": 0.11502,
+      "grad_norm": 0.69261634349823,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 11502
+    },
+    {
+      "epoch": 0.11503,
+      "grad_norm": 0.5780364871025085,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 11503
+    },
+    {
+      "epoch": 0.11504,
+      "grad_norm": 0.6269531846046448,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 11504
+    },
+    {
+      "epoch": 0.11505,
+      "grad_norm": 0.7450412511825562,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 11505
+    },
+    {
+      "epoch": 0.11506,
+      "grad_norm": 0.8044024705886841,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 11506
+    },
+    {
+      "epoch": 0.11507,
+      "grad_norm": 0.8625448942184448,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 11507
+    },
+    {
+      "epoch": 0.11508,
+      "grad_norm": 0.9397217035293579,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 11508
+    },
+    {
+      "epoch": 0.11509,
+      "grad_norm": 1.151915431022644,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 11509
+    },
+    {
+      "epoch": 0.1151,
+      "grad_norm": 0.7171595692634583,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 11510
+    },
+    {
+      "epoch": 0.11511,
+      "grad_norm": 0.6709728240966797,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 11511
+    },
+    {
+      "epoch": 0.11512,
+      "grad_norm": 0.7095353007316589,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 11512
+    },
+    {
+      "epoch": 0.11513,
+      "grad_norm": 0.6780837774276733,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 11513
+    },
+    {
+      "epoch": 0.11514,
+      "grad_norm": 0.5921359062194824,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 11514
+    },
+    {
+      "epoch": 0.11515,
+      "grad_norm": 0.5906557440757751,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 11515
+    },
+    {
+      "epoch": 0.11516,
+      "grad_norm": 0.6455663442611694,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 11516
+    },
+    {
+      "epoch": 0.11517,
+      "grad_norm": 0.7563275098800659,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 11517
+    },
+    {
+      "epoch": 0.11518,
+      "grad_norm": 0.8928882479667664,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 11518
+    },
+    {
+      "epoch": 0.11519,
+      "grad_norm": 1.008765459060669,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 11519
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 1.1093732118606567,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 11520
+    },
+    {
+      "epoch": 0.11521,
+      "grad_norm": 0.9981747269630432,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 11521
+    },
+    {
+      "epoch": 0.11522,
+      "grad_norm": 0.9125166535377502,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 11522
+    },
+    {
+      "epoch": 0.11523,
+      "grad_norm": 0.7462885975837708,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 11523
+    },
+    {
+      "epoch": 0.11524,
+      "grad_norm": 0.7454661726951599,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 11524
+    },
+    {
+      "epoch": 0.11525,
+      "grad_norm": 0.7887188196182251,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 11525
+    },
+    {
+      "epoch": 0.11526,
+      "grad_norm": 1.0900375843048096,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 11526
+    },
+    {
+      "epoch": 0.11527,
+      "grad_norm": 1.1068854331970215,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 11527
+    },
+    {
+      "epoch": 0.11528,
+      "grad_norm": 0.9299308657646179,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 11528
+    },
+    {
+      "epoch": 0.11529,
+      "grad_norm": 0.8719900846481323,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 11529
+    },
+    {
+      "epoch": 0.1153,
+      "grad_norm": 0.9658557176589966,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 11530
+    },
+    {
+      "epoch": 0.11531,
+      "grad_norm": 0.979651689529419,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 11531
+    },
+    {
+      "epoch": 0.11532,
+      "grad_norm": 0.8919920325279236,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 11532
+    },
+    {
+      "epoch": 0.11533,
+      "grad_norm": 0.7383463382720947,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 11533
+    },
+    {
+      "epoch": 0.11534,
+      "grad_norm": 0.7078273892402649,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 11534
+    },
+    {
+      "epoch": 0.11535,
+      "grad_norm": 0.6356802582740784,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 11535
+    },
+    {
+      "epoch": 0.11536,
+      "grad_norm": 0.6901099681854248,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 11536
+    },
+    {
+      "epoch": 0.11537,
+      "grad_norm": 0.7354924082756042,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 11537
+    },
+    {
+      "epoch": 0.11538,
+      "grad_norm": 0.9668107032775879,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 11538
+    },
+    {
+      "epoch": 0.11539,
+      "grad_norm": 1.2265743017196655,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 11539
+    },
+    {
+      "epoch": 0.1154,
+      "grad_norm": 0.7557204961776733,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 11540
+    },
+    {
+      "epoch": 0.11541,
+      "grad_norm": 0.7328994870185852,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 11541
+    },
+    {
+      "epoch": 0.11542,
+      "grad_norm": 0.6833067536354065,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 11542
+    },
+    {
+      "epoch": 0.11543,
+      "grad_norm": 0.7724437117576599,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 11543
+    },
+    {
+      "epoch": 0.11544,
+      "grad_norm": 0.8594021201133728,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 11544
+    },
+    {
+      "epoch": 0.11545,
+      "grad_norm": 1.020458698272705,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 11545
+    },
+    {
+      "epoch": 0.11546,
+      "grad_norm": 1.0542057752609253,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 11546
+    },
+    {
+      "epoch": 0.11547,
+      "grad_norm": 0.8527884483337402,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 11547
+    },
+    {
+      "epoch": 0.11548,
+      "grad_norm": 0.7760061025619507,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 11548
+    },
+    {
+      "epoch": 0.11549,
+      "grad_norm": 0.8009873032569885,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 11549
+    },
+    {
+      "epoch": 0.1155,
+      "grad_norm": 0.7634437680244446,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 11550
+    },
+    {
+      "epoch": 0.11551,
+      "grad_norm": 0.7709644436836243,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 11551
+    },
+    {
+      "epoch": 0.11552,
+      "grad_norm": 0.7237449288368225,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 11552
+    },
+    {
+      "epoch": 0.11553,
+      "grad_norm": 0.6726607084274292,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 11553
+    },
+    {
+      "epoch": 0.11554,
+      "grad_norm": 0.5839000344276428,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 11554
+    },
+    {
+      "epoch": 0.11555,
+      "grad_norm": 0.5689242482185364,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 11555
+    },
+    {
+      "epoch": 0.11556,
+      "grad_norm": 0.6661663055419922,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 11556
+    },
+    {
+      "epoch": 0.11557,
+      "grad_norm": 0.7585426568984985,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 11557
+    },
+    {
+      "epoch": 0.11558,
+      "grad_norm": 0.8989922404289246,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 11558
+    },
+    {
+      "epoch": 0.11559,
+      "grad_norm": 0.972117006778717,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 11559
+    },
+    {
+      "epoch": 0.1156,
+      "grad_norm": 0.9820306301116943,
+      "learning_rate": 0.003,
+      "loss": 4.0492,
+      "step": 11560
+    },
+    {
+      "epoch": 0.11561,
+      "grad_norm": 0.9749352931976318,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 11561
+    },
+    {
+      "epoch": 0.11562,
+      "grad_norm": 0.8870478868484497,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 11562
+    },
+    {
+      "epoch": 0.11563,
+      "grad_norm": 0.7449541687965393,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 11563
+    },
+    {
+      "epoch": 0.11564,
+      "grad_norm": 0.6444079875946045,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 11564
+    },
+    {
+      "epoch": 0.11565,
+      "grad_norm": 0.7940042018890381,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 11565
+    },
+    {
+      "epoch": 0.11566,
+      "grad_norm": 0.9880294799804688,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 11566
+    },
+    {
+      "epoch": 0.11567,
+      "grad_norm": 1.0673390626907349,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 11567
+    },
+    {
+      "epoch": 0.11568,
+      "grad_norm": 0.9975600242614746,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 11568
+    },
+    {
+      "epoch": 0.11569,
+      "grad_norm": 0.9561159610748291,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 11569
+    },
+    {
+      "epoch": 0.1157,
+      "grad_norm": 0.7889679074287415,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 11570
+    },
+    {
+      "epoch": 0.11571,
+      "grad_norm": 0.7783418893814087,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 11571
+    },
+    {
+      "epoch": 0.11572,
+      "grad_norm": 0.6229084730148315,
+      "learning_rate": 0.003,
+      "loss": 3.9798,
+      "step": 11572
+    },
+    {
+      "epoch": 0.11573,
+      "grad_norm": 0.6400849223136902,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 11573
+    },
+    {
+      "epoch": 0.11574,
+      "grad_norm": 0.6409059762954712,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 11574
+    },
+    {
+      "epoch": 0.11575,
+      "grad_norm": 0.6521430611610413,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 11575
+    },
+    {
+      "epoch": 0.11576,
+      "grad_norm": 0.7270999550819397,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 11576
+    },
+    {
+      "epoch": 0.11577,
+      "grad_norm": 0.6581321954727173,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 11577
+    },
+    {
+      "epoch": 0.11578,
+      "grad_norm": 0.7205742597579956,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 11578
+    },
+    {
+      "epoch": 0.11579,
+      "grad_norm": 0.7195239067077637,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 11579
+    },
+    {
+      "epoch": 0.1158,
+      "grad_norm": 0.6457316875457764,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 11580
+    },
+    {
+      "epoch": 0.11581,
+      "grad_norm": 0.6417819857597351,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 11581
+    },
+    {
+      "epoch": 0.11582,
+      "grad_norm": 0.7540315389633179,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 11582
+    },
+    {
+      "epoch": 0.11583,
+      "grad_norm": 0.9404506087303162,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 11583
+    },
+    {
+      "epoch": 0.11584,
+      "grad_norm": 1.077072262763977,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 11584
+    },
+    {
+      "epoch": 0.11585,
+      "grad_norm": 0.9085767269134521,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 11585
+    },
+    {
+      "epoch": 0.11586,
+      "grad_norm": 0.8725526928901672,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 11586
+    },
+    {
+      "epoch": 0.11587,
+      "grad_norm": 0.8863524198532104,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 11587
+    },
+    {
+      "epoch": 0.11588,
+      "grad_norm": 0.8565869331359863,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 11588
+    },
+    {
+      "epoch": 0.11589,
+      "grad_norm": 0.8511981964111328,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 11589
+    },
+    {
+      "epoch": 0.1159,
+      "grad_norm": 0.8466473817825317,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 11590
+    },
+    {
+      "epoch": 0.11591,
+      "grad_norm": 0.8959945440292358,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 11591
+    },
+    {
+      "epoch": 0.11592,
+      "grad_norm": 0.8851884007453918,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 11592
+    },
+    {
+      "epoch": 0.11593,
+      "grad_norm": 0.7232153415679932,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 11593
+    },
+    {
+      "epoch": 0.11594,
+      "grad_norm": 0.7993742227554321,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 11594
+    },
+    {
+      "epoch": 0.11595,
+      "grad_norm": 0.8647189736366272,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 11595
+    },
+    {
+      "epoch": 0.11596,
+      "grad_norm": 0.8297176957130432,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 11596
+    },
+    {
+      "epoch": 0.11597,
+      "grad_norm": 0.9545539617538452,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 11597
+    },
+    {
+      "epoch": 0.11598,
+      "grad_norm": 1.1064233779907227,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 11598
+    },
+    {
+      "epoch": 0.11599,
+      "grad_norm": 0.842133641242981,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 11599
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.7919009327888489,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 11600
+    },
+    {
+      "epoch": 0.11601,
+      "grad_norm": 0.8049101829528809,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 11601
+    },
+    {
+      "epoch": 0.11602,
+      "grad_norm": 0.9507090449333191,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 11602
+    },
+    {
+      "epoch": 0.11603,
+      "grad_norm": 0.9661722183227539,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 11603
+    },
+    {
+      "epoch": 0.11604,
+      "grad_norm": 0.9851440787315369,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 11604
+    },
+    {
+      "epoch": 0.11605,
+      "grad_norm": 0.95931077003479,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 11605
+    },
+    {
+      "epoch": 0.11606,
+      "grad_norm": 0.9839902520179749,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 11606
+    },
+    {
+      "epoch": 0.11607,
+      "grad_norm": 0.8679165840148926,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 11607
+    },
+    {
+      "epoch": 0.11608,
+      "grad_norm": 0.7510985136032104,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 11608
+    },
+    {
+      "epoch": 0.11609,
+      "grad_norm": 0.6729006767272949,
+      "learning_rate": 0.003,
+      "loss": 4.0626,
+      "step": 11609
+    },
+    {
+      "epoch": 0.1161,
+      "grad_norm": 0.7698171734809875,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 11610
+    },
+    {
+      "epoch": 0.11611,
+      "grad_norm": 0.866439938545227,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 11611
+    },
+    {
+      "epoch": 0.11612,
+      "grad_norm": 0.9573925733566284,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 11612
+    },
+    {
+      "epoch": 0.11613,
+      "grad_norm": 1.0064102411270142,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 11613
+    },
+    {
+      "epoch": 0.11614,
+      "grad_norm": 0.951464056968689,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 11614
+    },
+    {
+      "epoch": 0.11615,
+      "grad_norm": 0.7083039283752441,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 11615
+    },
+    {
+      "epoch": 0.11616,
+      "grad_norm": 0.5653154253959656,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 11616
+    },
+    {
+      "epoch": 0.11617,
+      "grad_norm": 0.5044902563095093,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 11617
+    },
+    {
+      "epoch": 0.11618,
+      "grad_norm": 0.5124688744544983,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 11618
+    },
+    {
+      "epoch": 0.11619,
+      "grad_norm": 0.439615398645401,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 11619
+    },
+    {
+      "epoch": 0.1162,
+      "grad_norm": 0.48477035760879517,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 11620
+    },
+    {
+      "epoch": 0.11621,
+      "grad_norm": 0.4753328561782837,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 11621
+    },
+    {
+      "epoch": 0.11622,
+      "grad_norm": 0.5537976026535034,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 11622
+    },
+    {
+      "epoch": 0.11623,
+      "grad_norm": 0.5809506177902222,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 11623
+    },
+    {
+      "epoch": 0.11624,
+      "grad_norm": 0.6150876879692078,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 11624
+    },
+    {
+      "epoch": 0.11625,
+      "grad_norm": 0.714521050453186,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 11625
+    },
+    {
+      "epoch": 0.11626,
+      "grad_norm": 0.7555716633796692,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 11626
+    },
+    {
+      "epoch": 0.11627,
+      "grad_norm": 0.7596738934516907,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 11627
+    },
+    {
+      "epoch": 0.11628,
+      "grad_norm": 0.6994544863700867,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 11628
+    },
+    {
+      "epoch": 0.11629,
+      "grad_norm": 0.6706871390342712,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 11629
+    },
+    {
+      "epoch": 0.1163,
+      "grad_norm": 0.7496283054351807,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 11630
+    },
+    {
+      "epoch": 0.11631,
+      "grad_norm": 0.7881671786308289,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 11631
+    },
+    {
+      "epoch": 0.11632,
+      "grad_norm": 0.9565471410751343,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 11632
+    },
+    {
+      "epoch": 0.11633,
+      "grad_norm": 1.3986477851867676,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 11633
+    },
+    {
+      "epoch": 0.11634,
+      "grad_norm": 0.5135916471481323,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 11634
+    },
+    {
+      "epoch": 0.11635,
+      "grad_norm": 0.6713153719902039,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 11635
+    },
+    {
+      "epoch": 0.11636,
+      "grad_norm": 0.801653265953064,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 11636
+    },
+    {
+      "epoch": 0.11637,
+      "grad_norm": 0.8092045187950134,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 11637
+    },
+    {
+      "epoch": 0.11638,
+      "grad_norm": 0.8648455739021301,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 11638
+    },
+    {
+      "epoch": 0.11639,
+      "grad_norm": 0.7091639041900635,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 11639
+    },
+    {
+      "epoch": 0.1164,
+      "grad_norm": 0.6421645879745483,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 11640
+    },
+    {
+      "epoch": 0.11641,
+      "grad_norm": 0.6985124349594116,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 11641
+    },
+    {
+      "epoch": 0.11642,
+      "grad_norm": 0.8126601576805115,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 11642
+    },
+    {
+      "epoch": 0.11643,
+      "grad_norm": 0.8762948513031006,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 11643
+    },
+    {
+      "epoch": 0.11644,
+      "grad_norm": 0.9369477033615112,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 11644
+    },
+    {
+      "epoch": 0.11645,
+      "grad_norm": 0.9334920048713684,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 11645
+    },
+    {
+      "epoch": 0.11646,
+      "grad_norm": 1.0934345722198486,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 11646
+    },
+    {
+      "epoch": 0.11647,
+      "grad_norm": 0.9287505149841309,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 11647
+    },
+    {
+      "epoch": 0.11648,
+      "grad_norm": 0.9370683431625366,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 11648
+    },
+    {
+      "epoch": 0.11649,
+      "grad_norm": 0.9412454962730408,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 11649
+    },
+    {
+      "epoch": 0.1165,
+      "grad_norm": 0.8130035996437073,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 11650
+    },
+    {
+      "epoch": 0.11651,
+      "grad_norm": 0.7839321494102478,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 11651
+    },
+    {
+      "epoch": 0.11652,
+      "grad_norm": 0.8696938157081604,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 11652
+    },
+    {
+      "epoch": 0.11653,
+      "grad_norm": 0.9900596141815186,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 11653
+    },
+    {
+      "epoch": 0.11654,
+      "grad_norm": 1.0957406759262085,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 11654
+    },
+    {
+      "epoch": 0.11655,
+      "grad_norm": 1.00582754611969,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 11655
+    },
+    {
+      "epoch": 0.11656,
+      "grad_norm": 1.2449061870574951,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 11656
+    },
+    {
+      "epoch": 0.11657,
+      "grad_norm": 0.8895262479782104,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 11657
+    },
+    {
+      "epoch": 0.11658,
+      "grad_norm": 0.7397792339324951,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 11658
+    },
+    {
+      "epoch": 0.11659,
+      "grad_norm": 0.7503185868263245,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 11659
+    },
+    {
+      "epoch": 0.1166,
+      "grad_norm": 0.8214109539985657,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 11660
+    },
+    {
+      "epoch": 0.11661,
+      "grad_norm": 0.7903082966804504,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 11661
+    },
+    {
+      "epoch": 0.11662,
+      "grad_norm": 0.7165495157241821,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 11662
+    },
+    {
+      "epoch": 0.11663,
+      "grad_norm": 0.7824479937553406,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 11663
+    },
+    {
+      "epoch": 0.11664,
+      "grad_norm": 0.810527503490448,
+      "learning_rate": 0.003,
+      "loss": 4.053,
+      "step": 11664
+    },
+    {
+      "epoch": 0.11665,
+      "grad_norm": 0.817573606967926,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 11665
+    },
+    {
+      "epoch": 0.11666,
+      "grad_norm": 0.8660758137702942,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 11666
+    },
+    {
+      "epoch": 0.11667,
+      "grad_norm": 0.8072959780693054,
+      "learning_rate": 0.003,
+      "loss": 4.0582,
+      "step": 11667
+    },
+    {
+      "epoch": 0.11668,
+      "grad_norm": 0.9078159332275391,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 11668
+    },
+    {
+      "epoch": 0.11669,
+      "grad_norm": 1.052850365638733,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 11669
+    },
+    {
+      "epoch": 0.1167,
+      "grad_norm": 0.9063073992729187,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 11670
+    },
+    {
+      "epoch": 0.11671,
+      "grad_norm": 0.7276980876922607,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 11671
+    },
+    {
+      "epoch": 0.11672,
+      "grad_norm": 0.6072031259536743,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 11672
+    },
+    {
+      "epoch": 0.11673,
+      "grad_norm": 0.6613339781761169,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 11673
+    },
+    {
+      "epoch": 0.11674,
+      "grad_norm": 0.6048155426979065,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 11674
+    },
+    {
+      "epoch": 0.11675,
+      "grad_norm": 0.6490438580513,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11675
+    },
+    {
+      "epoch": 0.11676,
+      "grad_norm": 0.6653977632522583,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 11676
+    },
+    {
+      "epoch": 0.11677,
+      "grad_norm": 0.6288249492645264,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 11677
+    },
+    {
+      "epoch": 0.11678,
+      "grad_norm": 0.6480962038040161,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 11678
+    },
+    {
+      "epoch": 0.11679,
+      "grad_norm": 0.6132774353027344,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 11679
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.5827869772911072,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 11680
+    },
+    {
+      "epoch": 0.11681,
+      "grad_norm": 0.6305233836174011,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 11681
+    },
+    {
+      "epoch": 0.11682,
+      "grad_norm": 0.7440044283866882,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 11682
+    },
+    {
+      "epoch": 0.11683,
+      "grad_norm": 0.8142518997192383,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 11683
+    },
+    {
+      "epoch": 0.11684,
+      "grad_norm": 0.8391857147216797,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 11684
+    },
+    {
+      "epoch": 0.11685,
+      "grad_norm": 0.788082480430603,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 11685
+    },
+    {
+      "epoch": 0.11686,
+      "grad_norm": 0.8251965641975403,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 11686
+    },
+    {
+      "epoch": 0.11687,
+      "grad_norm": 0.9477576613426208,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 11687
+    },
+    {
+      "epoch": 0.11688,
+      "grad_norm": 0.9171106219291687,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 11688
+    },
+    {
+      "epoch": 0.11689,
+      "grad_norm": 0.7135186791419983,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 11689
+    },
+    {
+      "epoch": 0.1169,
+      "grad_norm": 0.6629257798194885,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 11690
+    },
+    {
+      "epoch": 0.11691,
+      "grad_norm": 0.6546911001205444,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 11691
+    },
+    {
+      "epoch": 0.11692,
+      "grad_norm": 0.6957620978355408,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 11692
+    },
+    {
+      "epoch": 0.11693,
+      "grad_norm": 0.7230929136276245,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 11693
+    },
+    {
+      "epoch": 0.11694,
+      "grad_norm": 0.7952154874801636,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11694
+    },
+    {
+      "epoch": 0.11695,
+      "grad_norm": 0.866857647895813,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 11695
+    },
+    {
+      "epoch": 0.11696,
+      "grad_norm": 0.9634397625923157,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 11696
+    },
+    {
+      "epoch": 0.11697,
+      "grad_norm": 1.0934638977050781,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 11697
+    },
+    {
+      "epoch": 0.11698,
+      "grad_norm": 0.9745580554008484,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 11698
+    },
+    {
+      "epoch": 0.11699,
+      "grad_norm": 1.0540590286254883,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 11699
+    },
+    {
+      "epoch": 0.117,
+      "grad_norm": 0.8467910289764404,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 11700
+    },
+    {
+      "epoch": 0.11701,
+      "grad_norm": 0.6315522193908691,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 11701
+    },
+    {
+      "epoch": 0.11702,
+      "grad_norm": 0.6518235206604004,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 11702
+    },
+    {
+      "epoch": 0.11703,
+      "grad_norm": 0.7548579573631287,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 11703
+    },
+    {
+      "epoch": 0.11704,
+      "grad_norm": 0.7728471159934998,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 11704
+    },
+    {
+      "epoch": 0.11705,
+      "grad_norm": 0.7047388553619385,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 11705
+    },
+    {
+      "epoch": 0.11706,
+      "grad_norm": 0.6782577633857727,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 11706
+    },
+    {
+      "epoch": 0.11707,
+      "grad_norm": 0.7736063003540039,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 11707
+    },
+    {
+      "epoch": 0.11708,
+      "grad_norm": 0.7953140139579773,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 11708
+    },
+    {
+      "epoch": 0.11709,
+      "grad_norm": 0.7375701069831848,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 11709
+    },
+    {
+      "epoch": 0.1171,
+      "grad_norm": 0.7914718389511108,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 11710
+    },
+    {
+      "epoch": 0.11711,
+      "grad_norm": 0.7727771401405334,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 11711
+    },
+    {
+      "epoch": 0.11712,
+      "grad_norm": 0.8406480550765991,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 11712
+    },
+    {
+      "epoch": 0.11713,
+      "grad_norm": 0.9383452534675598,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 11713
+    },
+    {
+      "epoch": 0.11714,
+      "grad_norm": 1.0819647312164307,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 11714
+    },
+    {
+      "epoch": 0.11715,
+      "grad_norm": 0.9252278208732605,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 11715
+    },
+    {
+      "epoch": 0.11716,
+      "grad_norm": 0.7834035158157349,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 11716
+    },
+    {
+      "epoch": 0.11717,
+      "grad_norm": 0.8063236474990845,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 11717
+    },
+    {
+      "epoch": 0.11718,
+      "grad_norm": 0.8123244047164917,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 11718
+    },
+    {
+      "epoch": 0.11719,
+      "grad_norm": 0.767584502696991,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 11719
+    },
+    {
+      "epoch": 0.1172,
+      "grad_norm": 0.6858091354370117,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 11720
+    },
+    {
+      "epoch": 0.11721,
+      "grad_norm": 0.6874958276748657,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 11721
+    },
+    {
+      "epoch": 0.11722,
+      "grad_norm": 0.7214937210083008,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 11722
+    },
+    {
+      "epoch": 0.11723,
+      "grad_norm": 0.9050902724266052,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 11723
+    },
+    {
+      "epoch": 0.11724,
+      "grad_norm": 1.386954665184021,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 11724
+    },
+    {
+      "epoch": 0.11725,
+      "grad_norm": 0.7460734844207764,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 11725
+    },
+    {
+      "epoch": 0.11726,
+      "grad_norm": 0.6396922469139099,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 11726
+    },
+    {
+      "epoch": 0.11727,
+      "grad_norm": 0.6917034983634949,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 11727
+    },
+    {
+      "epoch": 0.11728,
+      "grad_norm": 0.7560396194458008,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 11728
+    },
+    {
+      "epoch": 0.11729,
+      "grad_norm": 0.8665305376052856,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 11729
+    },
+    {
+      "epoch": 0.1173,
+      "grad_norm": 0.772159993648529,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 11730
+    },
+    {
+      "epoch": 0.11731,
+      "grad_norm": 0.7486879825592041,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 11731
+    },
+    {
+      "epoch": 0.11732,
+      "grad_norm": 0.7846516966819763,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 11732
+    },
+    {
+      "epoch": 0.11733,
+      "grad_norm": 0.879023015499115,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 11733
+    },
+    {
+      "epoch": 0.11734,
+      "grad_norm": 1.1099352836608887,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 11734
+    },
+    {
+      "epoch": 0.11735,
+      "grad_norm": 0.7834532260894775,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 11735
+    },
+    {
+      "epoch": 0.11736,
+      "grad_norm": 0.5790771842002869,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 11736
+    },
+    {
+      "epoch": 0.11737,
+      "grad_norm": 0.5680363774299622,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 11737
+    },
+    {
+      "epoch": 0.11738,
+      "grad_norm": 0.6008859276771545,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 11738
+    },
+    {
+      "epoch": 0.11739,
+      "grad_norm": 0.7392567992210388,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 11739
+    },
+    {
+      "epoch": 0.1174,
+      "grad_norm": 0.916904091835022,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 11740
+    },
+    {
+      "epoch": 0.11741,
+      "grad_norm": 1.0528932809829712,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 11741
+    },
+    {
+      "epoch": 0.11742,
+      "grad_norm": 0.7817772626876831,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 11742
+    },
+    {
+      "epoch": 0.11743,
+      "grad_norm": 0.5557342171669006,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 11743
+    },
+    {
+      "epoch": 0.11744,
+      "grad_norm": 0.5775336623191833,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 11744
+    },
+    {
+      "epoch": 0.11745,
+      "grad_norm": 0.7112092971801758,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 11745
+    },
+    {
+      "epoch": 0.11746,
+      "grad_norm": 0.9123482704162598,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 11746
+    },
+    {
+      "epoch": 0.11747,
+      "grad_norm": 0.9527279734611511,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 11747
+    },
+    {
+      "epoch": 0.11748,
+      "grad_norm": 0.9975042343139648,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 11748
+    },
+    {
+      "epoch": 0.11749,
+      "grad_norm": 0.9033437371253967,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 11749
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.9851463437080383,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 11750
+    },
+    {
+      "epoch": 0.11751,
+      "grad_norm": 0.9741025567054749,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 11751
+    },
+    {
+      "epoch": 0.11752,
+      "grad_norm": 0.8289722800254822,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 11752
+    },
+    {
+      "epoch": 0.11753,
+      "grad_norm": 0.8503996729850769,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 11753
+    },
+    {
+      "epoch": 0.11754,
+      "grad_norm": 0.7909584045410156,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 11754
+    },
+    {
+      "epoch": 0.11755,
+      "grad_norm": 0.7145279049873352,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 11755
+    },
+    {
+      "epoch": 0.11756,
+      "grad_norm": 0.7444949150085449,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 11756
+    },
+    {
+      "epoch": 0.11757,
+      "grad_norm": 0.8783793449401855,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 11757
+    },
+    {
+      "epoch": 0.11758,
+      "grad_norm": 0.8041375875473022,
+      "learning_rate": 0.003,
+      "loss": 4.0568,
+      "step": 11758
+    },
+    {
+      "epoch": 0.11759,
+      "grad_norm": 0.9612789750099182,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 11759
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 1.2083760499954224,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 11760
+    },
+    {
+      "epoch": 0.11761,
+      "grad_norm": 0.807439386844635,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 11761
+    },
+    {
+      "epoch": 0.11762,
+      "grad_norm": 0.7512252330780029,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 11762
+    },
+    {
+      "epoch": 0.11763,
+      "grad_norm": 0.7043628692626953,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 11763
+    },
+    {
+      "epoch": 0.11764,
+      "grad_norm": 0.6556148529052734,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 11764
+    },
+    {
+      "epoch": 0.11765,
+      "grad_norm": 0.6054413914680481,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 11765
+    },
+    {
+      "epoch": 0.11766,
+      "grad_norm": 0.5417591333389282,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 11766
+    },
+    {
+      "epoch": 0.11767,
+      "grad_norm": 0.5872727036476135,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 11767
+    },
+    {
+      "epoch": 0.11768,
+      "grad_norm": 0.6032535433769226,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 11768
+    },
+    {
+      "epoch": 0.11769,
+      "grad_norm": 0.6734449863433838,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 11769
+    },
+    {
+      "epoch": 0.1177,
+      "grad_norm": 0.7936435341835022,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 11770
+    },
+    {
+      "epoch": 0.11771,
+      "grad_norm": 1.079922080039978,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 11771
+    },
+    {
+      "epoch": 0.11772,
+      "grad_norm": 1.234328031539917,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 11772
+    },
+    {
+      "epoch": 0.11773,
+      "grad_norm": 0.5922883152961731,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 11773
+    },
+    {
+      "epoch": 0.11774,
+      "grad_norm": 0.8037434816360474,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 11774
+    },
+    {
+      "epoch": 0.11775,
+      "grad_norm": 1.0557984113693237,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 11775
+    },
+    {
+      "epoch": 0.11776,
+      "grad_norm": 0.8010382652282715,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 11776
+    },
+    {
+      "epoch": 0.11777,
+      "grad_norm": 0.7044901251792908,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 11777
+    },
+    {
+      "epoch": 0.11778,
+      "grad_norm": 0.7547171711921692,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 11778
+    },
+    {
+      "epoch": 0.11779,
+      "grad_norm": 0.8296960592269897,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 11779
+    },
+    {
+      "epoch": 0.1178,
+      "grad_norm": 0.7630957365036011,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 11780
+    },
+    {
+      "epoch": 0.11781,
+      "grad_norm": 0.7284735441207886,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 11781
+    },
+    {
+      "epoch": 0.11782,
+      "grad_norm": 0.6988702416419983,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 11782
+    },
+    {
+      "epoch": 0.11783,
+      "grad_norm": 0.7704843878746033,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 11783
+    },
+    {
+      "epoch": 0.11784,
+      "grad_norm": 0.797591507434845,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 11784
+    },
+    {
+      "epoch": 0.11785,
+      "grad_norm": 0.7599050998687744,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 11785
+    },
+    {
+      "epoch": 0.11786,
+      "grad_norm": 0.7562331557273865,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 11786
+    },
+    {
+      "epoch": 0.11787,
+      "grad_norm": 0.7886567115783691,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 11787
+    },
+    {
+      "epoch": 0.11788,
+      "grad_norm": 0.7673211693763733,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 11788
+    },
+    {
+      "epoch": 0.11789,
+      "grad_norm": 0.7491581439971924,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 11789
+    },
+    {
+      "epoch": 0.1179,
+      "grad_norm": 0.747179388999939,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 11790
+    },
+    {
+      "epoch": 0.11791,
+      "grad_norm": 0.8889058828353882,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 11791
+    },
+    {
+      "epoch": 0.11792,
+      "grad_norm": 0.9870392084121704,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 11792
+    },
+    {
+      "epoch": 0.11793,
+      "grad_norm": 1.0791133642196655,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 11793
+    },
+    {
+      "epoch": 0.11794,
+      "grad_norm": 1.0078823566436768,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 11794
+    },
+    {
+      "epoch": 0.11795,
+      "grad_norm": 0.903905987739563,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 11795
+    },
+    {
+      "epoch": 0.11796,
+      "grad_norm": 0.7996202111244202,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 11796
+    },
+    {
+      "epoch": 0.11797,
+      "grad_norm": 0.7034594416618347,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 11797
+    },
+    {
+      "epoch": 0.11798,
+      "grad_norm": 0.8433344960212708,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 11798
+    },
+    {
+      "epoch": 0.11799,
+      "grad_norm": 0.9259787201881409,
+      "learning_rate": 0.003,
+      "loss": 4.0606,
+      "step": 11799
+    },
+    {
+      "epoch": 0.118,
+      "grad_norm": 0.9990931749343872,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 11800
+    },
+    {
+      "epoch": 0.11801,
+      "grad_norm": 1.0543739795684814,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 11801
+    },
+    {
+      "epoch": 0.11802,
+      "grad_norm": 1.006338357925415,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 11802
+    },
+    {
+      "epoch": 0.11803,
+      "grad_norm": 0.9883202910423279,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 11803
+    },
+    {
+      "epoch": 0.11804,
+      "grad_norm": 0.9899671077728271,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 11804
+    },
+    {
+      "epoch": 0.11805,
+      "grad_norm": 0.8588717579841614,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 11805
+    },
+    {
+      "epoch": 0.11806,
+      "grad_norm": 0.7318597435951233,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 11806
+    },
+    {
+      "epoch": 0.11807,
+      "grad_norm": 0.689117968082428,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 11807
+    },
+    {
+      "epoch": 0.11808,
+      "grad_norm": 0.6362882256507874,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 11808
+    },
+    {
+      "epoch": 0.11809,
+      "grad_norm": 0.641882061958313,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 11809
+    },
+    {
+      "epoch": 0.1181,
+      "grad_norm": 0.8091892004013062,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 11810
+    },
+    {
+      "epoch": 0.11811,
+      "grad_norm": 1.0445493459701538,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 11811
+    },
+    {
+      "epoch": 0.11812,
+      "grad_norm": 0.9836093187332153,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 11812
+    },
+    {
+      "epoch": 0.11813,
+      "grad_norm": 0.8166530728340149,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 11813
+    },
+    {
+      "epoch": 0.11814,
+      "grad_norm": 0.7115487456321716,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 11814
+    },
+    {
+      "epoch": 0.11815,
+      "grad_norm": 0.6834825277328491,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 11815
+    },
+    {
+      "epoch": 0.11816,
+      "grad_norm": 0.8012105226516724,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 11816
+    },
+    {
+      "epoch": 0.11817,
+      "grad_norm": 0.9633815288543701,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 11817
+    },
+    {
+      "epoch": 0.11818,
+      "grad_norm": 1.0355150699615479,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 11818
+    },
+    {
+      "epoch": 0.11819,
+      "grad_norm": 0.7960835695266724,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 11819
+    },
+    {
+      "epoch": 0.1182,
+      "grad_norm": 0.6536712646484375,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 11820
+    },
+    {
+      "epoch": 0.11821,
+      "grad_norm": 0.6484993100166321,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 11821
+    },
+    {
+      "epoch": 0.11822,
+      "grad_norm": 0.7504442930221558,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 11822
+    },
+    {
+      "epoch": 0.11823,
+      "grad_norm": 0.7606385350227356,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 11823
+    },
+    {
+      "epoch": 0.11824,
+      "grad_norm": 0.7401397824287415,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 11824
+    },
+    {
+      "epoch": 0.11825,
+      "grad_norm": 0.6549736857414246,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 11825
+    },
+    {
+      "epoch": 0.11826,
+      "grad_norm": 0.7295933365821838,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 11826
+    },
+    {
+      "epoch": 0.11827,
+      "grad_norm": 0.8805853724479675,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 11827
+    },
+    {
+      "epoch": 0.11828,
+      "grad_norm": 0.7789543271064758,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 11828
+    },
+    {
+      "epoch": 0.11829,
+      "grad_norm": 0.6787858009338379,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 11829
+    },
+    {
+      "epoch": 0.1183,
+      "grad_norm": 0.5975125432014465,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 11830
+    },
+    {
+      "epoch": 0.11831,
+      "grad_norm": 0.6243595480918884,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 11831
+    },
+    {
+      "epoch": 0.11832,
+      "grad_norm": 0.5720900893211365,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 11832
+    },
+    {
+      "epoch": 0.11833,
+      "grad_norm": 0.502043604850769,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 11833
+    },
+    {
+      "epoch": 0.11834,
+      "grad_norm": 0.4862186312675476,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 11834
+    },
+    {
+      "epoch": 0.11835,
+      "grad_norm": 0.4785534143447876,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 11835
+    },
+    {
+      "epoch": 0.11836,
+      "grad_norm": 0.4916263222694397,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 11836
+    },
+    {
+      "epoch": 0.11837,
+      "grad_norm": 0.5785505771636963,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 11837
+    },
+    {
+      "epoch": 0.11838,
+      "grad_norm": 0.7762029767036438,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 11838
+    },
+    {
+      "epoch": 0.11839,
+      "grad_norm": 1.147498369216919,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 11839
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 1.1743519306182861,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 11840
+    },
+    {
+      "epoch": 0.11841,
+      "grad_norm": 0.7633300423622131,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 11841
+    },
+    {
+      "epoch": 0.11842,
+      "grad_norm": 0.7302260398864746,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 11842
+    },
+    {
+      "epoch": 0.11843,
+      "grad_norm": 0.9037927389144897,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 11843
+    },
+    {
+      "epoch": 0.11844,
+      "grad_norm": 1.0036314725875854,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 11844
+    },
+    {
+      "epoch": 0.11845,
+      "grad_norm": 0.981715202331543,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 11845
+    },
+    {
+      "epoch": 0.11846,
+      "grad_norm": 0.9306796789169312,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 11846
+    },
+    {
+      "epoch": 0.11847,
+      "grad_norm": 0.9010899662971497,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 11847
+    },
+    {
+      "epoch": 0.11848,
+      "grad_norm": 0.9531976580619812,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 11848
+    },
+    {
+      "epoch": 0.11849,
+      "grad_norm": 0.9927566647529602,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 11849
+    },
+    {
+      "epoch": 0.1185,
+      "grad_norm": 1.0491796731948853,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 11850
+    },
+    {
+      "epoch": 0.11851,
+      "grad_norm": 0.8954125046730042,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 11851
+    },
+    {
+      "epoch": 0.11852,
+      "grad_norm": 0.9096914529800415,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 11852
+    },
+    {
+      "epoch": 0.11853,
+      "grad_norm": 0.7658776044845581,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 11853
+    },
+    {
+      "epoch": 0.11854,
+      "grad_norm": 0.724291980266571,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11854
+    },
+    {
+      "epoch": 0.11855,
+      "grad_norm": 0.6638486981391907,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 11855
+    },
+    {
+      "epoch": 0.11856,
+      "grad_norm": 0.7768213152885437,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 11856
+    },
+    {
+      "epoch": 0.11857,
+      "grad_norm": 0.8802120685577393,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 11857
+    },
+    {
+      "epoch": 0.11858,
+      "grad_norm": 1.0737560987472534,
+      "learning_rate": 0.003,
+      "loss": 4.0638,
+      "step": 11858
+    },
+    {
+      "epoch": 0.11859,
+      "grad_norm": 1.0319401025772095,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 11859
+    },
+    {
+      "epoch": 0.1186,
+      "grad_norm": 1.0573298931121826,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 11860
+    },
+    {
+      "epoch": 0.11861,
+      "grad_norm": 0.9240105152130127,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 11861
+    },
+    {
+      "epoch": 0.11862,
+      "grad_norm": 0.8155292272567749,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 11862
+    },
+    {
+      "epoch": 0.11863,
+      "grad_norm": 0.7273171544075012,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 11863
+    },
+    {
+      "epoch": 0.11864,
+      "grad_norm": 0.7226408123970032,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 11864
+    },
+    {
+      "epoch": 0.11865,
+      "grad_norm": 0.7888292670249939,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 11865
+    },
+    {
+      "epoch": 0.11866,
+      "grad_norm": 0.9124221205711365,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 11866
+    },
+    {
+      "epoch": 0.11867,
+      "grad_norm": 0.9692350029945374,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 11867
+    },
+    {
+      "epoch": 0.11868,
+      "grad_norm": 0.9330461621284485,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 11868
+    },
+    {
+      "epoch": 0.11869,
+      "grad_norm": 0.8661285042762756,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 11869
+    },
+    {
+      "epoch": 0.1187,
+      "grad_norm": 0.852867603302002,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 11870
+    },
+    {
+      "epoch": 0.11871,
+      "grad_norm": 0.8410532474517822,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 11871
+    },
+    {
+      "epoch": 0.11872,
+      "grad_norm": 0.7677245736122131,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 11872
+    },
+    {
+      "epoch": 0.11873,
+      "grad_norm": 0.7099470496177673,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 11873
+    },
+    {
+      "epoch": 0.11874,
+      "grad_norm": 0.5636304616928101,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 11874
+    },
+    {
+      "epoch": 0.11875,
+      "grad_norm": 0.6461973190307617,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 11875
+    },
+    {
+      "epoch": 0.11876,
+      "grad_norm": 0.6658841967582703,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 11876
+    },
+    {
+      "epoch": 0.11877,
+      "grad_norm": 0.655237078666687,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 11877
+    },
+    {
+      "epoch": 0.11878,
+      "grad_norm": 0.7163628935813904,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 11878
+    },
+    {
+      "epoch": 0.11879,
+      "grad_norm": 0.5543665885925293,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 11879
+    },
+    {
+      "epoch": 0.1188,
+      "grad_norm": 0.5670758485794067,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 11880
+    },
+    {
+      "epoch": 0.11881,
+      "grad_norm": 0.511709451675415,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 11881
+    },
+    {
+      "epoch": 0.11882,
+      "grad_norm": 0.4493480920791626,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 11882
+    },
+    {
+      "epoch": 0.11883,
+      "grad_norm": 0.5147427320480347,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 11883
+    },
+    {
+      "epoch": 0.11884,
+      "grad_norm": 0.6153466105461121,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 11884
+    },
+    {
+      "epoch": 0.11885,
+      "grad_norm": 0.8834508061408997,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 11885
+    },
+    {
+      "epoch": 0.11886,
+      "grad_norm": 1.244941234588623,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 11886
+    },
+    {
+      "epoch": 0.11887,
+      "grad_norm": 0.7770438194274902,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 11887
+    },
+    {
+      "epoch": 0.11888,
+      "grad_norm": 0.6514125466346741,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 11888
+    },
+    {
+      "epoch": 0.11889,
+      "grad_norm": 0.6172106862068176,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 11889
+    },
+    {
+      "epoch": 0.1189,
+      "grad_norm": 0.607948899269104,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 11890
+    },
+    {
+      "epoch": 0.11891,
+      "grad_norm": 0.7253835201263428,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 11891
+    },
+    {
+      "epoch": 0.11892,
+      "grad_norm": 0.6802498698234558,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 11892
+    },
+    {
+      "epoch": 0.11893,
+      "grad_norm": 0.7029039859771729,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 11893
+    },
+    {
+      "epoch": 0.11894,
+      "grad_norm": 0.7636929154396057,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 11894
+    },
+    {
+      "epoch": 0.11895,
+      "grad_norm": 0.7230116724967957,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 11895
+    },
+    {
+      "epoch": 0.11896,
+      "grad_norm": 0.6269598603248596,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 11896
+    },
+    {
+      "epoch": 0.11897,
+      "grad_norm": 0.5971459746360779,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 11897
+    },
+    {
+      "epoch": 0.11898,
+      "grad_norm": 0.7045280337333679,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 11898
+    },
+    {
+      "epoch": 0.11899,
+      "grad_norm": 0.7602986097335815,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 11899
+    },
+    {
+      "epoch": 0.119,
+      "grad_norm": 0.762942910194397,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 11900
+    },
+    {
+      "epoch": 0.11901,
+      "grad_norm": 0.8937726020812988,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 11901
+    },
+    {
+      "epoch": 0.11902,
+      "grad_norm": 1.0988658666610718,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 11902
+    },
+    {
+      "epoch": 0.11903,
+      "grad_norm": 1.0117043256759644,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 11903
+    },
+    {
+      "epoch": 0.11904,
+      "grad_norm": 1.114853858947754,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 11904
+    },
+    {
+      "epoch": 0.11905,
+      "grad_norm": 0.8835121393203735,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 11905
+    },
+    {
+      "epoch": 0.11906,
+      "grad_norm": 0.7715875506401062,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 11906
+    },
+    {
+      "epoch": 0.11907,
+      "grad_norm": 0.7026994824409485,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 11907
+    },
+    {
+      "epoch": 0.11908,
+      "grad_norm": 0.7263714671134949,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 11908
+    },
+    {
+      "epoch": 0.11909,
+      "grad_norm": 0.8062036037445068,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 11909
+    },
+    {
+      "epoch": 0.1191,
+      "grad_norm": 0.862729549407959,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 11910
+    },
+    {
+      "epoch": 0.11911,
+      "grad_norm": 0.8009734749794006,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 11911
+    },
+    {
+      "epoch": 0.11912,
+      "grad_norm": 0.9418559670448303,
+      "learning_rate": 0.003,
+      "loss": 4.0668,
+      "step": 11912
+    },
+    {
+      "epoch": 0.11913,
+      "grad_norm": 1.1915781497955322,
+      "learning_rate": 0.003,
+      "loss": 4.0611,
+      "step": 11913
+    },
+    {
+      "epoch": 0.11914,
+      "grad_norm": 0.8546242713928223,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 11914
+    },
+    {
+      "epoch": 0.11915,
+      "grad_norm": 0.697476327419281,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 11915
+    },
+    {
+      "epoch": 0.11916,
+      "grad_norm": 0.7126041650772095,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 11916
+    },
+    {
+      "epoch": 0.11917,
+      "grad_norm": 0.7683954238891602,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 11917
+    },
+    {
+      "epoch": 0.11918,
+      "grad_norm": 0.7003222703933716,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 11918
+    },
+    {
+      "epoch": 0.11919,
+      "grad_norm": 0.6144813299179077,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 11919
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.6127336025238037,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 11920
+    },
+    {
+      "epoch": 0.11921,
+      "grad_norm": 0.6253604292869568,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 11921
+    },
+    {
+      "epoch": 0.11922,
+      "grad_norm": 0.716572642326355,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 11922
+    },
+    {
+      "epoch": 0.11923,
+      "grad_norm": 0.8776218295097351,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 11923
+    },
+    {
+      "epoch": 0.11924,
+      "grad_norm": 0.9347663521766663,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 11924
+    },
+    {
+      "epoch": 0.11925,
+      "grad_norm": 1.0252727270126343,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 11925
+    },
+    {
+      "epoch": 0.11926,
+      "grad_norm": 1.1369107961654663,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 11926
+    },
+    {
+      "epoch": 0.11927,
+      "grad_norm": 0.8379679918289185,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 11927
+    },
+    {
+      "epoch": 0.11928,
+      "grad_norm": 0.6795943379402161,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 11928
+    },
+    {
+      "epoch": 0.11929,
+      "grad_norm": 0.6492370367050171,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 11929
+    },
+    {
+      "epoch": 0.1193,
+      "grad_norm": 0.7252062559127808,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 11930
+    },
+    {
+      "epoch": 0.11931,
+      "grad_norm": 0.6780815720558167,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 11931
+    },
+    {
+      "epoch": 0.11932,
+      "grad_norm": 0.678745448589325,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 11932
+    },
+    {
+      "epoch": 0.11933,
+      "grad_norm": 0.6801624298095703,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 11933
+    },
+    {
+      "epoch": 0.11934,
+      "grad_norm": 0.7659531235694885,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 11934
+    },
+    {
+      "epoch": 0.11935,
+      "grad_norm": 0.8605554699897766,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 11935
+    },
+    {
+      "epoch": 0.11936,
+      "grad_norm": 0.9928908944129944,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 11936
+    },
+    {
+      "epoch": 0.11937,
+      "grad_norm": 0.9970818161964417,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 11937
+    },
+    {
+      "epoch": 0.11938,
+      "grad_norm": 0.8282353281974792,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 11938
+    },
+    {
+      "epoch": 0.11939,
+      "grad_norm": 0.802817702293396,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 11939
+    },
+    {
+      "epoch": 0.1194,
+      "grad_norm": 0.7577781081199646,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 11940
+    },
+    {
+      "epoch": 0.11941,
+      "grad_norm": 0.7385069727897644,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 11941
+    },
+    {
+      "epoch": 0.11942,
+      "grad_norm": 0.7899826765060425,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 11942
+    },
+    {
+      "epoch": 0.11943,
+      "grad_norm": 0.7291315793991089,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 11943
+    },
+    {
+      "epoch": 0.11944,
+      "grad_norm": 0.6764593720436096,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 11944
+    },
+    {
+      "epoch": 0.11945,
+      "grad_norm": 0.7137207388877869,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 11945
+    },
+    {
+      "epoch": 0.11946,
+      "grad_norm": 0.9325737953186035,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 11946
+    },
+    {
+      "epoch": 0.11947,
+      "grad_norm": 1.2605060338974,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 11947
+    },
+    {
+      "epoch": 0.11948,
+      "grad_norm": 0.7151952981948853,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 11948
+    },
+    {
+      "epoch": 0.11949,
+      "grad_norm": 0.6542052030563354,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 11949
+    },
+    {
+      "epoch": 0.1195,
+      "grad_norm": 0.5991142988204956,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 11950
+    },
+    {
+      "epoch": 0.11951,
+      "grad_norm": 0.6021481156349182,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 11951
+    },
+    {
+      "epoch": 0.11952,
+      "grad_norm": 0.6924704313278198,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 11952
+    },
+    {
+      "epoch": 0.11953,
+      "grad_norm": 0.609024703502655,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 11953
+    },
+    {
+      "epoch": 0.11954,
+      "grad_norm": 0.6090964674949646,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 11954
+    },
+    {
+      "epoch": 0.11955,
+      "grad_norm": 0.6386412382125854,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 11955
+    },
+    {
+      "epoch": 0.11956,
+      "grad_norm": 0.7164059281349182,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 11956
+    },
+    {
+      "epoch": 0.11957,
+      "grad_norm": 0.7793961763381958,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 11957
+    },
+    {
+      "epoch": 0.11958,
+      "grad_norm": 0.6898953914642334,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 11958
+    },
+    {
+      "epoch": 0.11959,
+      "grad_norm": 0.7248098254203796,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 11959
+    },
+    {
+      "epoch": 0.1196,
+      "grad_norm": 0.9642515778541565,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 11960
+    },
+    {
+      "epoch": 0.11961,
+      "grad_norm": 1.08201265335083,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 11961
+    },
+    {
+      "epoch": 0.11962,
+      "grad_norm": 0.8699434995651245,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 11962
+    },
+    {
+      "epoch": 0.11963,
+      "grad_norm": 0.8050743341445923,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 11963
+    },
+    {
+      "epoch": 0.11964,
+      "grad_norm": 0.7765284776687622,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 11964
+    },
+    {
+      "epoch": 0.11965,
+      "grad_norm": 0.894541323184967,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 11965
+    },
+    {
+      "epoch": 0.11966,
+      "grad_norm": 1.0280059576034546,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 11966
+    },
+    {
+      "epoch": 0.11967,
+      "grad_norm": 1.1794424057006836,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 11967
+    },
+    {
+      "epoch": 0.11968,
+      "grad_norm": 0.886949896812439,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 11968
+    },
+    {
+      "epoch": 0.11969,
+      "grad_norm": 0.7620879411697388,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 11969
+    },
+    {
+      "epoch": 0.1197,
+      "grad_norm": 0.7351470589637756,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 11970
+    },
+    {
+      "epoch": 0.11971,
+      "grad_norm": 0.7624823451042175,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 11971
+    },
+    {
+      "epoch": 0.11972,
+      "grad_norm": 0.6433542966842651,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 11972
+    },
+    {
+      "epoch": 0.11973,
+      "grad_norm": 0.6336454153060913,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 11973
+    },
+    {
+      "epoch": 0.11974,
+      "grad_norm": 0.6942681670188904,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 11974
+    },
+    {
+      "epoch": 0.11975,
+      "grad_norm": 0.7114654183387756,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 11975
+    },
+    {
+      "epoch": 0.11976,
+      "grad_norm": 0.7263957858085632,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 11976
+    },
+    {
+      "epoch": 0.11977,
+      "grad_norm": 0.8133060932159424,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 11977
+    },
+    {
+      "epoch": 0.11978,
+      "grad_norm": 1.0520018339157104,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 11978
+    },
+    {
+      "epoch": 0.11979,
+      "grad_norm": 1.1112744808197021,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 11979
+    },
+    {
+      "epoch": 0.1198,
+      "grad_norm": 0.9063125848770142,
+      "learning_rate": 0.003,
+      "loss": 4.057,
+      "step": 11980
+    },
+    {
+      "epoch": 0.11981,
+      "grad_norm": 0.9346314668655396,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 11981
+    },
+    {
+      "epoch": 0.11982,
+      "grad_norm": 0.9792442321777344,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 11982
+    },
+    {
+      "epoch": 0.11983,
+      "grad_norm": 0.9704103469848633,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 11983
+    },
+    {
+      "epoch": 0.11984,
+      "grad_norm": 0.849890410900116,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 11984
+    },
+    {
+      "epoch": 0.11985,
+      "grad_norm": 0.9243372678756714,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 11985
+    },
+    {
+      "epoch": 0.11986,
+      "grad_norm": 1.137408971786499,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 11986
+    },
+    {
+      "epoch": 0.11987,
+      "grad_norm": 0.8184463381767273,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 11987
+    },
+    {
+      "epoch": 0.11988,
+      "grad_norm": 0.7495303153991699,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 11988
+    },
+    {
+      "epoch": 0.11989,
+      "grad_norm": 0.700872004032135,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 11989
+    },
+    {
+      "epoch": 0.1199,
+      "grad_norm": 0.7107323408126831,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 11990
+    },
+    {
+      "epoch": 0.11991,
+      "grad_norm": 0.8026617765426636,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 11991
+    },
+    {
+      "epoch": 0.11992,
+      "grad_norm": 0.8296177983283997,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 11992
+    },
+    {
+      "epoch": 0.11993,
+      "grad_norm": 0.8199352622032166,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 11993
+    },
+    {
+      "epoch": 0.11994,
+      "grad_norm": 0.711927592754364,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 11994
+    },
+    {
+      "epoch": 0.11995,
+      "grad_norm": 0.7301411628723145,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 11995
+    },
+    {
+      "epoch": 0.11996,
+      "grad_norm": 0.7096173763275146,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 11996
+    },
+    {
+      "epoch": 0.11997,
+      "grad_norm": 0.6520859003067017,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 11997
+    },
+    {
+      "epoch": 0.11998,
+      "grad_norm": 0.6798012852668762,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 11998
+    },
+    {
+      "epoch": 0.11999,
+      "grad_norm": 0.8672653436660767,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 11999
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.0057185888290405,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 12000
+    },
+    {
+      "epoch": 0.12001,
+      "grad_norm": 0.9552724361419678,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 12001
+    },
+    {
+      "epoch": 0.12002,
+      "grad_norm": 1.129225492477417,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 12002
+    },
+    {
+      "epoch": 0.12003,
+      "grad_norm": 0.8080599904060364,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 12003
+    },
+    {
+      "epoch": 0.12004,
+      "grad_norm": 0.7223402857780457,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 12004
+    },
+    {
+      "epoch": 0.12005,
+      "grad_norm": 0.6406126022338867,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 12005
+    },
+    {
+      "epoch": 0.12006,
+      "grad_norm": 0.6637592315673828,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 12006
+    },
+    {
+      "epoch": 0.12007,
+      "grad_norm": 0.8314640522003174,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 12007
+    },
+    {
+      "epoch": 0.12008,
+      "grad_norm": 0.9080745577812195,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 12008
+    },
+    {
+      "epoch": 0.12009,
+      "grad_norm": 0.8800250887870789,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 12009
+    },
+    {
+      "epoch": 0.1201,
+      "grad_norm": 0.8031384348869324,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 12010
+    },
+    {
+      "epoch": 0.12011,
+      "grad_norm": 0.9114692807197571,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 12011
+    },
+    {
+      "epoch": 0.12012,
+      "grad_norm": 0.9691035747528076,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 12012
+    },
+    {
+      "epoch": 0.12013,
+      "grad_norm": 1.0572527647018433,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 12013
+    },
+    {
+      "epoch": 0.12014,
+      "grad_norm": 0.914512574672699,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 12014
+    },
+    {
+      "epoch": 0.12015,
+      "grad_norm": 0.7966519594192505,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 12015
+    },
+    {
+      "epoch": 0.12016,
+      "grad_norm": 0.5781070590019226,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 12016
+    },
+    {
+      "epoch": 0.12017,
+      "grad_norm": 0.5625675916671753,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 12017
+    },
+    {
+      "epoch": 0.12018,
+      "grad_norm": 0.7984493374824524,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 12018
+    },
+    {
+      "epoch": 0.12019,
+      "grad_norm": 0.8596245050430298,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 12019
+    },
+    {
+      "epoch": 0.1202,
+      "grad_norm": 0.8960752487182617,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 12020
+    },
+    {
+      "epoch": 0.12021,
+      "grad_norm": 0.8911660313606262,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 12021
+    },
+    {
+      "epoch": 0.12022,
+      "grad_norm": 0.7461158633232117,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 12022
+    },
+    {
+      "epoch": 0.12023,
+      "grad_norm": 0.7623320817947388,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 12023
+    },
+    {
+      "epoch": 0.12024,
+      "grad_norm": 0.781243622303009,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 12024
+    },
+    {
+      "epoch": 0.12025,
+      "grad_norm": 0.843302845954895,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 12025
+    },
+    {
+      "epoch": 0.12026,
+      "grad_norm": 0.902095377445221,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 12026
+    },
+    {
+      "epoch": 0.12027,
+      "grad_norm": 1.1767667531967163,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 12027
+    },
+    {
+      "epoch": 0.12028,
+      "grad_norm": 0.8134157061576843,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 12028
+    },
+    {
+      "epoch": 0.12029,
+      "grad_norm": 0.7799186706542969,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 12029
+    },
+    {
+      "epoch": 0.1203,
+      "grad_norm": 0.8834793567657471,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 12030
+    },
+    {
+      "epoch": 0.12031,
+      "grad_norm": 1.112605333328247,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 12031
+    },
+    {
+      "epoch": 0.12032,
+      "grad_norm": 1.2384040355682373,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 12032
+    },
+    {
+      "epoch": 0.12033,
+      "grad_norm": 0.752450168132782,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 12033
+    },
+    {
+      "epoch": 0.12034,
+      "grad_norm": 0.6615104079246521,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 12034
+    },
+    {
+      "epoch": 0.12035,
+      "grad_norm": 0.7580855488777161,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 12035
+    },
+    {
+      "epoch": 0.12036,
+      "grad_norm": 0.6694372892379761,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 12036
+    },
+    {
+      "epoch": 0.12037,
+      "grad_norm": 0.7598423361778259,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 12037
+    },
+    {
+      "epoch": 0.12038,
+      "grad_norm": 0.7612495422363281,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 12038
+    },
+    {
+      "epoch": 0.12039,
+      "grad_norm": 0.7796201705932617,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 12039
+    },
+    {
+      "epoch": 0.1204,
+      "grad_norm": 0.7708131670951843,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 12040
+    },
+    {
+      "epoch": 0.12041,
+      "grad_norm": 0.707992672920227,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 12041
+    },
+    {
+      "epoch": 0.12042,
+      "grad_norm": 0.6602921485900879,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 12042
+    },
+    {
+      "epoch": 0.12043,
+      "grad_norm": 0.7230878472328186,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 12043
+    },
+    {
+      "epoch": 0.12044,
+      "grad_norm": 0.7043576240539551,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 12044
+    },
+    {
+      "epoch": 0.12045,
+      "grad_norm": 0.7773805260658264,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 12045
+    },
+    {
+      "epoch": 0.12046,
+      "grad_norm": 0.9323230981826782,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 12046
+    },
+    {
+      "epoch": 0.12047,
+      "grad_norm": 0.9924766421318054,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 12047
+    },
+    {
+      "epoch": 0.12048,
+      "grad_norm": 0.9833324551582336,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 12048
+    },
+    {
+      "epoch": 0.12049,
+      "grad_norm": 0.8768059015274048,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 12049
+    },
+    {
+      "epoch": 0.1205,
+      "grad_norm": 0.6571690440177917,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 12050
+    },
+    {
+      "epoch": 0.12051,
+      "grad_norm": 0.6308192610740662,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 12051
+    },
+    {
+      "epoch": 0.12052,
+      "grad_norm": 0.7342305183410645,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 12052
+    },
+    {
+      "epoch": 0.12053,
+      "grad_norm": 0.9399965405464172,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 12053
+    },
+    {
+      "epoch": 0.12054,
+      "grad_norm": 1.0758345127105713,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 12054
+    },
+    {
+      "epoch": 0.12055,
+      "grad_norm": 0.7563015818595886,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 12055
+    },
+    {
+      "epoch": 0.12056,
+      "grad_norm": 0.6503219604492188,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 12056
+    },
+    {
+      "epoch": 0.12057,
+      "grad_norm": 0.6826764345169067,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 12057
+    },
+    {
+      "epoch": 0.12058,
+      "grad_norm": 0.7406718134880066,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 12058
+    },
+    {
+      "epoch": 0.12059,
+      "grad_norm": 0.7053825855255127,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 12059
+    },
+    {
+      "epoch": 0.1206,
+      "grad_norm": 0.7572804093360901,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12060
+    },
+    {
+      "epoch": 0.12061,
+      "grad_norm": 0.9737385511398315,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 12061
+    },
+    {
+      "epoch": 0.12062,
+      "grad_norm": 1.229773998260498,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 12062
+    },
+    {
+      "epoch": 0.12063,
+      "grad_norm": 0.9908783435821533,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 12063
+    },
+    {
+      "epoch": 0.12064,
+      "grad_norm": 0.8750876188278198,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 12064
+    },
+    {
+      "epoch": 0.12065,
+      "grad_norm": 0.8608133792877197,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 12065
+    },
+    {
+      "epoch": 0.12066,
+      "grad_norm": 0.8134159445762634,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 12066
+    },
+    {
+      "epoch": 0.12067,
+      "grad_norm": 0.6643102169036865,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 12067
+    },
+    {
+      "epoch": 0.12068,
+      "grad_norm": 0.7150804400444031,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 12068
+    },
+    {
+      "epoch": 0.12069,
+      "grad_norm": 0.8066950440406799,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 12069
+    },
+    {
+      "epoch": 0.1207,
+      "grad_norm": 0.8921740055084229,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 12070
+    },
+    {
+      "epoch": 0.12071,
+      "grad_norm": 1.0176774263381958,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 12071
+    },
+    {
+      "epoch": 0.12072,
+      "grad_norm": 0.9523666501045227,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 12072
+    },
+    {
+      "epoch": 0.12073,
+      "grad_norm": 0.9739864468574524,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 12073
+    },
+    {
+      "epoch": 0.12074,
+      "grad_norm": 0.8286561369895935,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 12074
+    },
+    {
+      "epoch": 0.12075,
+      "grad_norm": 0.7403897643089294,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 12075
+    },
+    {
+      "epoch": 0.12076,
+      "grad_norm": 0.6872183084487915,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 12076
+    },
+    {
+      "epoch": 0.12077,
+      "grad_norm": 0.6335912942886353,
+      "learning_rate": 0.003,
+      "loss": 3.9772,
+      "step": 12077
+    },
+    {
+      "epoch": 0.12078,
+      "grad_norm": 0.6662702560424805,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 12078
+    },
+    {
+      "epoch": 0.12079,
+      "grad_norm": 0.6359442472457886,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 12079
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.5857674479484558,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 12080
+    },
+    {
+      "epoch": 0.12081,
+      "grad_norm": 0.6044425964355469,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 12081
+    },
+    {
+      "epoch": 0.12082,
+      "grad_norm": 0.6441171169281006,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 12082
+    },
+    {
+      "epoch": 0.12083,
+      "grad_norm": 0.6959605813026428,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 12083
+    },
+    {
+      "epoch": 0.12084,
+      "grad_norm": 0.7471359968185425,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 12084
+    },
+    {
+      "epoch": 0.12085,
+      "grad_norm": 0.92557293176651,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 12085
+    },
+    {
+      "epoch": 0.12086,
+      "grad_norm": 1.1548380851745605,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 12086
+    },
+    {
+      "epoch": 0.12087,
+      "grad_norm": 0.8555243015289307,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 12087
+    },
+    {
+      "epoch": 0.12088,
+      "grad_norm": 0.6919422149658203,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 12088
+    },
+    {
+      "epoch": 0.12089,
+      "grad_norm": 0.654999315738678,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 12089
+    },
+    {
+      "epoch": 0.1209,
+      "grad_norm": 0.7054887413978577,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 12090
+    },
+    {
+      "epoch": 0.12091,
+      "grad_norm": 0.6020074486732483,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 12091
+    },
+    {
+      "epoch": 0.12092,
+      "grad_norm": 0.6386114954948425,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 12092
+    },
+    {
+      "epoch": 0.12093,
+      "grad_norm": 0.7278828620910645,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 12093
+    },
+    {
+      "epoch": 0.12094,
+      "grad_norm": 0.8216177821159363,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 12094
+    },
+    {
+      "epoch": 0.12095,
+      "grad_norm": 0.8740129470825195,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 12095
+    },
+    {
+      "epoch": 0.12096,
+      "grad_norm": 0.9309924244880676,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 12096
+    },
+    {
+      "epoch": 0.12097,
+      "grad_norm": 0.9449071884155273,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 12097
+    },
+    {
+      "epoch": 0.12098,
+      "grad_norm": 0.806206226348877,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 12098
+    },
+    {
+      "epoch": 0.12099,
+      "grad_norm": 0.6736234426498413,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 12099
+    },
+    {
+      "epoch": 0.121,
+      "grad_norm": 0.8320266008377075,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 12100
+    },
+    {
+      "epoch": 0.12101,
+      "grad_norm": 0.9483769536018372,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 12101
+    },
+    {
+      "epoch": 0.12102,
+      "grad_norm": 1.183903455734253,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 12102
+    },
+    {
+      "epoch": 0.12103,
+      "grad_norm": 0.9592525362968445,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 12103
+    },
+    {
+      "epoch": 0.12104,
+      "grad_norm": 0.8384464383125305,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 12104
+    },
+    {
+      "epoch": 0.12105,
+      "grad_norm": 0.7661356925964355,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 12105
+    },
+    {
+      "epoch": 0.12106,
+      "grad_norm": 0.8419826626777649,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 12106
+    },
+    {
+      "epoch": 0.12107,
+      "grad_norm": 0.9251189231872559,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 12107
+    },
+    {
+      "epoch": 0.12108,
+      "grad_norm": 1.063491702079773,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 12108
+    },
+    {
+      "epoch": 0.12109,
+      "grad_norm": 0.8947046995162964,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 12109
+    },
+    {
+      "epoch": 0.1211,
+      "grad_norm": 0.8604028820991516,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 12110
+    },
+    {
+      "epoch": 0.12111,
+      "grad_norm": 0.7970651984214783,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 12111
+    },
+    {
+      "epoch": 0.12112,
+      "grad_norm": 0.7963676452636719,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 12112
+    },
+    {
+      "epoch": 0.12113,
+      "grad_norm": 0.7746478319168091,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 12113
+    },
+    {
+      "epoch": 0.12114,
+      "grad_norm": 0.7988983988761902,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 12114
+    },
+    {
+      "epoch": 0.12115,
+      "grad_norm": 0.8888752460479736,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 12115
+    },
+    {
+      "epoch": 0.12116,
+      "grad_norm": 0.8377873301506042,
+      "learning_rate": 0.003,
+      "loss": 4.0551,
+      "step": 12116
+    },
+    {
+      "epoch": 0.12117,
+      "grad_norm": 0.6950471997261047,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 12117
+    },
+    {
+      "epoch": 0.12118,
+      "grad_norm": 0.7134587168693542,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 12118
+    },
+    {
+      "epoch": 0.12119,
+      "grad_norm": 0.6574320197105408,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 12119
+    },
+    {
+      "epoch": 0.1212,
+      "grad_norm": 0.5444393754005432,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 12120
+    },
+    {
+      "epoch": 0.12121,
+      "grad_norm": 0.5818464159965515,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 12121
+    },
+    {
+      "epoch": 0.12122,
+      "grad_norm": 0.6494410037994385,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 12122
+    },
+    {
+      "epoch": 0.12123,
+      "grad_norm": 0.7310759425163269,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 12123
+    },
+    {
+      "epoch": 0.12124,
+      "grad_norm": 0.7354447841644287,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 12124
+    },
+    {
+      "epoch": 0.12125,
+      "grad_norm": 0.7390363216400146,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 12125
+    },
+    {
+      "epoch": 0.12126,
+      "grad_norm": 0.7380009293556213,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 12126
+    },
+    {
+      "epoch": 0.12127,
+      "grad_norm": 0.7050486207008362,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 12127
+    },
+    {
+      "epoch": 0.12128,
+      "grad_norm": 0.8125148415565491,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 12128
+    },
+    {
+      "epoch": 0.12129,
+      "grad_norm": 0.8589991927146912,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 12129
+    },
+    {
+      "epoch": 0.1213,
+      "grad_norm": 0.847008466720581,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 12130
+    },
+    {
+      "epoch": 0.12131,
+      "grad_norm": 0.9250040054321289,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 12131
+    },
+    {
+      "epoch": 0.12132,
+      "grad_norm": 1.12769615650177,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 12132
+    },
+    {
+      "epoch": 0.12133,
+      "grad_norm": 1.0908305644989014,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 12133
+    },
+    {
+      "epoch": 0.12134,
+      "grad_norm": 0.8170710802078247,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 12134
+    },
+    {
+      "epoch": 0.12135,
+      "grad_norm": 0.7351126074790955,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 12135
+    },
+    {
+      "epoch": 0.12136,
+      "grad_norm": 0.7365264892578125,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 12136
+    },
+    {
+      "epoch": 0.12137,
+      "grad_norm": 0.7570641040802002,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 12137
+    },
+    {
+      "epoch": 0.12138,
+      "grad_norm": 0.7057920098304749,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 12138
+    },
+    {
+      "epoch": 0.12139,
+      "grad_norm": 0.7471189498901367,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 12139
+    },
+    {
+      "epoch": 0.1214,
+      "grad_norm": 0.8883476257324219,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 12140
+    },
+    {
+      "epoch": 0.12141,
+      "grad_norm": 0.884615421295166,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 12141
+    },
+    {
+      "epoch": 0.12142,
+      "grad_norm": 0.7976682782173157,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 12142
+    },
+    {
+      "epoch": 0.12143,
+      "grad_norm": 0.8299143314361572,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 12143
+    },
+    {
+      "epoch": 0.12144,
+      "grad_norm": 0.9504224061965942,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 12144
+    },
+    {
+      "epoch": 0.12145,
+      "grad_norm": 0.9362991452217102,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 12145
+    },
+    {
+      "epoch": 0.12146,
+      "grad_norm": 0.9021977782249451,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 12146
+    },
+    {
+      "epoch": 0.12147,
+      "grad_norm": 0.95110684633255,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 12147
+    },
+    {
+      "epoch": 0.12148,
+      "grad_norm": 0.8893001675605774,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 12148
+    },
+    {
+      "epoch": 0.12149,
+      "grad_norm": 0.7543485164642334,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 12149
+    },
+    {
+      "epoch": 0.1215,
+      "grad_norm": 0.6045880317687988,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 12150
+    },
+    {
+      "epoch": 0.12151,
+      "grad_norm": 0.6619433760643005,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 12151
+    },
+    {
+      "epoch": 0.12152,
+      "grad_norm": 0.7473676800727844,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 12152
+    },
+    {
+      "epoch": 0.12153,
+      "grad_norm": 0.6784195899963379,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 12153
+    },
+    {
+      "epoch": 0.12154,
+      "grad_norm": 0.6124919056892395,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 12154
+    },
+    {
+      "epoch": 0.12155,
+      "grad_norm": 0.653683066368103,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 12155
+    },
+    {
+      "epoch": 0.12156,
+      "grad_norm": 0.8463802933692932,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 12156
+    },
+    {
+      "epoch": 0.12157,
+      "grad_norm": 0.9578853249549866,
+      "learning_rate": 0.003,
+      "loss": 3.9782,
+      "step": 12157
+    },
+    {
+      "epoch": 0.12158,
+      "grad_norm": 1.1375261545181274,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 12158
+    },
+    {
+      "epoch": 0.12159,
+      "grad_norm": 0.8238728046417236,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 12159
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.7417867183685303,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 12160
+    },
+    {
+      "epoch": 0.12161,
+      "grad_norm": 0.8725582361221313,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 12161
+    },
+    {
+      "epoch": 0.12162,
+      "grad_norm": 0.9587368965148926,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 12162
+    },
+    {
+      "epoch": 0.12163,
+      "grad_norm": 0.9570406079292297,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 12163
+    },
+    {
+      "epoch": 0.12164,
+      "grad_norm": 0.7927216291427612,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 12164
+    },
+    {
+      "epoch": 0.12165,
+      "grad_norm": 0.8281391859054565,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 12165
+    },
+    {
+      "epoch": 0.12166,
+      "grad_norm": 0.8732867240905762,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 12166
+    },
+    {
+      "epoch": 0.12167,
+      "grad_norm": 1.0064791440963745,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 12167
+    },
+    {
+      "epoch": 0.12168,
+      "grad_norm": 1.2000722885131836,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 12168
+    },
+    {
+      "epoch": 0.12169,
+      "grad_norm": 0.7036688327789307,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 12169
+    },
+    {
+      "epoch": 0.1217,
+      "grad_norm": 0.6756927371025085,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 12170
+    },
+    {
+      "epoch": 0.12171,
+      "grad_norm": 0.6766944527626038,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 12171
+    },
+    {
+      "epoch": 0.12172,
+      "grad_norm": 0.6412097215652466,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 12172
+    },
+    {
+      "epoch": 0.12173,
+      "grad_norm": 0.6343686580657959,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 12173
+    },
+    {
+      "epoch": 0.12174,
+      "grad_norm": 0.728533923625946,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 12174
+    },
+    {
+      "epoch": 0.12175,
+      "grad_norm": 0.7192274928092957,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 12175
+    },
+    {
+      "epoch": 0.12176,
+      "grad_norm": 0.8144643902778625,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 12176
+    },
+    {
+      "epoch": 0.12177,
+      "grad_norm": 0.9071642160415649,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 12177
+    },
+    {
+      "epoch": 0.12178,
+      "grad_norm": 1.0799925327301025,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 12178
+    },
+    {
+      "epoch": 0.12179,
+      "grad_norm": 1.0543906688690186,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 12179
+    },
+    {
+      "epoch": 0.1218,
+      "grad_norm": 0.8628104329109192,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 12180
+    },
+    {
+      "epoch": 0.12181,
+      "grad_norm": 0.6865615248680115,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 12181
+    },
+    {
+      "epoch": 0.12182,
+      "grad_norm": 0.7942445278167725,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 12182
+    },
+    {
+      "epoch": 0.12183,
+      "grad_norm": 0.9711741805076599,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 12183
+    },
+    {
+      "epoch": 0.12184,
+      "grad_norm": 1.001904010772705,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 12184
+    },
+    {
+      "epoch": 0.12185,
+      "grad_norm": 0.8621499538421631,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 12185
+    },
+    {
+      "epoch": 0.12186,
+      "grad_norm": 0.7740522027015686,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 12186
+    },
+    {
+      "epoch": 0.12187,
+      "grad_norm": 0.7193096876144409,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 12187
+    },
+    {
+      "epoch": 0.12188,
+      "grad_norm": 0.7161280512809753,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 12188
+    },
+    {
+      "epoch": 0.12189,
+      "grad_norm": 0.6895385384559631,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 12189
+    },
+    {
+      "epoch": 0.1219,
+      "grad_norm": 0.6350817680358887,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 12190
+    },
+    {
+      "epoch": 0.12191,
+      "grad_norm": 0.5773914456367493,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 12191
+    },
+    {
+      "epoch": 0.12192,
+      "grad_norm": 0.6090174317359924,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 12192
+    },
+    {
+      "epoch": 0.12193,
+      "grad_norm": 0.662630021572113,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 12193
+    },
+    {
+      "epoch": 0.12194,
+      "grad_norm": 0.6350228786468506,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 12194
+    },
+    {
+      "epoch": 0.12195,
+      "grad_norm": 0.6361150741577148,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 12195
+    },
+    {
+      "epoch": 0.12196,
+      "grad_norm": 0.799187421798706,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 12196
+    },
+    {
+      "epoch": 0.12197,
+      "grad_norm": 0.8982497453689575,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 12197
+    },
+    {
+      "epoch": 0.12198,
+      "grad_norm": 1.0497699975967407,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 12198
+    },
+    {
+      "epoch": 0.12199,
+      "grad_norm": 1.0019128322601318,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 12199
+    },
+    {
+      "epoch": 0.122,
+      "grad_norm": 0.9094564914703369,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 12200
+    },
+    {
+      "epoch": 0.12201,
+      "grad_norm": 0.9067058563232422,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 12201
+    },
+    {
+      "epoch": 0.12202,
+      "grad_norm": 0.9713603258132935,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 12202
+    },
+    {
+      "epoch": 0.12203,
+      "grad_norm": 0.9190332889556885,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 12203
+    },
+    {
+      "epoch": 0.12204,
+      "grad_norm": 0.9638498425483704,
+      "learning_rate": 0.003,
+      "loss": 4.049,
+      "step": 12204
+    },
+    {
+      "epoch": 0.12205,
+      "grad_norm": 1.0316067934036255,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 12205
+    },
+    {
+      "epoch": 0.12206,
+      "grad_norm": 1.0103806257247925,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 12206
+    },
+    {
+      "epoch": 0.12207,
+      "grad_norm": 0.9632631540298462,
+      "learning_rate": 0.003,
+      "loss": 4.0809,
+      "step": 12207
+    },
+    {
+      "epoch": 0.12208,
+      "grad_norm": 0.9950851798057556,
+      "learning_rate": 0.003,
+      "loss": 4.0565,
+      "step": 12208
+    },
+    {
+      "epoch": 0.12209,
+      "grad_norm": 0.9666954874992371,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 12209
+    },
+    {
+      "epoch": 0.1221,
+      "grad_norm": 0.8264901041984558,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 12210
+    },
+    {
+      "epoch": 0.12211,
+      "grad_norm": 0.9638129472732544,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 12211
+    },
+    {
+      "epoch": 0.12212,
+      "grad_norm": 1.0826435089111328,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 12212
+    },
+    {
+      "epoch": 0.12213,
+      "grad_norm": 0.922379195690155,
+      "learning_rate": 0.003,
+      "loss": 4.0507,
+      "step": 12213
+    },
+    {
+      "epoch": 0.12214,
+      "grad_norm": 1.0430341958999634,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 12214
+    },
+    {
+      "epoch": 0.12215,
+      "grad_norm": 1.1599080562591553,
+      "learning_rate": 0.003,
+      "loss": 4.0682,
+      "step": 12215
+    },
+    {
+      "epoch": 0.12216,
+      "grad_norm": 0.8667312264442444,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 12216
+    },
+    {
+      "epoch": 0.12217,
+      "grad_norm": 0.7546870708465576,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 12217
+    },
+    {
+      "epoch": 0.12218,
+      "grad_norm": 0.7160872220993042,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 12218
+    },
+    {
+      "epoch": 0.12219,
+      "grad_norm": 0.6566637754440308,
+      "learning_rate": 0.003,
+      "loss": 4.0527,
+      "step": 12219
+    },
+    {
+      "epoch": 0.1222,
+      "grad_norm": 0.621151864528656,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 12220
+    },
+    {
+      "epoch": 0.12221,
+      "grad_norm": 0.6620752811431885,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 12221
+    },
+    {
+      "epoch": 0.12222,
+      "grad_norm": 0.6338918805122375,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 12222
+    },
+    {
+      "epoch": 0.12223,
+      "grad_norm": 0.5584115982055664,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 12223
+    },
+    {
+      "epoch": 0.12224,
+      "grad_norm": 0.6339457035064697,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 12224
+    },
+    {
+      "epoch": 0.12225,
+      "grad_norm": 0.706930935382843,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 12225
+    },
+    {
+      "epoch": 0.12226,
+      "grad_norm": 0.7364891767501831,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 12226
+    },
+    {
+      "epoch": 0.12227,
+      "grad_norm": 0.8302417397499084,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 12227
+    },
+    {
+      "epoch": 0.12228,
+      "grad_norm": 0.9262897968292236,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 12228
+    },
+    {
+      "epoch": 0.12229,
+      "grad_norm": 0.8998088240623474,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 12229
+    },
+    {
+      "epoch": 0.1223,
+      "grad_norm": 0.9295552968978882,
+      "learning_rate": 0.003,
+      "loss": 4.0596,
+      "step": 12230
+    },
+    {
+      "epoch": 0.12231,
+      "grad_norm": 1.1946221590042114,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 12231
+    },
+    {
+      "epoch": 0.12232,
+      "grad_norm": 1.2436331510543823,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 12232
+    },
+    {
+      "epoch": 0.12233,
+      "grad_norm": 0.6863802075386047,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 12233
+    },
+    {
+      "epoch": 0.12234,
+      "grad_norm": 0.6205759644508362,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 12234
+    },
+    {
+      "epoch": 0.12235,
+      "grad_norm": 0.7436956167221069,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 12235
+    },
+    {
+      "epoch": 0.12236,
+      "grad_norm": 0.698631227016449,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 12236
+    },
+    {
+      "epoch": 0.12237,
+      "grad_norm": 0.7856922745704651,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 12237
+    },
+    {
+      "epoch": 0.12238,
+      "grad_norm": 0.7397446632385254,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 12238
+    },
+    {
+      "epoch": 0.12239,
+      "grad_norm": 0.6234411597251892,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 12239
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.6826146245002747,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 12240
+    },
+    {
+      "epoch": 0.12241,
+      "grad_norm": 0.8209196329116821,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 12241
+    },
+    {
+      "epoch": 0.12242,
+      "grad_norm": 0.8299884796142578,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 12242
+    },
+    {
+      "epoch": 0.12243,
+      "grad_norm": 0.8798038959503174,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 12243
+    },
+    {
+      "epoch": 0.12244,
+      "grad_norm": 0.8742157220840454,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 12244
+    },
+    {
+      "epoch": 0.12245,
+      "grad_norm": 0.8342897891998291,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 12245
+    },
+    {
+      "epoch": 0.12246,
+      "grad_norm": 0.8192770481109619,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 12246
+    },
+    {
+      "epoch": 0.12247,
+      "grad_norm": 0.7069940567016602,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 12247
+    },
+    {
+      "epoch": 0.12248,
+      "grad_norm": 0.6147398948669434,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 12248
+    },
+    {
+      "epoch": 0.12249,
+      "grad_norm": 0.5641793012619019,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 12249
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.5511481165885925,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 12250
+    },
+    {
+      "epoch": 0.12251,
+      "grad_norm": 0.5728287100791931,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 12251
+    },
+    {
+      "epoch": 0.12252,
+      "grad_norm": 0.6058318614959717,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 12252
+    },
+    {
+      "epoch": 0.12253,
+      "grad_norm": 0.7094419002532959,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 12253
+    },
+    {
+      "epoch": 0.12254,
+      "grad_norm": 0.7600876092910767,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 12254
+    },
+    {
+      "epoch": 0.12255,
+      "grad_norm": 0.8422216773033142,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 12255
+    },
+    {
+      "epoch": 0.12256,
+      "grad_norm": 1.0142340660095215,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 12256
+    },
+    {
+      "epoch": 0.12257,
+      "grad_norm": 1.1331077814102173,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 12257
+    },
+    {
+      "epoch": 0.12258,
+      "grad_norm": 0.7469304800033569,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 12258
+    },
+    {
+      "epoch": 0.12259,
+      "grad_norm": 0.706351637840271,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 12259
+    },
+    {
+      "epoch": 0.1226,
+      "grad_norm": 0.6740030646324158,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 12260
+    },
+    {
+      "epoch": 0.12261,
+      "grad_norm": 0.6421547532081604,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 12261
+    },
+    {
+      "epoch": 0.12262,
+      "grad_norm": 0.6817286014556885,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 12262
+    },
+    {
+      "epoch": 0.12263,
+      "grad_norm": 0.7440471649169922,
+      "learning_rate": 0.003,
+      "loss": 3.9885,
+      "step": 12263
+    },
+    {
+      "epoch": 0.12264,
+      "grad_norm": 0.6655385494232178,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 12264
+    },
+    {
+      "epoch": 0.12265,
+      "grad_norm": 0.6313167214393616,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 12265
+    },
+    {
+      "epoch": 0.12266,
+      "grad_norm": 0.806358814239502,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 12266
+    },
+    {
+      "epoch": 0.12267,
+      "grad_norm": 1.0218111276626587,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 12267
+    },
+    {
+      "epoch": 0.12268,
+      "grad_norm": 1.006636381149292,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 12268
+    },
+    {
+      "epoch": 0.12269,
+      "grad_norm": 0.9273249506950378,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 12269
+    },
+    {
+      "epoch": 0.1227,
+      "grad_norm": 0.9001882076263428,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 12270
+    },
+    {
+      "epoch": 0.12271,
+      "grad_norm": 0.8164517283439636,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 12271
+    },
+    {
+      "epoch": 0.12272,
+      "grad_norm": 0.9012036919593811,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 12272
+    },
+    {
+      "epoch": 0.12273,
+      "grad_norm": 1.025341272354126,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 12273
+    },
+    {
+      "epoch": 0.12274,
+      "grad_norm": 1.055772066116333,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 12274
+    },
+    {
+      "epoch": 0.12275,
+      "grad_norm": 1.0038049221038818,
+      "learning_rate": 0.003,
+      "loss": 4.0489,
+      "step": 12275
+    },
+    {
+      "epoch": 0.12276,
+      "grad_norm": 0.9755121469497681,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 12276
+    },
+    {
+      "epoch": 0.12277,
+      "grad_norm": 0.9044497013092041,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 12277
+    },
+    {
+      "epoch": 0.12278,
+      "grad_norm": 0.8723435997962952,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 12278
+    },
+    {
+      "epoch": 0.12279,
+      "grad_norm": 0.7940369248390198,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 12279
+    },
+    {
+      "epoch": 0.1228,
+      "grad_norm": 0.8326817750930786,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 12280
+    },
+    {
+      "epoch": 0.12281,
+      "grad_norm": 0.8071943521499634,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 12281
+    },
+    {
+      "epoch": 0.12282,
+      "grad_norm": 0.7996351718902588,
+      "learning_rate": 0.003,
+      "loss": 4.0536,
+      "step": 12282
+    },
+    {
+      "epoch": 0.12283,
+      "grad_norm": 0.7421732544898987,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 12283
+    },
+    {
+      "epoch": 0.12284,
+      "grad_norm": 0.7162188291549683,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 12284
+    },
+    {
+      "epoch": 0.12285,
+      "grad_norm": 0.6978950500488281,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 12285
+    },
+    {
+      "epoch": 0.12286,
+      "grad_norm": 0.6271545886993408,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 12286
+    },
+    {
+      "epoch": 0.12287,
+      "grad_norm": 0.6546214818954468,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 12287
+    },
+    {
+      "epoch": 0.12288,
+      "grad_norm": 0.6622276306152344,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 12288
+    },
+    {
+      "epoch": 0.12289,
+      "grad_norm": 0.7192988991737366,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 12289
+    },
+    {
+      "epoch": 0.1229,
+      "grad_norm": 0.8424289226531982,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 12290
+    },
+    {
+      "epoch": 0.12291,
+      "grad_norm": 1.0219829082489014,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 12291
+    },
+    {
+      "epoch": 0.12292,
+      "grad_norm": 1.1891382932662964,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 12292
+    },
+    {
+      "epoch": 0.12293,
+      "grad_norm": 0.7496522068977356,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 12293
+    },
+    {
+      "epoch": 0.12294,
+      "grad_norm": 0.6887698173522949,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 12294
+    },
+    {
+      "epoch": 0.12295,
+      "grad_norm": 0.6139264702796936,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 12295
+    },
+    {
+      "epoch": 0.12296,
+      "grad_norm": 0.6319611072540283,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 12296
+    },
+    {
+      "epoch": 0.12297,
+      "grad_norm": 0.8084327578544617,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 12297
+    },
+    {
+      "epoch": 0.12298,
+      "grad_norm": 0.946826159954071,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 12298
+    },
+    {
+      "epoch": 0.12299,
+      "grad_norm": 1.009784460067749,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 12299
+    },
+    {
+      "epoch": 0.123,
+      "grad_norm": 1.0070241689682007,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 12300
+    },
+    {
+      "epoch": 0.12301,
+      "grad_norm": 1.0530861616134644,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 12301
+    },
+    {
+      "epoch": 0.12302,
+      "grad_norm": 0.8976438045501709,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 12302
+    },
+    {
+      "epoch": 0.12303,
+      "grad_norm": 0.8210477828979492,
+      "learning_rate": 0.003,
+      "loss": 4.0892,
+      "step": 12303
+    },
+    {
+      "epoch": 0.12304,
+      "grad_norm": 0.8757627606391907,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 12304
+    },
+    {
+      "epoch": 0.12305,
+      "grad_norm": 0.8765901327133179,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 12305
+    },
+    {
+      "epoch": 0.12306,
+      "grad_norm": 0.8396662473678589,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 12306
+    },
+    {
+      "epoch": 0.12307,
+      "grad_norm": 0.6701096892356873,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 12307
+    },
+    {
+      "epoch": 0.12308,
+      "grad_norm": 0.6225115060806274,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 12308
+    },
+    {
+      "epoch": 0.12309,
+      "grad_norm": 0.5880732536315918,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 12309
+    },
+    {
+      "epoch": 0.1231,
+      "grad_norm": 0.5994876027107239,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 12310
+    },
+    {
+      "epoch": 0.12311,
+      "grad_norm": 0.5422573685646057,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 12311
+    },
+    {
+      "epoch": 0.12312,
+      "grad_norm": 0.5807183384895325,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 12312
+    },
+    {
+      "epoch": 0.12313,
+      "grad_norm": 0.5875729322433472,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 12313
+    },
+    {
+      "epoch": 0.12314,
+      "grad_norm": 0.590406596660614,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 12314
+    },
+    {
+      "epoch": 0.12315,
+      "grad_norm": 0.663951575756073,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 12315
+    },
+    {
+      "epoch": 0.12316,
+      "grad_norm": 0.7856951951980591,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 12316
+    },
+    {
+      "epoch": 0.12317,
+      "grad_norm": 0.9424818754196167,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 12317
+    },
+    {
+      "epoch": 0.12318,
+      "grad_norm": 1.0489387512207031,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 12318
+    },
+    {
+      "epoch": 0.12319,
+      "grad_norm": 0.9710455536842346,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 12319
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.9531890153884888,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 12320
+    },
+    {
+      "epoch": 0.12321,
+      "grad_norm": 0.8863466382026672,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 12321
+    },
+    {
+      "epoch": 0.12322,
+      "grad_norm": 0.8031570911407471,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 12322
+    },
+    {
+      "epoch": 0.12323,
+      "grad_norm": 0.8672590255737305,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12323
+    },
+    {
+      "epoch": 0.12324,
+      "grad_norm": 0.8071771264076233,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 12324
+    },
+    {
+      "epoch": 0.12325,
+      "grad_norm": 0.6649886965751648,
+      "learning_rate": 0.003,
+      "loss": 3.9818,
+      "step": 12325
+    },
+    {
+      "epoch": 0.12326,
+      "grad_norm": 0.6209746599197388,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 12326
+    },
+    {
+      "epoch": 0.12327,
+      "grad_norm": 0.7036540508270264,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 12327
+    },
+    {
+      "epoch": 0.12328,
+      "grad_norm": 1.0158931016921997,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 12328
+    },
+    {
+      "epoch": 0.12329,
+      "grad_norm": 1.3318747282028198,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 12329
+    },
+    {
+      "epoch": 0.1233,
+      "grad_norm": 0.6423041224479675,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 12330
+    },
+    {
+      "epoch": 0.12331,
+      "grad_norm": 0.6106191873550415,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 12331
+    },
+    {
+      "epoch": 0.12332,
+      "grad_norm": 0.7630592584609985,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 12332
+    },
+    {
+      "epoch": 0.12333,
+      "grad_norm": 0.7819821834564209,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 12333
+    },
+    {
+      "epoch": 0.12334,
+      "grad_norm": 0.7031112313270569,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 12334
+    },
+    {
+      "epoch": 0.12335,
+      "grad_norm": 0.7386513948440552,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 12335
+    },
+    {
+      "epoch": 0.12336,
+      "grad_norm": 0.7526112794876099,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 12336
+    },
+    {
+      "epoch": 0.12337,
+      "grad_norm": 0.7977548837661743,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 12337
+    },
+    {
+      "epoch": 0.12338,
+      "grad_norm": 0.8067567944526672,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 12338
+    },
+    {
+      "epoch": 0.12339,
+      "grad_norm": 0.8627603650093079,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 12339
+    },
+    {
+      "epoch": 0.1234,
+      "grad_norm": 0.9207614660263062,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 12340
+    },
+    {
+      "epoch": 0.12341,
+      "grad_norm": 0.74089115858078,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 12341
+    },
+    {
+      "epoch": 0.12342,
+      "grad_norm": 0.6935558319091797,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 12342
+    },
+    {
+      "epoch": 0.12343,
+      "grad_norm": 0.7786235809326172,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 12343
+    },
+    {
+      "epoch": 0.12344,
+      "grad_norm": 0.9137173891067505,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 12344
+    },
+    {
+      "epoch": 0.12345,
+      "grad_norm": 0.8964434266090393,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 12345
+    },
+    {
+      "epoch": 0.12346,
+      "grad_norm": 0.9500121474266052,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 12346
+    },
+    {
+      "epoch": 0.12347,
+      "grad_norm": 0.8546897172927856,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 12347
+    },
+    {
+      "epoch": 0.12348,
+      "grad_norm": 0.7947580218315125,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 12348
+    },
+    {
+      "epoch": 0.12349,
+      "grad_norm": 0.8268088698387146,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 12349
+    },
+    {
+      "epoch": 0.1235,
+      "grad_norm": 0.7504370808601379,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 12350
+    },
+    {
+      "epoch": 0.12351,
+      "grad_norm": 0.7369322180747986,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 12351
+    },
+    {
+      "epoch": 0.12352,
+      "grad_norm": 0.7135762572288513,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 12352
+    },
+    {
+      "epoch": 0.12353,
+      "grad_norm": 0.7480922341346741,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 12353
+    },
+    {
+      "epoch": 0.12354,
+      "grad_norm": 0.7950556874275208,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 12354
+    },
+    {
+      "epoch": 0.12355,
+      "grad_norm": 0.8724504709243774,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 12355
+    },
+    {
+      "epoch": 0.12356,
+      "grad_norm": 1.0912940502166748,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 12356
+    },
+    {
+      "epoch": 0.12357,
+      "grad_norm": 0.9967378973960876,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 12357
+    },
+    {
+      "epoch": 0.12358,
+      "grad_norm": 0.8651845455169678,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 12358
+    },
+    {
+      "epoch": 0.12359,
+      "grad_norm": 0.7292317152023315,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 12359
+    },
+    {
+      "epoch": 0.1236,
+      "grad_norm": 0.7191388010978699,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 12360
+    },
+    {
+      "epoch": 0.12361,
+      "grad_norm": 0.645366370677948,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 12361
+    },
+    {
+      "epoch": 0.12362,
+      "grad_norm": 0.6226277947425842,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 12362
+    },
+    {
+      "epoch": 0.12363,
+      "grad_norm": 0.5864716172218323,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 12363
+    },
+    {
+      "epoch": 0.12364,
+      "grad_norm": 0.5753576159477234,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 12364
+    },
+    {
+      "epoch": 0.12365,
+      "grad_norm": 0.6819441914558411,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 12365
+    },
+    {
+      "epoch": 0.12366,
+      "grad_norm": 0.7048933506011963,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 12366
+    },
+    {
+      "epoch": 0.12367,
+      "grad_norm": 0.7060320377349854,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 12367
+    },
+    {
+      "epoch": 0.12368,
+      "grad_norm": 0.6548412442207336,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 12368
+    },
+    {
+      "epoch": 0.12369,
+      "grad_norm": 0.6585349440574646,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 12369
+    },
+    {
+      "epoch": 0.1237,
+      "grad_norm": 0.751133143901825,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 12370
+    },
+    {
+      "epoch": 0.12371,
+      "grad_norm": 0.8336089849472046,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 12371
+    },
+    {
+      "epoch": 0.12372,
+      "grad_norm": 1.0661908388137817,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 12372
+    },
+    {
+      "epoch": 0.12373,
+      "grad_norm": 1.1019768714904785,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 12373
+    },
+    {
+      "epoch": 0.12374,
+      "grad_norm": 0.7196155190467834,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 12374
+    },
+    {
+      "epoch": 0.12375,
+      "grad_norm": 0.737911581993103,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 12375
+    },
+    {
+      "epoch": 0.12376,
+      "grad_norm": 0.854030191898346,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 12376
+    },
+    {
+      "epoch": 0.12377,
+      "grad_norm": 0.8438823819160461,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 12377
+    },
+    {
+      "epoch": 0.12378,
+      "grad_norm": 0.9753403067588806,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 12378
+    },
+    {
+      "epoch": 0.12379,
+      "grad_norm": 1.1652812957763672,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 12379
+    },
+    {
+      "epoch": 0.1238,
+      "grad_norm": 0.9180656671524048,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 12380
+    },
+    {
+      "epoch": 0.12381,
+      "grad_norm": 0.8215830326080322,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 12381
+    },
+    {
+      "epoch": 0.12382,
+      "grad_norm": 0.8445789813995361,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12382
+    },
+    {
+      "epoch": 0.12383,
+      "grad_norm": 0.8105951547622681,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 12383
+    },
+    {
+      "epoch": 0.12384,
+      "grad_norm": 0.8429214954376221,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 12384
+    },
+    {
+      "epoch": 0.12385,
+      "grad_norm": 1.0361195802688599,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 12385
+    },
+    {
+      "epoch": 0.12386,
+      "grad_norm": 1.1223865747451782,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 12386
+    },
+    {
+      "epoch": 0.12387,
+      "grad_norm": 0.7196579575538635,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 12387
+    },
+    {
+      "epoch": 0.12388,
+      "grad_norm": 0.6705121994018555,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 12388
+    },
+    {
+      "epoch": 0.12389,
+      "grad_norm": 0.9487857818603516,
+      "learning_rate": 0.003,
+      "loss": 4.0479,
+      "step": 12389
+    },
+    {
+      "epoch": 0.1239,
+      "grad_norm": 1.1736794710159302,
+      "learning_rate": 0.003,
+      "loss": 4.0698,
+      "step": 12390
+    },
+    {
+      "epoch": 0.12391,
+      "grad_norm": 0.8355658054351807,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 12391
+    },
+    {
+      "epoch": 0.12392,
+      "grad_norm": 0.7802602052688599,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 12392
+    },
+    {
+      "epoch": 0.12393,
+      "grad_norm": 0.8559476137161255,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 12393
+    },
+    {
+      "epoch": 0.12394,
+      "grad_norm": 0.8642109632492065,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 12394
+    },
+    {
+      "epoch": 0.12395,
+      "grad_norm": 0.8027004599571228,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 12395
+    },
+    {
+      "epoch": 0.12396,
+      "grad_norm": 0.6828938126564026,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 12396
+    },
+    {
+      "epoch": 0.12397,
+      "grad_norm": 0.7188079357147217,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 12397
+    },
+    {
+      "epoch": 0.12398,
+      "grad_norm": 0.745271623134613,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 12398
+    },
+    {
+      "epoch": 0.12399,
+      "grad_norm": 0.8426857590675354,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 12399
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.8381713032722473,
+      "learning_rate": 0.003,
+      "loss": 4.0697,
+      "step": 12400
+    },
+    {
+      "epoch": 0.12401,
+      "grad_norm": 0.712110698223114,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 12401
+    },
+    {
+      "epoch": 0.12402,
+      "grad_norm": 0.6172171831130981,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 12402
+    },
+    {
+      "epoch": 0.12403,
+      "grad_norm": 0.633280336856842,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 12403
+    },
+    {
+      "epoch": 0.12404,
+      "grad_norm": 0.5696246027946472,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 12404
+    },
+    {
+      "epoch": 0.12405,
+      "grad_norm": 0.6653860211372375,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 12405
+    },
+    {
+      "epoch": 0.12406,
+      "grad_norm": 0.6514864563941956,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 12406
+    },
+    {
+      "epoch": 0.12407,
+      "grad_norm": 0.7046161890029907,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 12407
+    },
+    {
+      "epoch": 0.12408,
+      "grad_norm": 0.8537508845329285,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 12408
+    },
+    {
+      "epoch": 0.12409,
+      "grad_norm": 0.8411065340042114,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 12409
+    },
+    {
+      "epoch": 0.1241,
+      "grad_norm": 1.0898795127868652,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 12410
+    },
+    {
+      "epoch": 0.12411,
+      "grad_norm": 1.1494314670562744,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 12411
+    },
+    {
+      "epoch": 0.12412,
+      "grad_norm": 0.8649919629096985,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 12412
+    },
+    {
+      "epoch": 0.12413,
+      "grad_norm": 0.6732476949691772,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 12413
+    },
+    {
+      "epoch": 0.12414,
+      "grad_norm": 0.6648367643356323,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 12414
+    },
+    {
+      "epoch": 0.12415,
+      "grad_norm": 0.690162181854248,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 12415
+    },
+    {
+      "epoch": 0.12416,
+      "grad_norm": 0.7361937165260315,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 12416
+    },
+    {
+      "epoch": 0.12417,
+      "grad_norm": 0.7241788506507874,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 12417
+    },
+    {
+      "epoch": 0.12418,
+      "grad_norm": 0.8302240371704102,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 12418
+    },
+    {
+      "epoch": 0.12419,
+      "grad_norm": 0.7926172018051147,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 12419
+    },
+    {
+      "epoch": 0.1242,
+      "grad_norm": 0.6780438423156738,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 12420
+    },
+    {
+      "epoch": 0.12421,
+      "grad_norm": 0.6866170167922974,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 12421
+    },
+    {
+      "epoch": 0.12422,
+      "grad_norm": 0.6301003098487854,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 12422
+    },
+    {
+      "epoch": 0.12423,
+      "grad_norm": 0.5220256447792053,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 12423
+    },
+    {
+      "epoch": 0.12424,
+      "grad_norm": 0.5333865284919739,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 12424
+    },
+    {
+      "epoch": 0.12425,
+      "grad_norm": 0.5312225818634033,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 12425
+    },
+    {
+      "epoch": 0.12426,
+      "grad_norm": 0.7749300599098206,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 12426
+    },
+    {
+      "epoch": 0.12427,
+      "grad_norm": 1.0293408632278442,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 12427
+    },
+    {
+      "epoch": 0.12428,
+      "grad_norm": 1.2746484279632568,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 12428
+    },
+    {
+      "epoch": 0.12429,
+      "grad_norm": 0.5662277936935425,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 12429
+    },
+    {
+      "epoch": 0.1243,
+      "grad_norm": 0.6462495923042297,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 12430
+    },
+    {
+      "epoch": 0.12431,
+      "grad_norm": 0.7909497618675232,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 12431
+    },
+    {
+      "epoch": 0.12432,
+      "grad_norm": 0.828307032585144,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 12432
+    },
+    {
+      "epoch": 0.12433,
+      "grad_norm": 0.7236911058425903,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 12433
+    },
+    {
+      "epoch": 0.12434,
+      "grad_norm": 0.6816155314445496,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 12434
+    },
+    {
+      "epoch": 0.12435,
+      "grad_norm": 0.6432563662528992,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 12435
+    },
+    {
+      "epoch": 0.12436,
+      "grad_norm": 0.6814159154891968,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 12436
+    },
+    {
+      "epoch": 0.12437,
+      "grad_norm": 0.6854045987129211,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 12437
+    },
+    {
+      "epoch": 0.12438,
+      "grad_norm": 0.7913328409194946,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 12438
+    },
+    {
+      "epoch": 0.12439,
+      "grad_norm": 1.0051794052124023,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 12439
+    },
+    {
+      "epoch": 0.1244,
+      "grad_norm": 1.0895280838012695,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 12440
+    },
+    {
+      "epoch": 0.12441,
+      "grad_norm": 1.0590616464614868,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 12441
+    },
+    {
+      "epoch": 0.12442,
+      "grad_norm": 1.073089361190796,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 12442
+    },
+    {
+      "epoch": 0.12443,
+      "grad_norm": 1.0370190143585205,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 12443
+    },
+    {
+      "epoch": 0.12444,
+      "grad_norm": 0.8426869511604309,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 12444
+    },
+    {
+      "epoch": 0.12445,
+      "grad_norm": 0.8574836850166321,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 12445
+    },
+    {
+      "epoch": 0.12446,
+      "grad_norm": 0.8478577136993408,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 12446
+    },
+    {
+      "epoch": 0.12447,
+      "grad_norm": 1.0601985454559326,
+      "learning_rate": 0.003,
+      "loss": 4.0503,
+      "step": 12447
+    },
+    {
+      "epoch": 0.12448,
+      "grad_norm": 1.0860538482666016,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 12448
+    },
+    {
+      "epoch": 0.12449,
+      "grad_norm": 1.008545994758606,
+      "learning_rate": 0.003,
+      "loss": 4.0635,
+      "step": 12449
+    },
+    {
+      "epoch": 0.1245,
+      "grad_norm": 1.0802119970321655,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 12450
+    },
+    {
+      "epoch": 0.12451,
+      "grad_norm": 0.8374972343444824,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 12451
+    },
+    {
+      "epoch": 0.12452,
+      "grad_norm": 0.9035298228263855,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 12452
+    },
+    {
+      "epoch": 0.12453,
+      "grad_norm": 0.7676633596420288,
+      "learning_rate": 0.003,
+      "loss": 4.0504,
+      "step": 12453
+    },
+    {
+      "epoch": 0.12454,
+      "grad_norm": 0.6823439598083496,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 12454
+    },
+    {
+      "epoch": 0.12455,
+      "grad_norm": 0.6423712968826294,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 12455
+    },
+    {
+      "epoch": 0.12456,
+      "grad_norm": 0.6407604217529297,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 12456
+    },
+    {
+      "epoch": 0.12457,
+      "grad_norm": 0.7209190726280212,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 12457
+    },
+    {
+      "epoch": 0.12458,
+      "grad_norm": 0.7898573875427246,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 12458
+    },
+    {
+      "epoch": 0.12459,
+      "grad_norm": 0.7464539408683777,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 12459
+    },
+    {
+      "epoch": 0.1246,
+      "grad_norm": 0.7210266590118408,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 12460
+    },
+    {
+      "epoch": 0.12461,
+      "grad_norm": 0.6472708582878113,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 12461
+    },
+    {
+      "epoch": 0.12462,
+      "grad_norm": 0.6778544187545776,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 12462
+    },
+    {
+      "epoch": 0.12463,
+      "grad_norm": 0.7389536499977112,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 12463
+    },
+    {
+      "epoch": 0.12464,
+      "grad_norm": 0.9307025074958801,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 12464
+    },
+    {
+      "epoch": 0.12465,
+      "grad_norm": 1.2131528854370117,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 12465
+    },
+    {
+      "epoch": 0.12466,
+      "grad_norm": 0.8831672072410583,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 12466
+    },
+    {
+      "epoch": 0.12467,
+      "grad_norm": 0.7936322093009949,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 12467
+    },
+    {
+      "epoch": 0.12468,
+      "grad_norm": 0.7008469104766846,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 12468
+    },
+    {
+      "epoch": 0.12469,
+      "grad_norm": 0.6694667339324951,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 12469
+    },
+    {
+      "epoch": 0.1247,
+      "grad_norm": 0.581159770488739,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 12470
+    },
+    {
+      "epoch": 0.12471,
+      "grad_norm": 0.5600728392601013,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 12471
+    },
+    {
+      "epoch": 0.12472,
+      "grad_norm": 0.5877519845962524,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 12472
+    },
+    {
+      "epoch": 0.12473,
+      "grad_norm": 0.595278263092041,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 12473
+    },
+    {
+      "epoch": 0.12474,
+      "grad_norm": 0.5744599103927612,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 12474
+    },
+    {
+      "epoch": 0.12475,
+      "grad_norm": 0.6423681378364563,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 12475
+    },
+    {
+      "epoch": 0.12476,
+      "grad_norm": 0.8026760220527649,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 12476
+    },
+    {
+      "epoch": 0.12477,
+      "grad_norm": 1.138680338859558,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 12477
+    },
+    {
+      "epoch": 0.12478,
+      "grad_norm": 1.0252193212509155,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 12478
+    },
+    {
+      "epoch": 0.12479,
+      "grad_norm": 0.7640153765678406,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 12479
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.6829314231872559,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 12480
+    },
+    {
+      "epoch": 0.12481,
+      "grad_norm": 0.7560384273529053,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 12481
+    },
+    {
+      "epoch": 0.12482,
+      "grad_norm": 0.9146919846534729,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 12482
+    },
+    {
+      "epoch": 0.12483,
+      "grad_norm": 1.131492257118225,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 12483
+    },
+    {
+      "epoch": 0.12484,
+      "grad_norm": 0.9777736067771912,
+      "learning_rate": 0.003,
+      "loss": 3.9781,
+      "step": 12484
+    },
+    {
+      "epoch": 0.12485,
+      "grad_norm": 0.8896247744560242,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 12485
+    },
+    {
+      "epoch": 0.12486,
+      "grad_norm": 0.7884039282798767,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 12486
+    },
+    {
+      "epoch": 0.12487,
+      "grad_norm": 0.74324631690979,
+      "learning_rate": 0.003,
+      "loss": 3.9803,
+      "step": 12487
+    },
+    {
+      "epoch": 0.12488,
+      "grad_norm": 0.7056796550750732,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12488
+    },
+    {
+      "epoch": 0.12489,
+      "grad_norm": 0.6776674389839172,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 12489
+    },
+    {
+      "epoch": 0.1249,
+      "grad_norm": 0.7072422504425049,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 12490
+    },
+    {
+      "epoch": 0.12491,
+      "grad_norm": 0.7262662649154663,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 12491
+    },
+    {
+      "epoch": 0.12492,
+      "grad_norm": 0.8424429297447205,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 12492
+    },
+    {
+      "epoch": 0.12493,
+      "grad_norm": 0.9678304195404053,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 12493
+    },
+    {
+      "epoch": 0.12494,
+      "grad_norm": 0.8881115317344666,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 12494
+    },
+    {
+      "epoch": 0.12495,
+      "grad_norm": 0.8806186318397522,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 12495
+    },
+    {
+      "epoch": 0.12496,
+      "grad_norm": 0.8792669773101807,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 12496
+    },
+    {
+      "epoch": 0.12497,
+      "grad_norm": 0.9512883424758911,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 12497
+    },
+    {
+      "epoch": 0.12498,
+      "grad_norm": 0.9773924946784973,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 12498
+    },
+    {
+      "epoch": 0.12499,
+      "grad_norm": 0.9709060192108154,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 12499
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 1.2080605030059814,
+      "learning_rate": 0.003,
+      "loss": 4.079,
+      "step": 12500
+    },
+    {
+      "epoch": 0.12501,
+      "grad_norm": 0.8987671136856079,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 12501
+    },
+    {
+      "epoch": 0.12502,
+      "grad_norm": 0.8190386295318604,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 12502
+    },
+    {
+      "epoch": 0.12503,
+      "grad_norm": 0.9141505360603333,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 12503
+    },
+    {
+      "epoch": 0.12504,
+      "grad_norm": 0.8598799705505371,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 12504
+    },
+    {
+      "epoch": 0.12505,
+      "grad_norm": 0.9165339469909668,
+      "learning_rate": 0.003,
+      "loss": 4.054,
+      "step": 12505
+    },
+    {
+      "epoch": 0.12506,
+      "grad_norm": 1.0256816148757935,
+      "learning_rate": 0.003,
+      "loss": 4.0433,
+      "step": 12506
+    },
+    {
+      "epoch": 0.12507,
+      "grad_norm": 1.0075628757476807,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 12507
+    },
+    {
+      "epoch": 0.12508,
+      "grad_norm": 0.8303059935569763,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 12508
+    },
+    {
+      "epoch": 0.12509,
+      "grad_norm": 0.9752023220062256,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 12509
+    },
+    {
+      "epoch": 0.1251,
+      "grad_norm": 0.9722163677215576,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 12510
+    },
+    {
+      "epoch": 0.12511,
+      "grad_norm": 1.1083474159240723,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 12511
+    },
+    {
+      "epoch": 0.12512,
+      "grad_norm": 1.0361601114273071,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 12512
+    },
+    {
+      "epoch": 0.12513,
+      "grad_norm": 1.0454269647598267,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 12513
+    },
+    {
+      "epoch": 0.12514,
+      "grad_norm": 0.9313620924949646,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 12514
+    },
+    {
+      "epoch": 0.12515,
+      "grad_norm": 0.9025012850761414,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 12515
+    },
+    {
+      "epoch": 0.12516,
+      "grad_norm": 0.8324545621871948,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 12516
+    },
+    {
+      "epoch": 0.12517,
+      "grad_norm": 0.729629397392273,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 12517
+    },
+    {
+      "epoch": 0.12518,
+      "grad_norm": 0.7196733355522156,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 12518
+    },
+    {
+      "epoch": 0.12519,
+      "grad_norm": 0.7338614463806152,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 12519
+    },
+    {
+      "epoch": 0.1252,
+      "grad_norm": 0.7173320055007935,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 12520
+    },
+    {
+      "epoch": 0.12521,
+      "grad_norm": 0.660810649394989,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 12521
+    },
+    {
+      "epoch": 0.12522,
+      "grad_norm": 0.6969593167304993,
+      "learning_rate": 0.003,
+      "loss": 4.0608,
+      "step": 12522
+    },
+    {
+      "epoch": 0.12523,
+      "grad_norm": 0.603926956653595,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 12523
+    },
+    {
+      "epoch": 0.12524,
+      "grad_norm": 0.5833172798156738,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 12524
+    },
+    {
+      "epoch": 0.12525,
+      "grad_norm": 0.7020481824874878,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 12525
+    },
+    {
+      "epoch": 0.12526,
+      "grad_norm": 0.8667402863502502,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 12526
+    },
+    {
+      "epoch": 0.12527,
+      "grad_norm": 0.9506835341453552,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 12527
+    },
+    {
+      "epoch": 0.12528,
+      "grad_norm": 0.9923872351646423,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 12528
+    },
+    {
+      "epoch": 0.12529,
+      "grad_norm": 0.7969499230384827,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 12529
+    },
+    {
+      "epoch": 0.1253,
+      "grad_norm": 0.6928942799568176,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 12530
+    },
+    {
+      "epoch": 0.12531,
+      "grad_norm": 0.6717604994773865,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 12531
+    },
+    {
+      "epoch": 0.12532,
+      "grad_norm": 0.6549240350723267,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 12532
+    },
+    {
+      "epoch": 0.12533,
+      "grad_norm": 0.6569952964782715,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 12533
+    },
+    {
+      "epoch": 0.12534,
+      "grad_norm": 0.6018800735473633,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 12534
+    },
+    {
+      "epoch": 0.12535,
+      "grad_norm": 0.7109904289245605,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 12535
+    },
+    {
+      "epoch": 0.12536,
+      "grad_norm": 0.8113789558410645,
+      "learning_rate": 0.003,
+      "loss": 3.9621,
+      "step": 12536
+    },
+    {
+      "epoch": 0.12537,
+      "grad_norm": 0.9771004319190979,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 12537
+    },
+    {
+      "epoch": 0.12538,
+      "grad_norm": 1.0944777727127075,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 12538
+    },
+    {
+      "epoch": 0.12539,
+      "grad_norm": 0.8347179293632507,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 12539
+    },
+    {
+      "epoch": 0.1254,
+      "grad_norm": 0.652307391166687,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 12540
+    },
+    {
+      "epoch": 0.12541,
+      "grad_norm": 0.5324810743331909,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 12541
+    },
+    {
+      "epoch": 0.12542,
+      "grad_norm": 0.5968160033226013,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 12542
+    },
+    {
+      "epoch": 0.12543,
+      "grad_norm": 0.6760523915290833,
+      "learning_rate": 0.003,
+      "loss": 3.9729,
+      "step": 12543
+    },
+    {
+      "epoch": 0.12544,
+      "grad_norm": 0.7631462812423706,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 12544
+    },
+    {
+      "epoch": 0.12545,
+      "grad_norm": 0.829358696937561,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 12545
+    },
+    {
+      "epoch": 0.12546,
+      "grad_norm": 0.8343631029129028,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 12546
+    },
+    {
+      "epoch": 0.12547,
+      "grad_norm": 0.7265992760658264,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 12547
+    },
+    {
+      "epoch": 0.12548,
+      "grad_norm": 0.6316820383071899,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 12548
+    },
+    {
+      "epoch": 0.12549,
+      "grad_norm": 0.7243136167526245,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 12549
+    },
+    {
+      "epoch": 0.1255,
+      "grad_norm": 0.6993957757949829,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 12550
+    },
+    {
+      "epoch": 0.12551,
+      "grad_norm": 0.7077828645706177,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 12551
+    },
+    {
+      "epoch": 0.12552,
+      "grad_norm": 0.8004039525985718,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 12552
+    },
+    {
+      "epoch": 0.12553,
+      "grad_norm": 0.7996693849563599,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 12553
+    },
+    {
+      "epoch": 0.12554,
+      "grad_norm": 0.7634584903717041,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 12554
+    },
+    {
+      "epoch": 0.12555,
+      "grad_norm": 0.7199156880378723,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 12555
+    },
+    {
+      "epoch": 0.12556,
+      "grad_norm": 0.7700698971748352,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 12556
+    },
+    {
+      "epoch": 0.12557,
+      "grad_norm": 0.9102762341499329,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 12557
+    },
+    {
+      "epoch": 0.12558,
+      "grad_norm": 0.9445754885673523,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 12558
+    },
+    {
+      "epoch": 0.12559,
+      "grad_norm": 0.8067466020584106,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 12559
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.9157754778862,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 12560
+    },
+    {
+      "epoch": 0.12561,
+      "grad_norm": 1.014098882675171,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 12561
+    },
+    {
+      "epoch": 0.12562,
+      "grad_norm": 0.9550620913505554,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 12562
+    },
+    {
+      "epoch": 0.12563,
+      "grad_norm": 0.8556691408157349,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 12563
+    },
+    {
+      "epoch": 0.12564,
+      "grad_norm": 0.791690468788147,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 12564
+    },
+    {
+      "epoch": 0.12565,
+      "grad_norm": 0.7778317928314209,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 12565
+    },
+    {
+      "epoch": 0.12566,
+      "grad_norm": 0.7928282618522644,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 12566
+    },
+    {
+      "epoch": 0.12567,
+      "grad_norm": 0.8807454705238342,
+      "learning_rate": 0.003,
+      "loss": 4.0661,
+      "step": 12567
+    },
+    {
+      "epoch": 0.12568,
+      "grad_norm": 1.1370134353637695,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 12568
+    },
+    {
+      "epoch": 0.12569,
+      "grad_norm": 1.082935094833374,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 12569
+    },
+    {
+      "epoch": 0.1257,
+      "grad_norm": 0.9118308424949646,
+      "learning_rate": 0.003,
+      "loss": 4.0729,
+      "step": 12570
+    },
+    {
+      "epoch": 0.12571,
+      "grad_norm": 0.8950235843658447,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 12571
+    },
+    {
+      "epoch": 0.12572,
+      "grad_norm": 0.8408292531967163,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 12572
+    },
+    {
+      "epoch": 0.12573,
+      "grad_norm": 0.7380292415618896,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 12573
+    },
+    {
+      "epoch": 0.12574,
+      "grad_norm": 0.6289229989051819,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 12574
+    },
+    {
+      "epoch": 0.12575,
+      "grad_norm": 0.8124949932098389,
+      "learning_rate": 0.003,
+      "loss": 4.0651,
+      "step": 12575
+    },
+    {
+      "epoch": 0.12576,
+      "grad_norm": 1.0226274728775024,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 12576
+    },
+    {
+      "epoch": 0.12577,
+      "grad_norm": 1.118428349494934,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 12577
+    },
+    {
+      "epoch": 0.12578,
+      "grad_norm": 0.7549068927764893,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12578
+    },
+    {
+      "epoch": 0.12579,
+      "grad_norm": 0.6412855982780457,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 12579
+    },
+    {
+      "epoch": 0.1258,
+      "grad_norm": 0.6058320999145508,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 12580
+    },
+    {
+      "epoch": 0.12581,
+      "grad_norm": 0.7913022041320801,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 12581
+    },
+    {
+      "epoch": 0.12582,
+      "grad_norm": 0.9635642766952515,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 12582
+    },
+    {
+      "epoch": 0.12583,
+      "grad_norm": 0.9899559617042542,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 12583
+    },
+    {
+      "epoch": 0.12584,
+      "grad_norm": 0.816019594669342,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 12584
+    },
+    {
+      "epoch": 0.12585,
+      "grad_norm": 0.7499985098838806,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 12585
+    },
+    {
+      "epoch": 0.12586,
+      "grad_norm": 0.8531391024589539,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 12586
+    },
+    {
+      "epoch": 0.12587,
+      "grad_norm": 1.0371969938278198,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 12587
+    },
+    {
+      "epoch": 0.12588,
+      "grad_norm": 0.95607590675354,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 12588
+    },
+    {
+      "epoch": 0.12589,
+      "grad_norm": 0.9678847789764404,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 12589
+    },
+    {
+      "epoch": 0.1259,
+      "grad_norm": 0.974133312702179,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 12590
+    },
+    {
+      "epoch": 0.12591,
+      "grad_norm": 0.8166702389717102,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 12591
+    },
+    {
+      "epoch": 0.12592,
+      "grad_norm": 0.6931419372558594,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 12592
+    },
+    {
+      "epoch": 0.12593,
+      "grad_norm": 0.7652658224105835,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 12593
+    },
+    {
+      "epoch": 0.12594,
+      "grad_norm": 0.9171327352523804,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 12594
+    },
+    {
+      "epoch": 0.12595,
+      "grad_norm": 1.0293341875076294,
+      "learning_rate": 0.003,
+      "loss": 4.0558,
+      "step": 12595
+    },
+    {
+      "epoch": 0.12596,
+      "grad_norm": 0.9116888046264648,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 12596
+    },
+    {
+      "epoch": 0.12597,
+      "grad_norm": 0.8362652659416199,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 12597
+    },
+    {
+      "epoch": 0.12598,
+      "grad_norm": 0.7781234383583069,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 12598
+    },
+    {
+      "epoch": 0.12599,
+      "grad_norm": 0.8064082860946655,
+      "learning_rate": 0.003,
+      "loss": 4.05,
+      "step": 12599
+    },
+    {
+      "epoch": 0.126,
+      "grad_norm": 0.8930453062057495,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 12600
+    },
+    {
+      "epoch": 0.12601,
+      "grad_norm": 0.8855689764022827,
+      "learning_rate": 0.003,
+      "loss": 4.0631,
+      "step": 12601
+    },
+    {
+      "epoch": 0.12602,
+      "grad_norm": 0.7009050250053406,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 12602
+    },
+    {
+      "epoch": 0.12603,
+      "grad_norm": 0.6489673256874084,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 12603
+    },
+    {
+      "epoch": 0.12604,
+      "grad_norm": 0.5806393027305603,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 12604
+    },
+    {
+      "epoch": 0.12605,
+      "grad_norm": 0.6146126389503479,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 12605
+    },
+    {
+      "epoch": 0.12606,
+      "grad_norm": 0.6746604442596436,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 12606
+    },
+    {
+      "epoch": 0.12607,
+      "grad_norm": 0.8226182460784912,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 12607
+    },
+    {
+      "epoch": 0.12608,
+      "grad_norm": 0.9086608290672302,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 12608
+    },
+    {
+      "epoch": 0.12609,
+      "grad_norm": 1.0627309083938599,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 12609
+    },
+    {
+      "epoch": 0.1261,
+      "grad_norm": 1.0476856231689453,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 12610
+    },
+    {
+      "epoch": 0.12611,
+      "grad_norm": 0.8482815623283386,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 12611
+    },
+    {
+      "epoch": 0.12612,
+      "grad_norm": 0.7633053660392761,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 12612
+    },
+    {
+      "epoch": 0.12613,
+      "grad_norm": 0.6689841151237488,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 12613
+    },
+    {
+      "epoch": 0.12614,
+      "grad_norm": 0.6084244251251221,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 12614
+    },
+    {
+      "epoch": 0.12615,
+      "grad_norm": 0.6043604612350464,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 12615
+    },
+    {
+      "epoch": 0.12616,
+      "grad_norm": 0.6392335891723633,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 12616
+    },
+    {
+      "epoch": 0.12617,
+      "grad_norm": 0.707719087600708,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 12617
+    },
+    {
+      "epoch": 0.12618,
+      "grad_norm": 0.7622061371803284,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 12618
+    },
+    {
+      "epoch": 0.12619,
+      "grad_norm": 0.9297196865081787,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 12619
+    },
+    {
+      "epoch": 0.1262,
+      "grad_norm": 0.9986196160316467,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 12620
+    },
+    {
+      "epoch": 0.12621,
+      "grad_norm": 0.8918259143829346,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 12621
+    },
+    {
+      "epoch": 0.12622,
+      "grad_norm": 0.6821439862251282,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 12622
+    },
+    {
+      "epoch": 0.12623,
+      "grad_norm": 0.7973731160163879,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 12623
+    },
+    {
+      "epoch": 0.12624,
+      "grad_norm": 0.8734741806983948,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 12624
+    },
+    {
+      "epoch": 0.12625,
+      "grad_norm": 0.8574873805046082,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 12625
+    },
+    {
+      "epoch": 0.12626,
+      "grad_norm": 0.9886305928230286,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 12626
+    },
+    {
+      "epoch": 0.12627,
+      "grad_norm": 0.9929408431053162,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 12627
+    },
+    {
+      "epoch": 0.12628,
+      "grad_norm": 0.7818512916564941,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 12628
+    },
+    {
+      "epoch": 0.12629,
+      "grad_norm": 0.6975765824317932,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 12629
+    },
+    {
+      "epoch": 0.1263,
+      "grad_norm": 0.6380565166473389,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 12630
+    },
+    {
+      "epoch": 0.12631,
+      "grad_norm": 0.728340208530426,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 12631
+    },
+    {
+      "epoch": 0.12632,
+      "grad_norm": 0.8124565482139587,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 12632
+    },
+    {
+      "epoch": 0.12633,
+      "grad_norm": 0.8869121670722961,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 12633
+    },
+    {
+      "epoch": 0.12634,
+      "grad_norm": 0.8852589130401611,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 12634
+    },
+    {
+      "epoch": 0.12635,
+      "grad_norm": 0.6980815529823303,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 12635
+    },
+    {
+      "epoch": 0.12636,
+      "grad_norm": 0.5582261681556702,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 12636
+    },
+    {
+      "epoch": 0.12637,
+      "grad_norm": 0.6569713950157166,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12637
+    },
+    {
+      "epoch": 0.12638,
+      "grad_norm": 0.6073877811431885,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 12638
+    },
+    {
+      "epoch": 0.12639,
+      "grad_norm": 0.6059941649436951,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 12639
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.6994345188140869,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 12640
+    },
+    {
+      "epoch": 0.12641,
+      "grad_norm": 0.6538774371147156,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 12641
+    },
+    {
+      "epoch": 0.12642,
+      "grad_norm": 0.5538105368614197,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 12642
+    },
+    {
+      "epoch": 0.12643,
+      "grad_norm": 0.5192658305168152,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 12643
+    },
+    {
+      "epoch": 0.12644,
+      "grad_norm": 0.44997328519821167,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 12644
+    },
+    {
+      "epoch": 0.12645,
+      "grad_norm": 0.5587505102157593,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 12645
+    },
+    {
+      "epoch": 0.12646,
+      "grad_norm": 0.7219934463500977,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 12646
+    },
+    {
+      "epoch": 0.12647,
+      "grad_norm": 0.9880595803260803,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 12647
+    },
+    {
+      "epoch": 0.12648,
+      "grad_norm": 1.2066372632980347,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 12648
+    },
+    {
+      "epoch": 0.12649,
+      "grad_norm": 0.6333531737327576,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 12649
+    },
+    {
+      "epoch": 0.1265,
+      "grad_norm": 0.6311098337173462,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 12650
+    },
+    {
+      "epoch": 0.12651,
+      "grad_norm": 0.6966622471809387,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 12651
+    },
+    {
+      "epoch": 0.12652,
+      "grad_norm": 0.7308709621429443,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 12652
+    },
+    {
+      "epoch": 0.12653,
+      "grad_norm": 0.7971374988555908,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 12653
+    },
+    {
+      "epoch": 0.12654,
+      "grad_norm": 0.8622496128082275,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 12654
+    },
+    {
+      "epoch": 0.12655,
+      "grad_norm": 0.8505637049674988,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 12655
+    },
+    {
+      "epoch": 0.12656,
+      "grad_norm": 0.8776390552520752,
+      "learning_rate": 0.003,
+      "loss": 4.0497,
+      "step": 12656
+    },
+    {
+      "epoch": 0.12657,
+      "grad_norm": 1.0906471014022827,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 12657
+    },
+    {
+      "epoch": 0.12658,
+      "grad_norm": 1.083461880683899,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 12658
+    },
+    {
+      "epoch": 0.12659,
+      "grad_norm": 1.0041580200195312,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 12659
+    },
+    {
+      "epoch": 0.1266,
+      "grad_norm": 0.8595044612884521,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 12660
+    },
+    {
+      "epoch": 0.12661,
+      "grad_norm": 0.9145925045013428,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 12661
+    },
+    {
+      "epoch": 0.12662,
+      "grad_norm": 0.9530350565910339,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 12662
+    },
+    {
+      "epoch": 0.12663,
+      "grad_norm": 0.9560208320617676,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 12663
+    },
+    {
+      "epoch": 0.12664,
+      "grad_norm": 0.794122040271759,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 12664
+    },
+    {
+      "epoch": 0.12665,
+      "grad_norm": 0.7163031697273254,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 12665
+    },
+    {
+      "epoch": 0.12666,
+      "grad_norm": 0.7502034306526184,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 12666
+    },
+    {
+      "epoch": 0.12667,
+      "grad_norm": 0.8145994544029236,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 12667
+    },
+    {
+      "epoch": 0.12668,
+      "grad_norm": 0.9863829016685486,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 12668
+    },
+    {
+      "epoch": 0.12669,
+      "grad_norm": 1.2059465646743774,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 12669
+    },
+    {
+      "epoch": 0.1267,
+      "grad_norm": 0.9510130286216736,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 12670
+    },
+    {
+      "epoch": 0.12671,
+      "grad_norm": 1.0077412128448486,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 12671
+    },
+    {
+      "epoch": 0.12672,
+      "grad_norm": 1.0883513689041138,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 12672
+    },
+    {
+      "epoch": 0.12673,
+      "grad_norm": 1.0037307739257812,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 12673
+    },
+    {
+      "epoch": 0.12674,
+      "grad_norm": 0.8705130219459534,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 12674
+    },
+    {
+      "epoch": 0.12675,
+      "grad_norm": 0.7188978791236877,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 12675
+    },
+    {
+      "epoch": 0.12676,
+      "grad_norm": 0.8157229423522949,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 12676
+    },
+    {
+      "epoch": 0.12677,
+      "grad_norm": 1.175352692604065,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 12677
+    },
+    {
+      "epoch": 0.12678,
+      "grad_norm": 0.9241523742675781,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 12678
+    },
+    {
+      "epoch": 0.12679,
+      "grad_norm": 0.935789942741394,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 12679
+    },
+    {
+      "epoch": 0.1268,
+      "grad_norm": 1.0995280742645264,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 12680
+    },
+    {
+      "epoch": 0.12681,
+      "grad_norm": 1.0176584720611572,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 12681
+    },
+    {
+      "epoch": 0.12682,
+      "grad_norm": 0.9173998832702637,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 12682
+    },
+    {
+      "epoch": 0.12683,
+      "grad_norm": 0.8360102772712708,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 12683
+    },
+    {
+      "epoch": 0.12684,
+      "grad_norm": 0.8676466941833496,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 12684
+    },
+    {
+      "epoch": 0.12685,
+      "grad_norm": 0.8226211071014404,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 12685
+    },
+    {
+      "epoch": 0.12686,
+      "grad_norm": 0.7982207536697388,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 12686
+    },
+    {
+      "epoch": 0.12687,
+      "grad_norm": 0.7003099918365479,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 12687
+    },
+    {
+      "epoch": 0.12688,
+      "grad_norm": 0.6146358251571655,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 12688
+    },
+    {
+      "epoch": 0.12689,
+      "grad_norm": 0.7245614528656006,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 12689
+    },
+    {
+      "epoch": 0.1269,
+      "grad_norm": 0.6969562768936157,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 12690
+    },
+    {
+      "epoch": 0.12691,
+      "grad_norm": 0.646485447883606,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 12691
+    },
+    {
+      "epoch": 0.12692,
+      "grad_norm": 0.6295223236083984,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 12692
+    },
+    {
+      "epoch": 0.12693,
+      "grad_norm": 0.6015465259552002,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 12693
+    },
+    {
+      "epoch": 0.12694,
+      "grad_norm": 0.6565688848495483,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 12694
+    },
+    {
+      "epoch": 0.12695,
+      "grad_norm": 0.8570446968078613,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 12695
+    },
+    {
+      "epoch": 0.12696,
+      "grad_norm": 1.0074641704559326,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 12696
+    },
+    {
+      "epoch": 0.12697,
+      "grad_norm": 0.9953659772872925,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 12697
+    },
+    {
+      "epoch": 0.12698,
+      "grad_norm": 0.7940637469291687,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 12698
+    },
+    {
+      "epoch": 0.12699,
+      "grad_norm": 0.679948091506958,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 12699
+    },
+    {
+      "epoch": 0.127,
+      "grad_norm": 0.752021312713623,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 12700
+    },
+    {
+      "epoch": 0.12701,
+      "grad_norm": 0.9325186610221863,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 12701
+    },
+    {
+      "epoch": 0.12702,
+      "grad_norm": 1.2120436429977417,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 12702
+    },
+    {
+      "epoch": 0.12703,
+      "grad_norm": 0.6374262571334839,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 12703
+    },
+    {
+      "epoch": 0.12704,
+      "grad_norm": 0.6450442671775818,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 12704
+    },
+    {
+      "epoch": 0.12705,
+      "grad_norm": 0.6463040113449097,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 12705
+    },
+    {
+      "epoch": 0.12706,
+      "grad_norm": 0.5338553786277771,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 12706
+    },
+    {
+      "epoch": 0.12707,
+      "grad_norm": 0.511511504650116,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 12707
+    },
+    {
+      "epoch": 0.12708,
+      "grad_norm": 0.5351196527481079,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 12708
+    },
+    {
+      "epoch": 0.12709,
+      "grad_norm": 0.5771192908287048,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 12709
+    },
+    {
+      "epoch": 0.1271,
+      "grad_norm": 0.5235174298286438,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 12710
+    },
+    {
+      "epoch": 0.12711,
+      "grad_norm": 0.5067513585090637,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 12711
+    },
+    {
+      "epoch": 0.12712,
+      "grad_norm": 0.4862622320652008,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 12712
+    },
+    {
+      "epoch": 0.12713,
+      "grad_norm": 0.4507077932357788,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 12713
+    },
+    {
+      "epoch": 0.12714,
+      "grad_norm": 0.4960992932319641,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 12714
+    },
+    {
+      "epoch": 0.12715,
+      "grad_norm": 0.4874277710914612,
+      "learning_rate": 0.003,
+      "loss": 3.9748,
+      "step": 12715
+    },
+    {
+      "epoch": 0.12716,
+      "grad_norm": 0.45987147092819214,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 12716
+    },
+    {
+      "epoch": 0.12717,
+      "grad_norm": 0.5444669127464294,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 12717
+    },
+    {
+      "epoch": 0.12718,
+      "grad_norm": 0.6519801616668701,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 12718
+    },
+    {
+      "epoch": 0.12719,
+      "grad_norm": 0.846007764339447,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 12719
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 1.1677876710891724,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 12720
+    },
+    {
+      "epoch": 0.12721,
+      "grad_norm": 0.9765973091125488,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 12721
+    },
+    {
+      "epoch": 0.12722,
+      "grad_norm": 0.9912946820259094,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 12722
+    },
+    {
+      "epoch": 0.12723,
+      "grad_norm": 1.0117685794830322,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 12723
+    },
+    {
+      "epoch": 0.12724,
+      "grad_norm": 1.0825741291046143,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 12724
+    },
+    {
+      "epoch": 0.12725,
+      "grad_norm": 1.1411371231079102,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 12725
+    },
+    {
+      "epoch": 0.12726,
+      "grad_norm": 0.8763951063156128,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 12726
+    },
+    {
+      "epoch": 0.12727,
+      "grad_norm": 1.0362228155136108,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 12727
+    },
+    {
+      "epoch": 0.12728,
+      "grad_norm": 1.0936086177825928,
+      "learning_rate": 0.003,
+      "loss": 4.0462,
+      "step": 12728
+    },
+    {
+      "epoch": 0.12729,
+      "grad_norm": 0.8342863321304321,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 12729
+    },
+    {
+      "epoch": 0.1273,
+      "grad_norm": 0.8730246424674988,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 12730
+    },
+    {
+      "epoch": 0.12731,
+      "grad_norm": 0.8429976105690002,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 12731
+    },
+    {
+      "epoch": 0.12732,
+      "grad_norm": 0.9439195394515991,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 12732
+    },
+    {
+      "epoch": 0.12733,
+      "grad_norm": 1.016707420349121,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 12733
+    },
+    {
+      "epoch": 0.12734,
+      "grad_norm": 1.1399983167648315,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 12734
+    },
+    {
+      "epoch": 0.12735,
+      "grad_norm": 1.092917561531067,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 12735
+    },
+    {
+      "epoch": 0.12736,
+      "grad_norm": 1.0256966352462769,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 12736
+    },
+    {
+      "epoch": 0.12737,
+      "grad_norm": 0.9440638422966003,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 12737
+    },
+    {
+      "epoch": 0.12738,
+      "grad_norm": 0.8956122994422913,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 12738
+    },
+    {
+      "epoch": 0.12739,
+      "grad_norm": 0.7672011256217957,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 12739
+    },
+    {
+      "epoch": 0.1274,
+      "grad_norm": 0.8364803791046143,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 12740
+    },
+    {
+      "epoch": 0.12741,
+      "grad_norm": 0.9540492296218872,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 12741
+    },
+    {
+      "epoch": 0.12742,
+      "grad_norm": 1.300502061843872,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 12742
+    },
+    {
+      "epoch": 0.12743,
+      "grad_norm": 0.5849984884262085,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 12743
+    },
+    {
+      "epoch": 0.12744,
+      "grad_norm": 0.5924078822135925,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 12744
+    },
+    {
+      "epoch": 0.12745,
+      "grad_norm": 0.6661898493766785,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 12745
+    },
+    {
+      "epoch": 0.12746,
+      "grad_norm": 0.6874447464942932,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 12746
+    },
+    {
+      "epoch": 0.12747,
+      "grad_norm": 0.7282478213310242,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 12747
+    },
+    {
+      "epoch": 0.12748,
+      "grad_norm": 0.6811204552650452,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 12748
+    },
+    {
+      "epoch": 0.12749,
+      "grad_norm": 0.5884941816329956,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 12749
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.5128979682922363,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 12750
+    },
+    {
+      "epoch": 0.12751,
+      "grad_norm": 0.542918860912323,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 12751
+    },
+    {
+      "epoch": 0.12752,
+      "grad_norm": 0.6755454540252686,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 12752
+    },
+    {
+      "epoch": 0.12753,
+      "grad_norm": 0.7383790612220764,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 12753
+    },
+    {
+      "epoch": 0.12754,
+      "grad_norm": 0.752862811088562,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 12754
+    },
+    {
+      "epoch": 0.12755,
+      "grad_norm": 0.7278972864151001,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 12755
+    },
+    {
+      "epoch": 0.12756,
+      "grad_norm": 0.6267310976982117,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 12756
+    },
+    {
+      "epoch": 0.12757,
+      "grad_norm": 0.5327348113059998,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 12757
+    },
+    {
+      "epoch": 0.12758,
+      "grad_norm": 0.5505622029304504,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 12758
+    },
+    {
+      "epoch": 0.12759,
+      "grad_norm": 0.6469417214393616,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 12759
+    },
+    {
+      "epoch": 0.1276,
+      "grad_norm": 0.7604227066040039,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 12760
+    },
+    {
+      "epoch": 0.12761,
+      "grad_norm": 0.9358194470405579,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 12761
+    },
+    {
+      "epoch": 0.12762,
+      "grad_norm": 1.2231742143630981,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 12762
+    },
+    {
+      "epoch": 0.12763,
+      "grad_norm": 0.6989595890045166,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 12763
+    },
+    {
+      "epoch": 0.12764,
+      "grad_norm": 0.6533869504928589,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 12764
+    },
+    {
+      "epoch": 0.12765,
+      "grad_norm": 0.6442908644676208,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 12765
+    },
+    {
+      "epoch": 0.12766,
+      "grad_norm": 0.6153333783149719,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 12766
+    },
+    {
+      "epoch": 0.12767,
+      "grad_norm": 0.6329177618026733,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 12767
+    },
+    {
+      "epoch": 0.12768,
+      "grad_norm": 0.6864577531814575,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 12768
+    },
+    {
+      "epoch": 0.12769,
+      "grad_norm": 0.8032853007316589,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 12769
+    },
+    {
+      "epoch": 0.1277,
+      "grad_norm": 0.8559294939041138,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 12770
+    },
+    {
+      "epoch": 0.12771,
+      "grad_norm": 0.8253934383392334,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 12771
+    },
+    {
+      "epoch": 0.12772,
+      "grad_norm": 0.8039045333862305,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 12772
+    },
+    {
+      "epoch": 0.12773,
+      "grad_norm": 0.7011158466339111,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 12773
+    },
+    {
+      "epoch": 0.12774,
+      "grad_norm": 0.8235172629356384,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 12774
+    },
+    {
+      "epoch": 0.12775,
+      "grad_norm": 0.8840197324752808,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 12775
+    },
+    {
+      "epoch": 0.12776,
+      "grad_norm": 0.9510916471481323,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 12776
+    },
+    {
+      "epoch": 0.12777,
+      "grad_norm": 0.9348542094230652,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 12777
+    },
+    {
+      "epoch": 0.12778,
+      "grad_norm": 0.8219630122184753,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 12778
+    },
+    {
+      "epoch": 0.12779,
+      "grad_norm": 0.7910003662109375,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 12779
+    },
+    {
+      "epoch": 0.1278,
+      "grad_norm": 0.8482397794723511,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 12780
+    },
+    {
+      "epoch": 0.12781,
+      "grad_norm": 0.9158820509910583,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 12781
+    },
+    {
+      "epoch": 0.12782,
+      "grad_norm": 1.0353529453277588,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 12782
+    },
+    {
+      "epoch": 0.12783,
+      "grad_norm": 1.087900161743164,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 12783
+    },
+    {
+      "epoch": 0.12784,
+      "grad_norm": 1.1465449333190918,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 12784
+    },
+    {
+      "epoch": 0.12785,
+      "grad_norm": 0.8442012667655945,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 12785
+    },
+    {
+      "epoch": 0.12786,
+      "grad_norm": 0.6666450500488281,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 12786
+    },
+    {
+      "epoch": 0.12787,
+      "grad_norm": 0.6549657583236694,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 12787
+    },
+    {
+      "epoch": 0.12788,
+      "grad_norm": 0.694678008556366,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 12788
+    },
+    {
+      "epoch": 0.12789,
+      "grad_norm": 0.6600160002708435,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 12789
+    },
+    {
+      "epoch": 0.1279,
+      "grad_norm": 0.75733482837677,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 12790
+    },
+    {
+      "epoch": 0.12791,
+      "grad_norm": 0.7874218225479126,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 12791
+    },
+    {
+      "epoch": 0.12792,
+      "grad_norm": 0.7725749015808105,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 12792
+    },
+    {
+      "epoch": 0.12793,
+      "grad_norm": 0.7349872589111328,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 12793
+    },
+    {
+      "epoch": 0.12794,
+      "grad_norm": 0.7539064884185791,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 12794
+    },
+    {
+      "epoch": 0.12795,
+      "grad_norm": 0.8775387406349182,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 12795
+    },
+    {
+      "epoch": 0.12796,
+      "grad_norm": 0.9745922088623047,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 12796
+    },
+    {
+      "epoch": 0.12797,
+      "grad_norm": 0.9114720821380615,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 12797
+    },
+    {
+      "epoch": 0.12798,
+      "grad_norm": 1.044573187828064,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 12798
+    },
+    {
+      "epoch": 0.12799,
+      "grad_norm": 1.1686325073242188,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 12799
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.8817925453186035,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 12800
+    },
+    {
+      "epoch": 0.12801,
+      "grad_norm": 0.800365149974823,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 12801
+    },
+    {
+      "epoch": 0.12802,
+      "grad_norm": 0.8279086351394653,
+      "learning_rate": 0.003,
+      "loss": 4.0617,
+      "step": 12802
+    },
+    {
+      "epoch": 0.12803,
+      "grad_norm": 0.9438039660453796,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 12803
+    },
+    {
+      "epoch": 0.12804,
+      "grad_norm": 0.9062755703926086,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 12804
+    },
+    {
+      "epoch": 0.12805,
+      "grad_norm": 0.7312350273132324,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 12805
+    },
+    {
+      "epoch": 0.12806,
+      "grad_norm": 0.7411422729492188,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 12806
+    },
+    {
+      "epoch": 0.12807,
+      "grad_norm": 0.7521910667419434,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 12807
+    },
+    {
+      "epoch": 0.12808,
+      "grad_norm": 0.7977126836776733,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 12808
+    },
+    {
+      "epoch": 0.12809,
+      "grad_norm": 0.7871893644332886,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 12809
+    },
+    {
+      "epoch": 0.1281,
+      "grad_norm": 0.8420676589012146,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 12810
+    },
+    {
+      "epoch": 0.12811,
+      "grad_norm": 0.9533843994140625,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 12811
+    },
+    {
+      "epoch": 0.12812,
+      "grad_norm": 1.204431176185608,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 12812
+    },
+    {
+      "epoch": 0.12813,
+      "grad_norm": 0.9592463374137878,
+      "learning_rate": 0.003,
+      "loss": 4.0347,
+      "step": 12813
+    },
+    {
+      "epoch": 0.12814,
+      "grad_norm": 0.9378253221511841,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 12814
+    },
+    {
+      "epoch": 0.12815,
+      "grad_norm": 1.095750331878662,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 12815
+    },
+    {
+      "epoch": 0.12816,
+      "grad_norm": 0.9643769860267639,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 12816
+    },
+    {
+      "epoch": 0.12817,
+      "grad_norm": 0.9230008721351624,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 12817
+    },
+    {
+      "epoch": 0.12818,
+      "grad_norm": 0.9425803422927856,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 12818
+    },
+    {
+      "epoch": 0.12819,
+      "grad_norm": 0.8780527114868164,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 12819
+    },
+    {
+      "epoch": 0.1282,
+      "grad_norm": 0.82173752784729,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 12820
+    },
+    {
+      "epoch": 0.12821,
+      "grad_norm": 0.7313708662986755,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 12821
+    },
+    {
+      "epoch": 0.12822,
+      "grad_norm": 0.5953492522239685,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 12822
+    },
+    {
+      "epoch": 0.12823,
+      "grad_norm": 0.5389003753662109,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 12823
+    },
+    {
+      "epoch": 0.12824,
+      "grad_norm": 0.4850378930568695,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 12824
+    },
+    {
+      "epoch": 0.12825,
+      "grad_norm": 0.479413777589798,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 12825
+    },
+    {
+      "epoch": 0.12826,
+      "grad_norm": 0.4634591341018677,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 12826
+    },
+    {
+      "epoch": 0.12827,
+      "grad_norm": 0.46316298842430115,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 12827
+    },
+    {
+      "epoch": 0.12828,
+      "grad_norm": 0.5380398631095886,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 12828
+    },
+    {
+      "epoch": 0.12829,
+      "grad_norm": 0.6179541945457458,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 12829
+    },
+    {
+      "epoch": 0.1283,
+      "grad_norm": 0.7865729928016663,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 12830
+    },
+    {
+      "epoch": 0.12831,
+      "grad_norm": 1.0153639316558838,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 12831
+    },
+    {
+      "epoch": 0.12832,
+      "grad_norm": 1.1865519285202026,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 12832
+    },
+    {
+      "epoch": 0.12833,
+      "grad_norm": 0.7247180938720703,
+      "learning_rate": 0.003,
+      "loss": 4.0526,
+      "step": 12833
+    },
+    {
+      "epoch": 0.12834,
+      "grad_norm": 0.6804682612419128,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 12834
+    },
+    {
+      "epoch": 0.12835,
+      "grad_norm": 0.5373378396034241,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 12835
+    },
+    {
+      "epoch": 0.12836,
+      "grad_norm": 0.6478649973869324,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 12836
+    },
+    {
+      "epoch": 0.12837,
+      "grad_norm": 0.7635132670402527,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 12837
+    },
+    {
+      "epoch": 0.12838,
+      "grad_norm": 0.8292766809463501,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 12838
+    },
+    {
+      "epoch": 0.12839,
+      "grad_norm": 0.8596310019493103,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 12839
+    },
+    {
+      "epoch": 0.1284,
+      "grad_norm": 0.9096194505691528,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 12840
+    },
+    {
+      "epoch": 0.12841,
+      "grad_norm": 0.9546847939491272,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 12841
+    },
+    {
+      "epoch": 0.12842,
+      "grad_norm": 1.0239142179489136,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 12842
+    },
+    {
+      "epoch": 0.12843,
+      "grad_norm": 1.1351287364959717,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 12843
+    },
+    {
+      "epoch": 0.12844,
+      "grad_norm": 0.8102722764015198,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 12844
+    },
+    {
+      "epoch": 0.12845,
+      "grad_norm": 0.6956878900527954,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 12845
+    },
+    {
+      "epoch": 0.12846,
+      "grad_norm": 0.8143348693847656,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 12846
+    },
+    {
+      "epoch": 0.12847,
+      "grad_norm": 1.0078028440475464,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 12847
+    },
+    {
+      "epoch": 0.12848,
+      "grad_norm": 1.0167229175567627,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 12848
+    },
+    {
+      "epoch": 0.12849,
+      "grad_norm": 0.9717152118682861,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 12849
+    },
+    {
+      "epoch": 0.1285,
+      "grad_norm": 0.8741232752799988,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 12850
+    },
+    {
+      "epoch": 0.12851,
+      "grad_norm": 0.7856922149658203,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 12851
+    },
+    {
+      "epoch": 0.12852,
+      "grad_norm": 0.7052040696144104,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 12852
+    },
+    {
+      "epoch": 0.12853,
+      "grad_norm": 0.6401236653327942,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 12853
+    },
+    {
+      "epoch": 0.12854,
+      "grad_norm": 0.6218764185905457,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 12854
+    },
+    {
+      "epoch": 0.12855,
+      "grad_norm": 0.5677884817123413,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 12855
+    },
+    {
+      "epoch": 0.12856,
+      "grad_norm": 0.6288309097290039,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 12856
+    },
+    {
+      "epoch": 0.12857,
+      "grad_norm": 0.9016016721725464,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 12857
+    },
+    {
+      "epoch": 0.12858,
+      "grad_norm": 1.2577475309371948,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 12858
+    },
+    {
+      "epoch": 0.12859,
+      "grad_norm": 0.9303752779960632,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 12859
+    },
+    {
+      "epoch": 0.1286,
+      "grad_norm": 0.8439638614654541,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 12860
+    },
+    {
+      "epoch": 0.12861,
+      "grad_norm": 0.945719838142395,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 12861
+    },
+    {
+      "epoch": 0.12862,
+      "grad_norm": 1.131162405014038,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 12862
+    },
+    {
+      "epoch": 0.12863,
+      "grad_norm": 0.9842260479927063,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 12863
+    },
+    {
+      "epoch": 0.12864,
+      "grad_norm": 0.8646676540374756,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 12864
+    },
+    {
+      "epoch": 0.12865,
+      "grad_norm": 0.9796521067619324,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 12865
+    },
+    {
+      "epoch": 0.12866,
+      "grad_norm": 0.8813204169273376,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 12866
+    },
+    {
+      "epoch": 0.12867,
+      "grad_norm": 0.7494421601295471,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 12867
+    },
+    {
+      "epoch": 0.12868,
+      "grad_norm": 0.75898277759552,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 12868
+    },
+    {
+      "epoch": 0.12869,
+      "grad_norm": 0.7349717020988464,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 12869
+    },
+    {
+      "epoch": 0.1287,
+      "grad_norm": 0.6646801829338074,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 12870
+    },
+    {
+      "epoch": 0.12871,
+      "grad_norm": 0.7402473092079163,
+      "learning_rate": 0.003,
+      "loss": 4.0602,
+      "step": 12871
+    },
+    {
+      "epoch": 0.12872,
+      "grad_norm": 0.7680990099906921,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 12872
+    },
+    {
+      "epoch": 0.12873,
+      "grad_norm": 0.7846862077713013,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 12873
+    },
+    {
+      "epoch": 0.12874,
+      "grad_norm": 0.8325425982475281,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 12874
+    },
+    {
+      "epoch": 0.12875,
+      "grad_norm": 1.0272611379623413,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 12875
+    },
+    {
+      "epoch": 0.12876,
+      "grad_norm": 1.1894290447235107,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 12876
+    },
+    {
+      "epoch": 0.12877,
+      "grad_norm": 0.8812578916549683,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 12877
+    },
+    {
+      "epoch": 0.12878,
+      "grad_norm": 0.8838363885879517,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 12878
+    },
+    {
+      "epoch": 0.12879,
+      "grad_norm": 0.8199244141578674,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 12879
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.6704943776130676,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 12880
+    },
+    {
+      "epoch": 0.12881,
+      "grad_norm": 0.603921115398407,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 12881
+    },
+    {
+      "epoch": 0.12882,
+      "grad_norm": 0.47761204838752747,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 12882
+    },
+    {
+      "epoch": 0.12883,
+      "grad_norm": 0.5151971578598022,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 12883
+    },
+    {
+      "epoch": 0.12884,
+      "grad_norm": 0.5642871260643005,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 12884
+    },
+    {
+      "epoch": 0.12885,
+      "grad_norm": 0.5510008931159973,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 12885
+    },
+    {
+      "epoch": 0.12886,
+      "grad_norm": 0.5852599740028381,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 12886
+    },
+    {
+      "epoch": 0.12887,
+      "grad_norm": 0.6851714253425598,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 12887
+    },
+    {
+      "epoch": 0.12888,
+      "grad_norm": 0.8603548407554626,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 12888
+    },
+    {
+      "epoch": 0.12889,
+      "grad_norm": 1.0281792879104614,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 12889
+    },
+    {
+      "epoch": 0.1289,
+      "grad_norm": 1.1843347549438477,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 12890
+    },
+    {
+      "epoch": 0.12891,
+      "grad_norm": 0.5978385806083679,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 12891
+    },
+    {
+      "epoch": 0.12892,
+      "grad_norm": 0.6088024973869324,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 12892
+    },
+    {
+      "epoch": 0.12893,
+      "grad_norm": 0.7430006265640259,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 12893
+    },
+    {
+      "epoch": 0.12894,
+      "grad_norm": 0.9343671202659607,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 12894
+    },
+    {
+      "epoch": 0.12895,
+      "grad_norm": 1.1110566854476929,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 12895
+    },
+    {
+      "epoch": 0.12896,
+      "grad_norm": 0.9754167199134827,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 12896
+    },
+    {
+      "epoch": 0.12897,
+      "grad_norm": 1.0744414329528809,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 12897
+    },
+    {
+      "epoch": 0.12898,
+      "grad_norm": 0.8748178482055664,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 12898
+    },
+    {
+      "epoch": 0.12899,
+      "grad_norm": 0.9084308743476868,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 12899
+    },
+    {
+      "epoch": 0.129,
+      "grad_norm": 0.8756034970283508,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 12900
+    },
+    {
+      "epoch": 0.12901,
+      "grad_norm": 0.8914014101028442,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 12901
+    },
+    {
+      "epoch": 0.12902,
+      "grad_norm": 0.943042516708374,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 12902
+    },
+    {
+      "epoch": 0.12903,
+      "grad_norm": 1.0388507843017578,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 12903
+    },
+    {
+      "epoch": 0.12904,
+      "grad_norm": 0.8804526925086975,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 12904
+    },
+    {
+      "epoch": 0.12905,
+      "grad_norm": 0.7760341763496399,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 12905
+    },
+    {
+      "epoch": 0.12906,
+      "grad_norm": 0.7428815960884094,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 12906
+    },
+    {
+      "epoch": 0.12907,
+      "grad_norm": 0.8633037805557251,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 12907
+    },
+    {
+      "epoch": 0.12908,
+      "grad_norm": 1.0847586393356323,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 12908
+    },
+    {
+      "epoch": 0.12909,
+      "grad_norm": 1.10981023311615,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 12909
+    },
+    {
+      "epoch": 0.1291,
+      "grad_norm": 0.8015402555465698,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 12910
+    },
+    {
+      "epoch": 0.12911,
+      "grad_norm": 0.6852465867996216,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 12911
+    },
+    {
+      "epoch": 0.12912,
+      "grad_norm": 0.6617090106010437,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 12912
+    },
+    {
+      "epoch": 0.12913,
+      "grad_norm": 0.6093372702598572,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 12913
+    },
+    {
+      "epoch": 0.12914,
+      "grad_norm": 0.5485365986824036,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 12914
+    },
+    {
+      "epoch": 0.12915,
+      "grad_norm": 0.5607293844223022,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 12915
+    },
+    {
+      "epoch": 0.12916,
+      "grad_norm": 0.7262622117996216,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 12916
+    },
+    {
+      "epoch": 0.12917,
+      "grad_norm": 1.0545352697372437,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 12917
+    },
+    {
+      "epoch": 0.12918,
+      "grad_norm": 1.1793196201324463,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 12918
+    },
+    {
+      "epoch": 0.12919,
+      "grad_norm": 0.710120439529419,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 12919
+    },
+    {
+      "epoch": 0.1292,
+      "grad_norm": 0.6058240532875061,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 12920
+    },
+    {
+      "epoch": 0.12921,
+      "grad_norm": 0.6739874482154846,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 12921
+    },
+    {
+      "epoch": 0.12922,
+      "grad_norm": 0.7524260878562927,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 12922
+    },
+    {
+      "epoch": 0.12923,
+      "grad_norm": 0.8127276301383972,
+      "learning_rate": 0.003,
+      "loss": 3.9774,
+      "step": 12923
+    },
+    {
+      "epoch": 0.12924,
+      "grad_norm": 0.8501728177070618,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 12924
+    },
+    {
+      "epoch": 0.12925,
+      "grad_norm": 0.7651922702789307,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 12925
+    },
+    {
+      "epoch": 0.12926,
+      "grad_norm": 0.8245254755020142,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 12926
+    },
+    {
+      "epoch": 0.12927,
+      "grad_norm": 0.8267773389816284,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 12927
+    },
+    {
+      "epoch": 0.12928,
+      "grad_norm": 0.9031170606613159,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 12928
+    },
+    {
+      "epoch": 0.12929,
+      "grad_norm": 1.1104865074157715,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 12929
+    },
+    {
+      "epoch": 0.1293,
+      "grad_norm": 0.9228253364562988,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 12930
+    },
+    {
+      "epoch": 0.12931,
+      "grad_norm": 0.6955811381340027,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 12931
+    },
+    {
+      "epoch": 0.12932,
+      "grad_norm": 0.5862828493118286,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 12932
+    },
+    {
+      "epoch": 0.12933,
+      "grad_norm": 0.6926164031028748,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 12933
+    },
+    {
+      "epoch": 0.12934,
+      "grad_norm": 0.8853495717048645,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 12934
+    },
+    {
+      "epoch": 0.12935,
+      "grad_norm": 0.9204506874084473,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 12935
+    },
+    {
+      "epoch": 0.12936,
+      "grad_norm": 0.852302610874176,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 12936
+    },
+    {
+      "epoch": 0.12937,
+      "grad_norm": 0.6635742783546448,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 12937
+    },
+    {
+      "epoch": 0.12938,
+      "grad_norm": 0.6472227573394775,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 12938
+    },
+    {
+      "epoch": 0.12939,
+      "grad_norm": 0.7804316282272339,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 12939
+    },
+    {
+      "epoch": 0.1294,
+      "grad_norm": 0.8684574365615845,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 12940
+    },
+    {
+      "epoch": 0.12941,
+      "grad_norm": 0.8802387118339539,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 12941
+    },
+    {
+      "epoch": 0.12942,
+      "grad_norm": 0.796864926815033,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 12942
+    },
+    {
+      "epoch": 0.12943,
+      "grad_norm": 0.6880064606666565,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 12943
+    },
+    {
+      "epoch": 0.12944,
+      "grad_norm": 0.5624730587005615,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 12944
+    },
+    {
+      "epoch": 0.12945,
+      "grad_norm": 0.5585161447525024,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 12945
+    },
+    {
+      "epoch": 0.12946,
+      "grad_norm": 0.5746771097183228,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 12946
+    },
+    {
+      "epoch": 0.12947,
+      "grad_norm": 0.637833297252655,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 12947
+    },
+    {
+      "epoch": 0.12948,
+      "grad_norm": 0.6900153756141663,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 12948
+    },
+    {
+      "epoch": 0.12949,
+      "grad_norm": 0.7411880493164062,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 12949
+    },
+    {
+      "epoch": 0.1295,
+      "grad_norm": 0.7695903182029724,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 12950
+    },
+    {
+      "epoch": 0.12951,
+      "grad_norm": 0.8654025793075562,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 12951
+    },
+    {
+      "epoch": 0.12952,
+      "grad_norm": 0.9959903955459595,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 12952
+    },
+    {
+      "epoch": 0.12953,
+      "grad_norm": 1.0637086629867554,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 12953
+    },
+    {
+      "epoch": 0.12954,
+      "grad_norm": 0.8980008363723755,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 12954
+    },
+    {
+      "epoch": 0.12955,
+      "grad_norm": 0.9990094900131226,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 12955
+    },
+    {
+      "epoch": 0.12956,
+      "grad_norm": 1.0221704244613647,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 12956
+    },
+    {
+      "epoch": 0.12957,
+      "grad_norm": 1.0424240827560425,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 12957
+    },
+    {
+      "epoch": 0.12958,
+      "grad_norm": 0.9421285390853882,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 12958
+    },
+    {
+      "epoch": 0.12959,
+      "grad_norm": 0.8220925331115723,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 12959
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.9669578075408936,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 12960
+    },
+    {
+      "epoch": 0.12961,
+      "grad_norm": 1.1217392683029175,
+      "learning_rate": 0.003,
+      "loss": 4.0615,
+      "step": 12961
+    },
+    {
+      "epoch": 0.12962,
+      "grad_norm": 1.0248963832855225,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 12962
+    },
+    {
+      "epoch": 0.12963,
+      "grad_norm": 1.0191049575805664,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 12963
+    },
+    {
+      "epoch": 0.12964,
+      "grad_norm": 1.205246090888977,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 12964
+    },
+    {
+      "epoch": 0.12965,
+      "grad_norm": 0.8745798468589783,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 12965
+    },
+    {
+      "epoch": 0.12966,
+      "grad_norm": 0.8760191798210144,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 12966
+    },
+    {
+      "epoch": 0.12967,
+      "grad_norm": 0.9490057826042175,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 12967
+    },
+    {
+      "epoch": 0.12968,
+      "grad_norm": 1.0541231632232666,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 12968
+    },
+    {
+      "epoch": 0.12969,
+      "grad_norm": 1.1173850297927856,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 12969
+    },
+    {
+      "epoch": 0.1297,
+      "grad_norm": 0.7659215331077576,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 12970
+    },
+    {
+      "epoch": 0.12971,
+      "grad_norm": 0.7066275477409363,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 12971
+    },
+    {
+      "epoch": 0.12972,
+      "grad_norm": 0.6963605880737305,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 12972
+    },
+    {
+      "epoch": 0.12973,
+      "grad_norm": 0.756811797618866,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 12973
+    },
+    {
+      "epoch": 0.12974,
+      "grad_norm": 0.801081120967865,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 12974
+    },
+    {
+      "epoch": 0.12975,
+      "grad_norm": 0.9290609955787659,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 12975
+    },
+    {
+      "epoch": 0.12976,
+      "grad_norm": 0.8734400272369385,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 12976
+    },
+    {
+      "epoch": 0.12977,
+      "grad_norm": 0.7204127907752991,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 12977
+    },
+    {
+      "epoch": 0.12978,
+      "grad_norm": 0.6100812554359436,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 12978
+    },
+    {
+      "epoch": 0.12979,
+      "grad_norm": 0.493714302778244,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 12979
+    },
+    {
+      "epoch": 0.1298,
+      "grad_norm": 0.6417368650436401,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 12980
+    },
+    {
+      "epoch": 0.12981,
+      "grad_norm": 0.8046885132789612,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 12981
+    },
+    {
+      "epoch": 0.12982,
+      "grad_norm": 0.8287491202354431,
+      "learning_rate": 0.003,
+      "loss": 4.0464,
+      "step": 12982
+    },
+    {
+      "epoch": 0.12983,
+      "grad_norm": 0.8081839680671692,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 12983
+    },
+    {
+      "epoch": 0.12984,
+      "grad_norm": 0.6881992220878601,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 12984
+    },
+    {
+      "epoch": 0.12985,
+      "grad_norm": 0.5813509225845337,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 12985
+    },
+    {
+      "epoch": 0.12986,
+      "grad_norm": 0.6606971621513367,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 12986
+    },
+    {
+      "epoch": 0.12987,
+      "grad_norm": 0.6416282653808594,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 12987
+    },
+    {
+      "epoch": 0.12988,
+      "grad_norm": 0.6557727456092834,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 12988
+    },
+    {
+      "epoch": 0.12989,
+      "grad_norm": 0.7388274669647217,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 12989
+    },
+    {
+      "epoch": 0.1299,
+      "grad_norm": 0.7772365212440491,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 12990
+    },
+    {
+      "epoch": 0.12991,
+      "grad_norm": 0.824487030506134,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 12991
+    },
+    {
+      "epoch": 0.12992,
+      "grad_norm": 0.9061369895935059,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 12992
+    },
+    {
+      "epoch": 0.12993,
+      "grad_norm": 0.9541290402412415,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 12993
+    },
+    {
+      "epoch": 0.12994,
+      "grad_norm": 0.9159508347511292,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 12994
+    },
+    {
+      "epoch": 0.12995,
+      "grad_norm": 0.8194580078125,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 12995
+    },
+    {
+      "epoch": 0.12996,
+      "grad_norm": 0.799264669418335,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 12996
+    },
+    {
+      "epoch": 0.12997,
+      "grad_norm": 0.88383948802948,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 12997
+    },
+    {
+      "epoch": 0.12998,
+      "grad_norm": 0.8685938715934753,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 12998
+    },
+    {
+      "epoch": 0.12999,
+      "grad_norm": 0.8577209115028381,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 12999
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.8677167296409607,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 13000
+    },
+    {
+      "epoch": 0.13001,
+      "grad_norm": 0.9226064085960388,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 13001
+    },
+    {
+      "epoch": 0.13002,
+      "grad_norm": 0.8852340579032898,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 13002
+    },
+    {
+      "epoch": 0.13003,
+      "grad_norm": 0.9692376852035522,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 13003
+    },
+    {
+      "epoch": 0.13004,
+      "grad_norm": 0.9139446020126343,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 13004
+    },
+    {
+      "epoch": 0.13005,
+      "grad_norm": 0.7610192894935608,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 13005
+    },
+    {
+      "epoch": 0.13006,
+      "grad_norm": 0.7845240235328674,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 13006
+    },
+    {
+      "epoch": 0.13007,
+      "grad_norm": 0.826093852519989,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 13007
+    },
+    {
+      "epoch": 0.13008,
+      "grad_norm": 0.8527858853340149,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 13008
+    },
+    {
+      "epoch": 0.13009,
+      "grad_norm": 0.8826994895935059,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 13009
+    },
+    {
+      "epoch": 0.1301,
+      "grad_norm": 0.8605583906173706,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 13010
+    },
+    {
+      "epoch": 0.13011,
+      "grad_norm": 0.905889093875885,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 13011
+    },
+    {
+      "epoch": 0.13012,
+      "grad_norm": 0.9393042325973511,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 13012
+    },
+    {
+      "epoch": 0.13013,
+      "grad_norm": 0.8369143605232239,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 13013
+    },
+    {
+      "epoch": 0.13014,
+      "grad_norm": 0.8191674947738647,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 13014
+    },
+    {
+      "epoch": 0.13015,
+      "grad_norm": 0.8748950362205505,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 13015
+    },
+    {
+      "epoch": 0.13016,
+      "grad_norm": 0.9429699778556824,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 13016
+    },
+    {
+      "epoch": 0.13017,
+      "grad_norm": 0.8982075452804565,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 13017
+    },
+    {
+      "epoch": 0.13018,
+      "grad_norm": 0.9404157400131226,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 13018
+    },
+    {
+      "epoch": 0.13019,
+      "grad_norm": 1.006630539894104,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 13019
+    },
+    {
+      "epoch": 0.1302,
+      "grad_norm": 0.8865049481391907,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 13020
+    },
+    {
+      "epoch": 0.13021,
+      "grad_norm": 0.7027060985565186,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 13021
+    },
+    {
+      "epoch": 0.13022,
+      "grad_norm": 0.6569135785102844,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 13022
+    },
+    {
+      "epoch": 0.13023,
+      "grad_norm": 0.6144932508468628,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 13023
+    },
+    {
+      "epoch": 0.13024,
+      "grad_norm": 0.6117271780967712,
+      "learning_rate": 0.003,
+      "loss": 3.97,
+      "step": 13024
+    },
+    {
+      "epoch": 0.13025,
+      "grad_norm": 0.5788269639015198,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 13025
+    },
+    {
+      "epoch": 0.13026,
+      "grad_norm": 0.6472201347351074,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 13026
+    },
+    {
+      "epoch": 0.13027,
+      "grad_norm": 0.7238059043884277,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 13027
+    },
+    {
+      "epoch": 0.13028,
+      "grad_norm": 0.7223124504089355,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 13028
+    },
+    {
+      "epoch": 0.13029,
+      "grad_norm": 0.7082488536834717,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 13029
+    },
+    {
+      "epoch": 0.1303,
+      "grad_norm": 0.8084279298782349,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 13030
+    },
+    {
+      "epoch": 0.13031,
+      "grad_norm": 0.9165552854537964,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 13031
+    },
+    {
+      "epoch": 0.13032,
+      "grad_norm": 0.9037119150161743,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 13032
+    },
+    {
+      "epoch": 0.13033,
+      "grad_norm": 0.9617390036582947,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 13033
+    },
+    {
+      "epoch": 0.13034,
+      "grad_norm": 0.9257889986038208,
+      "learning_rate": 0.003,
+      "loss": 4.0471,
+      "step": 13034
+    },
+    {
+      "epoch": 0.13035,
+      "grad_norm": 0.8421040773391724,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 13035
+    },
+    {
+      "epoch": 0.13036,
+      "grad_norm": 0.8228592276573181,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 13036
+    },
+    {
+      "epoch": 0.13037,
+      "grad_norm": 0.8626160025596619,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 13037
+    },
+    {
+      "epoch": 0.13038,
+      "grad_norm": 0.7961760759353638,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 13038
+    },
+    {
+      "epoch": 0.13039,
+      "grad_norm": 0.8328917026519775,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 13039
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.858963668346405,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 13040
+    },
+    {
+      "epoch": 0.13041,
+      "grad_norm": 1.034338355064392,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 13041
+    },
+    {
+      "epoch": 0.13042,
+      "grad_norm": 1.2494183778762817,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 13042
+    },
+    {
+      "epoch": 0.13043,
+      "grad_norm": 0.5884882807731628,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 13043
+    },
+    {
+      "epoch": 0.13044,
+      "grad_norm": 0.7131565809249878,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 13044
+    },
+    {
+      "epoch": 0.13045,
+      "grad_norm": 0.7175974249839783,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 13045
+    },
+    {
+      "epoch": 0.13046,
+      "grad_norm": 0.746105432510376,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 13046
+    },
+    {
+      "epoch": 0.13047,
+      "grad_norm": 0.8767604231834412,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 13047
+    },
+    {
+      "epoch": 0.13048,
+      "grad_norm": 0.8941057920455933,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 13048
+    },
+    {
+      "epoch": 0.13049,
+      "grad_norm": 0.8328637480735779,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 13049
+    },
+    {
+      "epoch": 0.1305,
+      "grad_norm": 0.8061503171920776,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 13050
+    },
+    {
+      "epoch": 0.13051,
+      "grad_norm": 0.8265740871429443,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 13051
+    },
+    {
+      "epoch": 0.13052,
+      "grad_norm": 0.845345139503479,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 13052
+    },
+    {
+      "epoch": 0.13053,
+      "grad_norm": 0.7674865126609802,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 13053
+    },
+    {
+      "epoch": 0.13054,
+      "grad_norm": 0.9645304679870605,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 13054
+    },
+    {
+      "epoch": 0.13055,
+      "grad_norm": 1.078338623046875,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 13055
+    },
+    {
+      "epoch": 0.13056,
+      "grad_norm": 0.8772696852684021,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 13056
+    },
+    {
+      "epoch": 0.13057,
+      "grad_norm": 0.8021833300590515,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 13057
+    },
+    {
+      "epoch": 0.13058,
+      "grad_norm": 0.9574021697044373,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 13058
+    },
+    {
+      "epoch": 0.13059,
+      "grad_norm": 1.120952844619751,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 13059
+    },
+    {
+      "epoch": 0.1306,
+      "grad_norm": 0.8463264107704163,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 13060
+    },
+    {
+      "epoch": 0.13061,
+      "grad_norm": 0.8074550032615662,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 13061
+    },
+    {
+      "epoch": 0.13062,
+      "grad_norm": 0.7267398238182068,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 13062
+    },
+    {
+      "epoch": 0.13063,
+      "grad_norm": 0.7274813652038574,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 13063
+    },
+    {
+      "epoch": 0.13064,
+      "grad_norm": 0.6505253314971924,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 13064
+    },
+    {
+      "epoch": 0.13065,
+      "grad_norm": 0.6174700260162354,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 13065
+    },
+    {
+      "epoch": 0.13066,
+      "grad_norm": 0.6143016219139099,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 13066
+    },
+    {
+      "epoch": 0.13067,
+      "grad_norm": 0.6251732707023621,
+      "learning_rate": 0.003,
+      "loss": 4.0619,
+      "step": 13067
+    },
+    {
+      "epoch": 0.13068,
+      "grad_norm": 0.6309815645217896,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 13068
+    },
+    {
+      "epoch": 0.13069,
+      "grad_norm": 0.6709125638008118,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 13069
+    },
+    {
+      "epoch": 0.1307,
+      "grad_norm": 0.6948953866958618,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 13070
+    },
+    {
+      "epoch": 0.13071,
+      "grad_norm": 0.7670696377754211,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 13071
+    },
+    {
+      "epoch": 0.13072,
+      "grad_norm": 0.8004587292671204,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 13072
+    },
+    {
+      "epoch": 0.13073,
+      "grad_norm": 0.8366334438323975,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 13073
+    },
+    {
+      "epoch": 0.13074,
+      "grad_norm": 0.8868482112884521,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 13074
+    },
+    {
+      "epoch": 0.13075,
+      "grad_norm": 0.8429307341575623,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 13075
+    },
+    {
+      "epoch": 0.13076,
+      "grad_norm": 0.7455659508705139,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 13076
+    },
+    {
+      "epoch": 0.13077,
+      "grad_norm": 0.8017807602882385,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 13077
+    },
+    {
+      "epoch": 0.13078,
+      "grad_norm": 0.9632897973060608,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 13078
+    },
+    {
+      "epoch": 0.13079,
+      "grad_norm": 1.227366328239441,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 13079
+    },
+    {
+      "epoch": 0.1308,
+      "grad_norm": 0.9098268747329712,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 13080
+    },
+    {
+      "epoch": 0.13081,
+      "grad_norm": 1.0370805263519287,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 13081
+    },
+    {
+      "epoch": 0.13082,
+      "grad_norm": 1.001566767692566,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 13082
+    },
+    {
+      "epoch": 0.13083,
+      "grad_norm": 0.8821815848350525,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 13083
+    },
+    {
+      "epoch": 0.13084,
+      "grad_norm": 0.7804445028305054,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 13084
+    },
+    {
+      "epoch": 0.13085,
+      "grad_norm": 0.8916219472885132,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 13085
+    },
+    {
+      "epoch": 0.13086,
+      "grad_norm": 0.7816018462181091,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 13086
+    },
+    {
+      "epoch": 0.13087,
+      "grad_norm": 0.7173853516578674,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 13087
+    },
+    {
+      "epoch": 0.13088,
+      "grad_norm": 0.6855860948562622,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 13088
+    },
+    {
+      "epoch": 0.13089,
+      "grad_norm": 0.6942515969276428,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 13089
+    },
+    {
+      "epoch": 0.1309,
+      "grad_norm": 0.7197878956794739,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 13090
+    },
+    {
+      "epoch": 0.13091,
+      "grad_norm": 0.8413582444190979,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 13091
+    },
+    {
+      "epoch": 0.13092,
+      "grad_norm": 0.9425943493843079,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 13092
+    },
+    {
+      "epoch": 0.13093,
+      "grad_norm": 1.069011926651001,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 13093
+    },
+    {
+      "epoch": 0.13094,
+      "grad_norm": 0.8811655044555664,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 13094
+    },
+    {
+      "epoch": 0.13095,
+      "grad_norm": 0.6359062194824219,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 13095
+    },
+    {
+      "epoch": 0.13096,
+      "grad_norm": 0.619958221912384,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13096
+    },
+    {
+      "epoch": 0.13097,
+      "grad_norm": 0.5777302384376526,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 13097
+    },
+    {
+      "epoch": 0.13098,
+      "grad_norm": 0.6459702849388123,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 13098
+    },
+    {
+      "epoch": 0.13099,
+      "grad_norm": 0.6265745162963867,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 13099
+    },
+    {
+      "epoch": 0.131,
+      "grad_norm": 0.6241823434829712,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 13100
+    },
+    {
+      "epoch": 0.13101,
+      "grad_norm": 0.6626433730125427,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 13101
+    },
+    {
+      "epoch": 0.13102,
+      "grad_norm": 0.7558318376541138,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 13102
+    },
+    {
+      "epoch": 0.13103,
+      "grad_norm": 0.7066266536712646,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 13103
+    },
+    {
+      "epoch": 0.13104,
+      "grad_norm": 0.7672896981239319,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 13104
+    },
+    {
+      "epoch": 0.13105,
+      "grad_norm": 0.9246180653572083,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 13105
+    },
+    {
+      "epoch": 0.13106,
+      "grad_norm": 0.9987875819206238,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 13106
+    },
+    {
+      "epoch": 0.13107,
+      "grad_norm": 1.169500470161438,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 13107
+    },
+    {
+      "epoch": 0.13108,
+      "grad_norm": 0.9271288514137268,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 13108
+    },
+    {
+      "epoch": 0.13109,
+      "grad_norm": 0.7768493890762329,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 13109
+    },
+    {
+      "epoch": 0.1311,
+      "grad_norm": 0.705818772315979,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 13110
+    },
+    {
+      "epoch": 0.13111,
+      "grad_norm": 0.6956490278244019,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 13111
+    },
+    {
+      "epoch": 0.13112,
+      "grad_norm": 0.7605606913566589,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 13112
+    },
+    {
+      "epoch": 0.13113,
+      "grad_norm": 0.7887161374092102,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 13113
+    },
+    {
+      "epoch": 0.13114,
+      "grad_norm": 0.82375168800354,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 13114
+    },
+    {
+      "epoch": 0.13115,
+      "grad_norm": 0.8668541312217712,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 13115
+    },
+    {
+      "epoch": 0.13116,
+      "grad_norm": 1.0358824729919434,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 13116
+    },
+    {
+      "epoch": 0.13117,
+      "grad_norm": 1.1184041500091553,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 13117
+    },
+    {
+      "epoch": 0.13118,
+      "grad_norm": 0.7959602475166321,
+      "learning_rate": 0.003,
+      "loss": 4.0382,
+      "step": 13118
+    },
+    {
+      "epoch": 0.13119,
+      "grad_norm": 0.6026773452758789,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 13119
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.7496844530105591,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 13120
+    },
+    {
+      "epoch": 0.13121,
+      "grad_norm": 0.7308691143989563,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 13121
+    },
+    {
+      "epoch": 0.13122,
+      "grad_norm": 0.6883700489997864,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 13122
+    },
+    {
+      "epoch": 0.13123,
+      "grad_norm": 0.6974976062774658,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 13123
+    },
+    {
+      "epoch": 0.13124,
+      "grad_norm": 0.8474065065383911,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 13124
+    },
+    {
+      "epoch": 0.13125,
+      "grad_norm": 1.0197933912277222,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 13125
+    },
+    {
+      "epoch": 0.13126,
+      "grad_norm": 1.2125210762023926,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 13126
+    },
+    {
+      "epoch": 0.13127,
+      "grad_norm": 0.7846187949180603,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 13127
+    },
+    {
+      "epoch": 0.13128,
+      "grad_norm": 0.8484559655189514,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 13128
+    },
+    {
+      "epoch": 0.13129,
+      "grad_norm": 1.1045933961868286,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 13129
+    },
+    {
+      "epoch": 0.1313,
+      "grad_norm": 0.9178270101547241,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 13130
+    },
+    {
+      "epoch": 0.13131,
+      "grad_norm": 0.8832055330276489,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 13131
+    },
+    {
+      "epoch": 0.13132,
+      "grad_norm": 0.8153164386749268,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 13132
+    },
+    {
+      "epoch": 0.13133,
+      "grad_norm": 0.9219346046447754,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 13133
+    },
+    {
+      "epoch": 0.13134,
+      "grad_norm": 0.8553265929222107,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 13134
+    },
+    {
+      "epoch": 0.13135,
+      "grad_norm": 0.8976668119430542,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 13135
+    },
+    {
+      "epoch": 0.13136,
+      "grad_norm": 0.882498562335968,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 13136
+    },
+    {
+      "epoch": 0.13137,
+      "grad_norm": 0.9435257911682129,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 13137
+    },
+    {
+      "epoch": 0.13138,
+      "grad_norm": 0.9073676466941833,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 13138
+    },
+    {
+      "epoch": 0.13139,
+      "grad_norm": 0.9786491394042969,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 13139
+    },
+    {
+      "epoch": 0.1314,
+      "grad_norm": 0.9401513934135437,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 13140
+    },
+    {
+      "epoch": 0.13141,
+      "grad_norm": 0.8031672835350037,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 13141
+    },
+    {
+      "epoch": 0.13142,
+      "grad_norm": 0.9604483842849731,
+      "learning_rate": 0.003,
+      "loss": 4.0634,
+      "step": 13142
+    },
+    {
+      "epoch": 0.13143,
+      "grad_norm": 0.9621740579605103,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 13143
+    },
+    {
+      "epoch": 0.13144,
+      "grad_norm": 0.7710050344467163,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 13144
+    },
+    {
+      "epoch": 0.13145,
+      "grad_norm": 0.8008609414100647,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 13145
+    },
+    {
+      "epoch": 0.13146,
+      "grad_norm": 0.6710706353187561,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 13146
+    },
+    {
+      "epoch": 0.13147,
+      "grad_norm": 0.7560577392578125,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 13147
+    },
+    {
+      "epoch": 0.13148,
+      "grad_norm": 0.781263530254364,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 13148
+    },
+    {
+      "epoch": 0.13149,
+      "grad_norm": 0.7743945717811584,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 13149
+    },
+    {
+      "epoch": 0.1315,
+      "grad_norm": 0.7937132716178894,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 13150
+    },
+    {
+      "epoch": 0.13151,
+      "grad_norm": 0.8247299194335938,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 13151
+    },
+    {
+      "epoch": 0.13152,
+      "grad_norm": 0.8840051889419556,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 13152
+    },
+    {
+      "epoch": 0.13153,
+      "grad_norm": 0.8960859775543213,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 13153
+    },
+    {
+      "epoch": 0.13154,
+      "grad_norm": 0.9697312712669373,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 13154
+    },
+    {
+      "epoch": 0.13155,
+      "grad_norm": 1.0689184665679932,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 13155
+    },
+    {
+      "epoch": 0.13156,
+      "grad_norm": 1.0230175256729126,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 13156
+    },
+    {
+      "epoch": 0.13157,
+      "grad_norm": 0.8293696045875549,
+      "learning_rate": 0.003,
+      "loss": 4.061,
+      "step": 13157
+    },
+    {
+      "epoch": 0.13158,
+      "grad_norm": 0.6961458325386047,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 13158
+    },
+    {
+      "epoch": 0.13159,
+      "grad_norm": 0.6905359625816345,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 13159
+    },
+    {
+      "epoch": 0.1316,
+      "grad_norm": 0.6143713593482971,
+      "learning_rate": 0.003,
+      "loss": 3.9762,
+      "step": 13160
+    },
+    {
+      "epoch": 0.13161,
+      "grad_norm": 0.5593964457511902,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 13161
+    },
+    {
+      "epoch": 0.13162,
+      "grad_norm": 0.5873188376426697,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 13162
+    },
+    {
+      "epoch": 0.13163,
+      "grad_norm": 0.6039851307868958,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 13163
+    },
+    {
+      "epoch": 0.13164,
+      "grad_norm": 0.6437452435493469,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 13164
+    },
+    {
+      "epoch": 0.13165,
+      "grad_norm": 0.696509599685669,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 13165
+    },
+    {
+      "epoch": 0.13166,
+      "grad_norm": 0.6366514563560486,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 13166
+    },
+    {
+      "epoch": 0.13167,
+      "grad_norm": 0.5992265939712524,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 13167
+    },
+    {
+      "epoch": 0.13168,
+      "grad_norm": 0.7489793300628662,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 13168
+    },
+    {
+      "epoch": 0.13169,
+      "grad_norm": 0.8105339407920837,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 13169
+    },
+    {
+      "epoch": 0.1317,
+      "grad_norm": 0.8892659544944763,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 13170
+    },
+    {
+      "epoch": 0.13171,
+      "grad_norm": 0.9990162253379822,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 13171
+    },
+    {
+      "epoch": 0.13172,
+      "grad_norm": 1.0135401487350464,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 13172
+    },
+    {
+      "epoch": 0.13173,
+      "grad_norm": 0.829917311668396,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 13173
+    },
+    {
+      "epoch": 0.13174,
+      "grad_norm": 0.6570572853088379,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 13174
+    },
+    {
+      "epoch": 0.13175,
+      "grad_norm": 0.654499888420105,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 13175
+    },
+    {
+      "epoch": 0.13176,
+      "grad_norm": 0.6734559535980225,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 13176
+    },
+    {
+      "epoch": 0.13177,
+      "grad_norm": 0.708061695098877,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 13177
+    },
+    {
+      "epoch": 0.13178,
+      "grad_norm": 0.6476017832756042,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 13178
+    },
+    {
+      "epoch": 0.13179,
+      "grad_norm": 0.7670444846153259,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 13179
+    },
+    {
+      "epoch": 0.1318,
+      "grad_norm": 0.9231152534484863,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 13180
+    },
+    {
+      "epoch": 0.13181,
+      "grad_norm": 1.0558562278747559,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 13181
+    },
+    {
+      "epoch": 0.13182,
+      "grad_norm": 1.0957084894180298,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 13182
+    },
+    {
+      "epoch": 0.13183,
+      "grad_norm": 0.754009485244751,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 13183
+    },
+    {
+      "epoch": 0.13184,
+      "grad_norm": 0.5575379133224487,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 13184
+    },
+    {
+      "epoch": 0.13185,
+      "grad_norm": 0.7497402429580688,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 13185
+    },
+    {
+      "epoch": 0.13186,
+      "grad_norm": 0.9239612221717834,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 13186
+    },
+    {
+      "epoch": 0.13187,
+      "grad_norm": 1.0049750804901123,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 13187
+    },
+    {
+      "epoch": 0.13188,
+      "grad_norm": 0.9554564952850342,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 13188
+    },
+    {
+      "epoch": 0.13189,
+      "grad_norm": 0.8094347715377808,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 13189
+    },
+    {
+      "epoch": 0.1319,
+      "grad_norm": 0.7651543617248535,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 13190
+    },
+    {
+      "epoch": 0.13191,
+      "grad_norm": 0.7921789288520813,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 13191
+    },
+    {
+      "epoch": 0.13192,
+      "grad_norm": 0.7744830846786499,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 13192
+    },
+    {
+      "epoch": 0.13193,
+      "grad_norm": 0.8157005310058594,
+      "learning_rate": 0.003,
+      "loss": 3.9675,
+      "step": 13193
+    },
+    {
+      "epoch": 0.13194,
+      "grad_norm": 0.7987657785415649,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 13194
+    },
+    {
+      "epoch": 0.13195,
+      "grad_norm": 0.7527452111244202,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 13195
+    },
+    {
+      "epoch": 0.13196,
+      "grad_norm": 0.6971560120582581,
+      "learning_rate": 0.003,
+      "loss": 3.9714,
+      "step": 13196
+    },
+    {
+      "epoch": 0.13197,
+      "grad_norm": 0.6103990077972412,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 13197
+    },
+    {
+      "epoch": 0.13198,
+      "grad_norm": 0.5640221834182739,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 13198
+    },
+    {
+      "epoch": 0.13199,
+      "grad_norm": 0.512528657913208,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 13199
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.535915195941925,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 13200
+    },
+    {
+      "epoch": 0.13201,
+      "grad_norm": 0.5550373196601868,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 13201
+    },
+    {
+      "epoch": 0.13202,
+      "grad_norm": 0.821470320224762,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 13202
+    },
+    {
+      "epoch": 0.13203,
+      "grad_norm": 1.1169755458831787,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 13203
+    },
+    {
+      "epoch": 0.13204,
+      "grad_norm": 0.9754448533058167,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 13204
+    },
+    {
+      "epoch": 0.13205,
+      "grad_norm": 1.0438214540481567,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 13205
+    },
+    {
+      "epoch": 0.13206,
+      "grad_norm": 0.8361873626708984,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 13206
+    },
+    {
+      "epoch": 0.13207,
+      "grad_norm": 0.7784366011619568,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 13207
+    },
+    {
+      "epoch": 0.13208,
+      "grad_norm": 0.7746293544769287,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 13208
+    },
+    {
+      "epoch": 0.13209,
+      "grad_norm": 0.7999920845031738,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 13209
+    },
+    {
+      "epoch": 0.1321,
+      "grad_norm": 0.8070949912071228,
+      "learning_rate": 0.003,
+      "loss": 4.0554,
+      "step": 13210
+    },
+    {
+      "epoch": 0.13211,
+      "grad_norm": 0.9598238468170166,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 13211
+    },
+    {
+      "epoch": 0.13212,
+      "grad_norm": 1.0701309442520142,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 13212
+    },
+    {
+      "epoch": 0.13213,
+      "grad_norm": 1.023281216621399,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 13213
+    },
+    {
+      "epoch": 0.13214,
+      "grad_norm": 0.9652174115180969,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 13214
+    },
+    {
+      "epoch": 0.13215,
+      "grad_norm": 0.9821644425392151,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 13215
+    },
+    {
+      "epoch": 0.13216,
+      "grad_norm": 1.14763605594635,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 13216
+    },
+    {
+      "epoch": 0.13217,
+      "grad_norm": 0.7339428067207336,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 13217
+    },
+    {
+      "epoch": 0.13218,
+      "grad_norm": 0.7249696254730225,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 13218
+    },
+    {
+      "epoch": 0.13219,
+      "grad_norm": 0.7884764075279236,
+      "learning_rate": 0.003,
+      "loss": 4.0719,
+      "step": 13219
+    },
+    {
+      "epoch": 0.1322,
+      "grad_norm": 0.7975813150405884,
+      "learning_rate": 0.003,
+      "loss": 4.0575,
+      "step": 13220
+    },
+    {
+      "epoch": 0.13221,
+      "grad_norm": 0.8157402276992798,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 13221
+    },
+    {
+      "epoch": 0.13222,
+      "grad_norm": 0.6767211556434631,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 13222
+    },
+    {
+      "epoch": 0.13223,
+      "grad_norm": 0.6843516826629639,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 13223
+    },
+    {
+      "epoch": 0.13224,
+      "grad_norm": 0.5921446084976196,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 13224
+    },
+    {
+      "epoch": 0.13225,
+      "grad_norm": 0.5578984022140503,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 13225
+    },
+    {
+      "epoch": 0.13226,
+      "grad_norm": 0.5266581177711487,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 13226
+    },
+    {
+      "epoch": 0.13227,
+      "grad_norm": 0.5994659662246704,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 13227
+    },
+    {
+      "epoch": 0.13228,
+      "grad_norm": 0.8346339464187622,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 13228
+    },
+    {
+      "epoch": 0.13229,
+      "grad_norm": 1.1519432067871094,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 13229
+    },
+    {
+      "epoch": 0.1323,
+      "grad_norm": 0.9563078284263611,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 13230
+    },
+    {
+      "epoch": 0.13231,
+      "grad_norm": 0.9070895314216614,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 13231
+    },
+    {
+      "epoch": 0.13232,
+      "grad_norm": 0.8742591738700867,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 13232
+    },
+    {
+      "epoch": 0.13233,
+      "grad_norm": 0.8203812837600708,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 13233
+    },
+    {
+      "epoch": 0.13234,
+      "grad_norm": 0.8483673930168152,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 13234
+    },
+    {
+      "epoch": 0.13235,
+      "grad_norm": 0.8297470808029175,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 13235
+    },
+    {
+      "epoch": 0.13236,
+      "grad_norm": 0.8401219844818115,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 13236
+    },
+    {
+      "epoch": 0.13237,
+      "grad_norm": 0.899707019329071,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 13237
+    },
+    {
+      "epoch": 0.13238,
+      "grad_norm": 0.8262010216712952,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 13238
+    },
+    {
+      "epoch": 0.13239,
+      "grad_norm": 0.691097617149353,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 13239
+    },
+    {
+      "epoch": 0.1324,
+      "grad_norm": 0.6713021993637085,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 13240
+    },
+    {
+      "epoch": 0.13241,
+      "grad_norm": 0.7988129258155823,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 13241
+    },
+    {
+      "epoch": 0.13242,
+      "grad_norm": 0.9341169595718384,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 13242
+    },
+    {
+      "epoch": 0.13243,
+      "grad_norm": 1.187171459197998,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 13243
+    },
+    {
+      "epoch": 0.13244,
+      "grad_norm": 0.8251987099647522,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 13244
+    },
+    {
+      "epoch": 0.13245,
+      "grad_norm": 0.6872908473014832,
+      "learning_rate": 0.003,
+      "loss": 3.9722,
+      "step": 13245
+    },
+    {
+      "epoch": 0.13246,
+      "grad_norm": 0.6644535064697266,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 13246
+    },
+    {
+      "epoch": 0.13247,
+      "grad_norm": 0.7135847210884094,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 13247
+    },
+    {
+      "epoch": 0.13248,
+      "grad_norm": 0.7650418281555176,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 13248
+    },
+    {
+      "epoch": 0.13249,
+      "grad_norm": 0.7449178695678711,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 13249
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.664030134677887,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 13250
+    },
+    {
+      "epoch": 0.13251,
+      "grad_norm": 0.6319637298583984,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 13251
+    },
+    {
+      "epoch": 0.13252,
+      "grad_norm": 0.7340186834335327,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 13252
+    },
+    {
+      "epoch": 0.13253,
+      "grad_norm": 0.7993084788322449,
+      "learning_rate": 0.003,
+      "loss": 3.9771,
+      "step": 13253
+    },
+    {
+      "epoch": 0.13254,
+      "grad_norm": 0.986703097820282,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 13254
+    },
+    {
+      "epoch": 0.13255,
+      "grad_norm": 1.1799412965774536,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 13255
+    },
+    {
+      "epoch": 0.13256,
+      "grad_norm": 0.7233451008796692,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 13256
+    },
+    {
+      "epoch": 0.13257,
+      "grad_norm": 0.5764203071594238,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 13257
+    },
+    {
+      "epoch": 0.13258,
+      "grad_norm": 0.6633540391921997,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 13258
+    },
+    {
+      "epoch": 0.13259,
+      "grad_norm": 0.6555531024932861,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 13259
+    },
+    {
+      "epoch": 0.1326,
+      "grad_norm": 0.6837939023971558,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 13260
+    },
+    {
+      "epoch": 0.13261,
+      "grad_norm": 0.7089996337890625,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 13261
+    },
+    {
+      "epoch": 0.13262,
+      "grad_norm": 0.8657823204994202,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 13262
+    },
+    {
+      "epoch": 0.13263,
+      "grad_norm": 1.006532073020935,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 13263
+    },
+    {
+      "epoch": 0.13264,
+      "grad_norm": 0.9170596599578857,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 13264
+    },
+    {
+      "epoch": 0.13265,
+      "grad_norm": 0.8759198784828186,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 13265
+    },
+    {
+      "epoch": 0.13266,
+      "grad_norm": 0.9278095960617065,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 13266
+    },
+    {
+      "epoch": 0.13267,
+      "grad_norm": 0.9916574358940125,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 13267
+    },
+    {
+      "epoch": 0.13268,
+      "grad_norm": 0.9431864023208618,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 13268
+    },
+    {
+      "epoch": 0.13269,
+      "grad_norm": 0.8153677582740784,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 13269
+    },
+    {
+      "epoch": 0.1327,
+      "grad_norm": 0.8011820316314697,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 13270
+    },
+    {
+      "epoch": 0.13271,
+      "grad_norm": 0.7391328811645508,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 13271
+    },
+    {
+      "epoch": 0.13272,
+      "grad_norm": 0.7164938449859619,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 13272
+    },
+    {
+      "epoch": 0.13273,
+      "grad_norm": 0.6879814267158508,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 13273
+    },
+    {
+      "epoch": 0.13274,
+      "grad_norm": 0.6241834759712219,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 13274
+    },
+    {
+      "epoch": 0.13275,
+      "grad_norm": 0.6053652167320251,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 13275
+    },
+    {
+      "epoch": 0.13276,
+      "grad_norm": 0.6461558938026428,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 13276
+    },
+    {
+      "epoch": 0.13277,
+      "grad_norm": 0.6877732276916504,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 13277
+    },
+    {
+      "epoch": 0.13278,
+      "grad_norm": 0.9395847320556641,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 13278
+    },
+    {
+      "epoch": 0.13279,
+      "grad_norm": 1.2979737520217896,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 13279
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.7131104469299316,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 13280
+    },
+    {
+      "epoch": 0.13281,
+      "grad_norm": 0.7071772813796997,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 13281
+    },
+    {
+      "epoch": 0.13282,
+      "grad_norm": 0.7539332509040833,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 13282
+    },
+    {
+      "epoch": 0.13283,
+      "grad_norm": 0.7766580581665039,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 13283
+    },
+    {
+      "epoch": 0.13284,
+      "grad_norm": 0.8711881041526794,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 13284
+    },
+    {
+      "epoch": 0.13285,
+      "grad_norm": 0.9808143377304077,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 13285
+    },
+    {
+      "epoch": 0.13286,
+      "grad_norm": 1.0515544414520264,
+      "learning_rate": 0.003,
+      "loss": 4.0544,
+      "step": 13286
+    },
+    {
+      "epoch": 0.13287,
+      "grad_norm": 0.9690415859222412,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 13287
+    },
+    {
+      "epoch": 0.13288,
+      "grad_norm": 0.9867044687271118,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 13288
+    },
+    {
+      "epoch": 0.13289,
+      "grad_norm": 1.0373833179473877,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 13289
+    },
+    {
+      "epoch": 0.1329,
+      "grad_norm": 0.9455235600471497,
+      "learning_rate": 0.003,
+      "loss": 4.0587,
+      "step": 13290
+    },
+    {
+      "epoch": 0.13291,
+      "grad_norm": 0.9202231168746948,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 13291
+    },
+    {
+      "epoch": 0.13292,
+      "grad_norm": 1.135408878326416,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 13292
+    },
+    {
+      "epoch": 0.13293,
+      "grad_norm": 1.058363914489746,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 13293
+    },
+    {
+      "epoch": 0.13294,
+      "grad_norm": 0.9352374076843262,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 13294
+    },
+    {
+      "epoch": 0.13295,
+      "grad_norm": 0.8602332472801208,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 13295
+    },
+    {
+      "epoch": 0.13296,
+      "grad_norm": 0.9056118130683899,
+      "learning_rate": 0.003,
+      "loss": 4.0655,
+      "step": 13296
+    },
+    {
+      "epoch": 0.13297,
+      "grad_norm": 0.8816995024681091,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 13297
+    },
+    {
+      "epoch": 0.13298,
+      "grad_norm": 0.8176944851875305,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 13298
+    },
+    {
+      "epoch": 0.13299,
+      "grad_norm": 0.7336651086807251,
+      "learning_rate": 0.003,
+      "loss": 4.0716,
+      "step": 13299
+    },
+    {
+      "epoch": 0.133,
+      "grad_norm": 0.7953958511352539,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 13300
+    },
+    {
+      "epoch": 0.13301,
+      "grad_norm": 0.6614307761192322,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 13301
+    },
+    {
+      "epoch": 0.13302,
+      "grad_norm": 0.6963691115379333,
+      "learning_rate": 0.003,
+      "loss": 3.9799,
+      "step": 13302
+    },
+    {
+      "epoch": 0.13303,
+      "grad_norm": 0.7363566756248474,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 13303
+    },
+    {
+      "epoch": 0.13304,
+      "grad_norm": 0.8909085988998413,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 13304
+    },
+    {
+      "epoch": 0.13305,
+      "grad_norm": 0.9495358467102051,
+      "learning_rate": 0.003,
+      "loss": 4.0493,
+      "step": 13305
+    },
+    {
+      "epoch": 0.13306,
+      "grad_norm": 0.7758327126502991,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 13306
+    },
+    {
+      "epoch": 0.13307,
+      "grad_norm": 0.6987218260765076,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 13307
+    },
+    {
+      "epoch": 0.13308,
+      "grad_norm": 0.7181601524353027,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 13308
+    },
+    {
+      "epoch": 0.13309,
+      "grad_norm": 0.621520459651947,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 13309
+    },
+    {
+      "epoch": 0.1331,
+      "grad_norm": 0.5995641350746155,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 13310
+    },
+    {
+      "epoch": 0.13311,
+      "grad_norm": 0.6506985425949097,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 13311
+    },
+    {
+      "epoch": 0.13312,
+      "grad_norm": 0.7738531827926636,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 13312
+    },
+    {
+      "epoch": 0.13313,
+      "grad_norm": 0.8469628691673279,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 13313
+    },
+    {
+      "epoch": 0.13314,
+      "grad_norm": 0.8940180540084839,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 13314
+    },
+    {
+      "epoch": 0.13315,
+      "grad_norm": 0.8369051218032837,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 13315
+    },
+    {
+      "epoch": 0.13316,
+      "grad_norm": 0.8768254518508911,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 13316
+    },
+    {
+      "epoch": 0.13317,
+      "grad_norm": 0.8258373141288757,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 13317
+    },
+    {
+      "epoch": 0.13318,
+      "grad_norm": 0.9554781913757324,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 13318
+    },
+    {
+      "epoch": 0.13319,
+      "grad_norm": 1.2251650094985962,
+      "learning_rate": 0.003,
+      "loss": 4.0402,
+      "step": 13319
+    },
+    {
+      "epoch": 0.1332,
+      "grad_norm": 1.1571624279022217,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 13320
+    },
+    {
+      "epoch": 0.13321,
+      "grad_norm": 0.7736831307411194,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 13321
+    },
+    {
+      "epoch": 0.13322,
+      "grad_norm": 0.7312729954719543,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 13322
+    },
+    {
+      "epoch": 0.13323,
+      "grad_norm": 0.7697762250900269,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 13323
+    },
+    {
+      "epoch": 0.13324,
+      "grad_norm": 0.8291617035865784,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 13324
+    },
+    {
+      "epoch": 0.13325,
+      "grad_norm": 0.773373544216156,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 13325
+    },
+    {
+      "epoch": 0.13326,
+      "grad_norm": 0.8538793325424194,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 13326
+    },
+    {
+      "epoch": 0.13327,
+      "grad_norm": 0.8671174049377441,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 13327
+    },
+    {
+      "epoch": 0.13328,
+      "grad_norm": 0.8660413026809692,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 13328
+    },
+    {
+      "epoch": 0.13329,
+      "grad_norm": 0.937990128993988,
+      "learning_rate": 0.003,
+      "loss": 4.0399,
+      "step": 13329
+    },
+    {
+      "epoch": 0.1333,
+      "grad_norm": 0.9119920134544373,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 13330
+    },
+    {
+      "epoch": 0.13331,
+      "grad_norm": 0.8983891606330872,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 13331
+    },
+    {
+      "epoch": 0.13332,
+      "grad_norm": 0.7864333987236023,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 13332
+    },
+    {
+      "epoch": 0.13333,
+      "grad_norm": 0.6092193126678467,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 13333
+    },
+    {
+      "epoch": 0.13334,
+      "grad_norm": 0.6237526535987854,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 13334
+    },
+    {
+      "epoch": 0.13335,
+      "grad_norm": 0.6614526510238647,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 13335
+    },
+    {
+      "epoch": 0.13336,
+      "grad_norm": 0.6877928376197815,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 13336
+    },
+    {
+      "epoch": 0.13337,
+      "grad_norm": 0.6516103744506836,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 13337
+    },
+    {
+      "epoch": 0.13338,
+      "grad_norm": 0.6226836442947388,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 13338
+    },
+    {
+      "epoch": 0.13339,
+      "grad_norm": 0.6081013679504395,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 13339
+    },
+    {
+      "epoch": 0.1334,
+      "grad_norm": 0.6231227517127991,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 13340
+    },
+    {
+      "epoch": 0.13341,
+      "grad_norm": 0.6606029868125916,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 13341
+    },
+    {
+      "epoch": 0.13342,
+      "grad_norm": 0.6467459797859192,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 13342
+    },
+    {
+      "epoch": 0.13343,
+      "grad_norm": 0.5785483121871948,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 13343
+    },
+    {
+      "epoch": 0.13344,
+      "grad_norm": 0.6745023727416992,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 13344
+    },
+    {
+      "epoch": 0.13345,
+      "grad_norm": 0.6457123756408691,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 13345
+    },
+    {
+      "epoch": 0.13346,
+      "grad_norm": 0.6489463448524475,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 13346
+    },
+    {
+      "epoch": 0.13347,
+      "grad_norm": 0.82561856508255,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 13347
+    },
+    {
+      "epoch": 0.13348,
+      "grad_norm": 1.219636082649231,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 13348
+    },
+    {
+      "epoch": 0.13349,
+      "grad_norm": 0.9405218362808228,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 13349
+    },
+    {
+      "epoch": 0.1335,
+      "grad_norm": 0.7687895894050598,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 13350
+    },
+    {
+      "epoch": 0.13351,
+      "grad_norm": 0.7437965869903564,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 13351
+    },
+    {
+      "epoch": 0.13352,
+      "grad_norm": 0.7874672412872314,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 13352
+    },
+    {
+      "epoch": 0.13353,
+      "grad_norm": 0.7944535613059998,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 13353
+    },
+    {
+      "epoch": 0.13354,
+      "grad_norm": 0.8526052236557007,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 13354
+    },
+    {
+      "epoch": 0.13355,
+      "grad_norm": 0.9036006331443787,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 13355
+    },
+    {
+      "epoch": 0.13356,
+      "grad_norm": 1.0086641311645508,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 13356
+    },
+    {
+      "epoch": 0.13357,
+      "grad_norm": 1.0628443956375122,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 13357
+    },
+    {
+      "epoch": 0.13358,
+      "grad_norm": 0.9245560169219971,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 13358
+    },
+    {
+      "epoch": 0.13359,
+      "grad_norm": 0.9113062024116516,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 13359
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.8313310742378235,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 13360
+    },
+    {
+      "epoch": 0.13361,
+      "grad_norm": 0.7733218669891357,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 13361
+    },
+    {
+      "epoch": 0.13362,
+      "grad_norm": 0.8257316946983337,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 13362
+    },
+    {
+      "epoch": 0.13363,
+      "grad_norm": 0.9762942790985107,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 13363
+    },
+    {
+      "epoch": 0.13364,
+      "grad_norm": 1.0518579483032227,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 13364
+    },
+    {
+      "epoch": 0.13365,
+      "grad_norm": 1.0676583051681519,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 13365
+    },
+    {
+      "epoch": 0.13366,
+      "grad_norm": 1.1223315000534058,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 13366
+    },
+    {
+      "epoch": 0.13367,
+      "grad_norm": 0.8500106930732727,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 13367
+    },
+    {
+      "epoch": 0.13368,
+      "grad_norm": 0.793196976184845,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 13368
+    },
+    {
+      "epoch": 0.13369,
+      "grad_norm": 0.8577861785888672,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 13369
+    },
+    {
+      "epoch": 0.1337,
+      "grad_norm": 0.792363166809082,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 13370
+    },
+    {
+      "epoch": 0.13371,
+      "grad_norm": 0.7480350136756897,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 13371
+    },
+    {
+      "epoch": 0.13372,
+      "grad_norm": 0.7009800672531128,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 13372
+    },
+    {
+      "epoch": 0.13373,
+      "grad_norm": 0.6068524122238159,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 13373
+    },
+    {
+      "epoch": 0.13374,
+      "grad_norm": 0.5666672587394714,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 13374
+    },
+    {
+      "epoch": 0.13375,
+      "grad_norm": 0.5108823776245117,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 13375
+    },
+    {
+      "epoch": 0.13376,
+      "grad_norm": 0.5774194598197937,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 13376
+    },
+    {
+      "epoch": 0.13377,
+      "grad_norm": 0.6002748608589172,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 13377
+    },
+    {
+      "epoch": 0.13378,
+      "grad_norm": 0.7773520946502686,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 13378
+    },
+    {
+      "epoch": 0.13379,
+      "grad_norm": 1.0793273448944092,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 13379
+    },
+    {
+      "epoch": 0.1338,
+      "grad_norm": 1.0280603170394897,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 13380
+    },
+    {
+      "epoch": 0.13381,
+      "grad_norm": 1.089820384979248,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 13381
+    },
+    {
+      "epoch": 0.13382,
+      "grad_norm": 0.8872218132019043,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 13382
+    },
+    {
+      "epoch": 0.13383,
+      "grad_norm": 0.7349159121513367,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 13383
+    },
+    {
+      "epoch": 0.13384,
+      "grad_norm": 0.6866400837898254,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 13384
+    },
+    {
+      "epoch": 0.13385,
+      "grad_norm": 0.7582387328147888,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 13385
+    },
+    {
+      "epoch": 0.13386,
+      "grad_norm": 0.8372118473052979,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 13386
+    },
+    {
+      "epoch": 0.13387,
+      "grad_norm": 0.9449887871742249,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 13387
+    },
+    {
+      "epoch": 0.13388,
+      "grad_norm": 1.0124119520187378,
+      "learning_rate": 0.003,
+      "loss": 4.0452,
+      "step": 13388
+    },
+    {
+      "epoch": 0.13389,
+      "grad_norm": 1.1869573593139648,
+      "learning_rate": 0.003,
+      "loss": 4.0505,
+      "step": 13389
+    },
+    {
+      "epoch": 0.1339,
+      "grad_norm": 0.8177801370620728,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 13390
+    },
+    {
+      "epoch": 0.13391,
+      "grad_norm": 0.8380702137947083,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 13391
+    },
+    {
+      "epoch": 0.13392,
+      "grad_norm": 0.9253681898117065,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 13392
+    },
+    {
+      "epoch": 0.13393,
+      "grad_norm": 0.8196813464164734,
+      "learning_rate": 0.003,
+      "loss": 4.0424,
+      "step": 13393
+    },
+    {
+      "epoch": 0.13394,
+      "grad_norm": 0.7437666058540344,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 13394
+    },
+    {
+      "epoch": 0.13395,
+      "grad_norm": 0.7029215097427368,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13395
+    },
+    {
+      "epoch": 0.13396,
+      "grad_norm": 0.6601882576942444,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 13396
+    },
+    {
+      "epoch": 0.13397,
+      "grad_norm": 0.5576168894767761,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 13397
+    },
+    {
+      "epoch": 0.13398,
+      "grad_norm": 0.5518843531608582,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 13398
+    },
+    {
+      "epoch": 0.13399,
+      "grad_norm": 0.6258888840675354,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 13399
+    },
+    {
+      "epoch": 0.134,
+      "grad_norm": 0.6261287331581116,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 13400
+    },
+    {
+      "epoch": 0.13401,
+      "grad_norm": 0.6984511017799377,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 13401
+    },
+    {
+      "epoch": 0.13402,
+      "grad_norm": 0.9650695323944092,
+      "learning_rate": 0.003,
+      "loss": 3.9722,
+      "step": 13402
+    },
+    {
+      "epoch": 0.13403,
+      "grad_norm": 1.528144121170044,
+      "learning_rate": 0.003,
+      "loss": 4.0679,
+      "step": 13403
+    },
+    {
+      "epoch": 0.13404,
+      "grad_norm": 0.5469996333122253,
+      "learning_rate": 0.003,
+      "loss": 4.0469,
+      "step": 13404
+    },
+    {
+      "epoch": 0.13405,
+      "grad_norm": 0.8591877818107605,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 13405
+    },
+    {
+      "epoch": 0.13406,
+      "grad_norm": 1.1252491474151611,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 13406
+    },
+    {
+      "epoch": 0.13407,
+      "grad_norm": 0.895357608795166,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 13407
+    },
+    {
+      "epoch": 0.13408,
+      "grad_norm": 0.8792118430137634,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 13408
+    },
+    {
+      "epoch": 0.13409,
+      "grad_norm": 0.7852404117584229,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 13409
+    },
+    {
+      "epoch": 0.1341,
+      "grad_norm": 0.6954460740089417,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 13410
+    },
+    {
+      "epoch": 0.13411,
+      "grad_norm": 0.6863563060760498,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 13411
+    },
+    {
+      "epoch": 0.13412,
+      "grad_norm": 0.7757440209388733,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 13412
+    },
+    {
+      "epoch": 0.13413,
+      "grad_norm": 0.7826729416847229,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 13413
+    },
+    {
+      "epoch": 0.13414,
+      "grad_norm": 0.833393394947052,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 13414
+    },
+    {
+      "epoch": 0.13415,
+      "grad_norm": 0.8553380966186523,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 13415
+    },
+    {
+      "epoch": 0.13416,
+      "grad_norm": 0.7370104789733887,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 13416
+    },
+    {
+      "epoch": 0.13417,
+      "grad_norm": 0.594001054763794,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 13417
+    },
+    {
+      "epoch": 0.13418,
+      "grad_norm": 0.6473716497421265,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 13418
+    },
+    {
+      "epoch": 0.13419,
+      "grad_norm": 0.6357209086418152,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 13419
+    },
+    {
+      "epoch": 0.1342,
+      "grad_norm": 0.761445939540863,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 13420
+    },
+    {
+      "epoch": 0.13421,
+      "grad_norm": 1.2044970989227295,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 13421
+    },
+    {
+      "epoch": 0.13422,
+      "grad_norm": 1.1725409030914307,
+      "learning_rate": 0.003,
+      "loss": 4.0434,
+      "step": 13422
+    },
+    {
+      "epoch": 0.13423,
+      "grad_norm": 0.7587118744850159,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 13423
+    },
+    {
+      "epoch": 0.13424,
+      "grad_norm": 0.7179146409034729,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 13424
+    },
+    {
+      "epoch": 0.13425,
+      "grad_norm": 0.8006987571716309,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 13425
+    },
+    {
+      "epoch": 0.13426,
+      "grad_norm": 0.7377433776855469,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 13426
+    },
+    {
+      "epoch": 0.13427,
+      "grad_norm": 0.68075031042099,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 13427
+    },
+    {
+      "epoch": 0.13428,
+      "grad_norm": 0.7437992691993713,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 13428
+    },
+    {
+      "epoch": 0.13429,
+      "grad_norm": 0.7679684162139893,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 13429
+    },
+    {
+      "epoch": 0.1343,
+      "grad_norm": 0.8779183626174927,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 13430
+    },
+    {
+      "epoch": 0.13431,
+      "grad_norm": 0.8668930530548096,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 13431
+    },
+    {
+      "epoch": 0.13432,
+      "grad_norm": 0.75432950258255,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 13432
+    },
+    {
+      "epoch": 0.13433,
+      "grad_norm": 0.9304035902023315,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 13433
+    },
+    {
+      "epoch": 0.13434,
+      "grad_norm": 1.0698155164718628,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 13434
+    },
+    {
+      "epoch": 0.13435,
+      "grad_norm": 1.0459843873977661,
+      "learning_rate": 0.003,
+      "loss": 4.0564,
+      "step": 13435
+    },
+    {
+      "epoch": 0.13436,
+      "grad_norm": 0.9980152249336243,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 13436
+    },
+    {
+      "epoch": 0.13437,
+      "grad_norm": 1.0447239875793457,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 13437
+    },
+    {
+      "epoch": 0.13438,
+      "grad_norm": 0.9681057929992676,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 13438
+    },
+    {
+      "epoch": 0.13439,
+      "grad_norm": 0.8853546380996704,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 13439
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.910854697227478,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 13440
+    },
+    {
+      "epoch": 0.13441,
+      "grad_norm": 0.8217945098876953,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 13441
+    },
+    {
+      "epoch": 0.13442,
+      "grad_norm": 0.7678465247154236,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 13442
+    },
+    {
+      "epoch": 0.13443,
+      "grad_norm": 0.6415266394615173,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 13443
+    },
+    {
+      "epoch": 0.13444,
+      "grad_norm": 0.6081373691558838,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 13444
+    },
+    {
+      "epoch": 0.13445,
+      "grad_norm": 0.6811102628707886,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 13445
+    },
+    {
+      "epoch": 0.13446,
+      "grad_norm": 0.9398423433303833,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 13446
+    },
+    {
+      "epoch": 0.13447,
+      "grad_norm": 1.2884865999221802,
+      "learning_rate": 0.003,
+      "loss": 4.0618,
+      "step": 13447
+    },
+    {
+      "epoch": 0.13448,
+      "grad_norm": 0.7476019859313965,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 13448
+    },
+    {
+      "epoch": 0.13449,
+      "grad_norm": 0.6717467904090881,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 13449
+    },
+    {
+      "epoch": 0.1345,
+      "grad_norm": 0.6408231854438782,
+      "learning_rate": 0.003,
+      "loss": 3.9629,
+      "step": 13450
+    },
+    {
+      "epoch": 0.13451,
+      "grad_norm": 0.7041977643966675,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 13451
+    },
+    {
+      "epoch": 0.13452,
+      "grad_norm": 0.6291084885597229,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 13452
+    },
+    {
+      "epoch": 0.13453,
+      "grad_norm": 0.6171227097511292,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 13453
+    },
+    {
+      "epoch": 0.13454,
+      "grad_norm": 0.6809126734733582,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 13454
+    },
+    {
+      "epoch": 0.13455,
+      "grad_norm": 0.7555040121078491,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 13455
+    },
+    {
+      "epoch": 0.13456,
+      "grad_norm": 0.7336331009864807,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 13456
+    },
+    {
+      "epoch": 0.13457,
+      "grad_norm": 0.8007299900054932,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 13457
+    },
+    {
+      "epoch": 0.13458,
+      "grad_norm": 1.040239691734314,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 13458
+    },
+    {
+      "epoch": 0.13459,
+      "grad_norm": 1.1279948949813843,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 13459
+    },
+    {
+      "epoch": 0.1346,
+      "grad_norm": 0.5545344352722168,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 13460
+    },
+    {
+      "epoch": 0.13461,
+      "grad_norm": 0.584706723690033,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 13461
+    },
+    {
+      "epoch": 0.13462,
+      "grad_norm": 0.731682300567627,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 13462
+    },
+    {
+      "epoch": 0.13463,
+      "grad_norm": 0.8271353840827942,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 13463
+    },
+    {
+      "epoch": 0.13464,
+      "grad_norm": 0.979057252407074,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 13464
+    },
+    {
+      "epoch": 0.13465,
+      "grad_norm": 0.8582794666290283,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 13465
+    },
+    {
+      "epoch": 0.13466,
+      "grad_norm": 0.6173996925354004,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 13466
+    },
+    {
+      "epoch": 0.13467,
+      "grad_norm": 0.593360424041748,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 13467
+    },
+    {
+      "epoch": 0.13468,
+      "grad_norm": 0.5905532836914062,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 13468
+    },
+    {
+      "epoch": 0.13469,
+      "grad_norm": 0.6375170946121216,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 13469
+    },
+    {
+      "epoch": 0.1347,
+      "grad_norm": 0.6386033296585083,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 13470
+    },
+    {
+      "epoch": 0.13471,
+      "grad_norm": 0.630262553691864,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 13471
+    },
+    {
+      "epoch": 0.13472,
+      "grad_norm": 0.6448857188224792,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 13472
+    },
+    {
+      "epoch": 0.13473,
+      "grad_norm": 0.7303140759468079,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 13473
+    },
+    {
+      "epoch": 0.13474,
+      "grad_norm": 0.8613793253898621,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 13474
+    },
+    {
+      "epoch": 0.13475,
+      "grad_norm": 0.9755415916442871,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 13475
+    },
+    {
+      "epoch": 0.13476,
+      "grad_norm": 0.942423403263092,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 13476
+    },
+    {
+      "epoch": 0.13477,
+      "grad_norm": 0.9543547034263611,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 13477
+    },
+    {
+      "epoch": 0.13478,
+      "grad_norm": 0.9376398324966431,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 13478
+    },
+    {
+      "epoch": 0.13479,
+      "grad_norm": 0.8613263964653015,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 13479
+    },
+    {
+      "epoch": 0.1348,
+      "grad_norm": 0.8967022895812988,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 13480
+    },
+    {
+      "epoch": 0.13481,
+      "grad_norm": 0.9113709926605225,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 13481
+    },
+    {
+      "epoch": 0.13482,
+      "grad_norm": 1.0850682258605957,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 13482
+    },
+    {
+      "epoch": 0.13483,
+      "grad_norm": 1.093895673751831,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 13483
+    },
+    {
+      "epoch": 0.13484,
+      "grad_norm": 1.1075501441955566,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 13484
+    },
+    {
+      "epoch": 0.13485,
+      "grad_norm": 1.0485846996307373,
+      "learning_rate": 0.003,
+      "loss": 4.0506,
+      "step": 13485
+    },
+    {
+      "epoch": 0.13486,
+      "grad_norm": 0.9366323351860046,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 13486
+    },
+    {
+      "epoch": 0.13487,
+      "grad_norm": 0.8190991878509521,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 13487
+    },
+    {
+      "epoch": 0.13488,
+      "grad_norm": 0.8177504539489746,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 13488
+    },
+    {
+      "epoch": 0.13489,
+      "grad_norm": 0.8176032900810242,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 13489
+    },
+    {
+      "epoch": 0.1349,
+      "grad_norm": 0.8497923612594604,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 13490
+    },
+    {
+      "epoch": 0.13491,
+      "grad_norm": 1.0562101602554321,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 13491
+    },
+    {
+      "epoch": 0.13492,
+      "grad_norm": 1.167596459388733,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 13492
+    },
+    {
+      "epoch": 0.13493,
+      "grad_norm": 1.1683759689331055,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 13493
+    },
+    {
+      "epoch": 0.13494,
+      "grad_norm": 1.0237256288528442,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 13494
+    },
+    {
+      "epoch": 0.13495,
+      "grad_norm": 0.8731281757354736,
+      "learning_rate": 0.003,
+      "loss": 4.0597,
+      "step": 13495
+    },
+    {
+      "epoch": 0.13496,
+      "grad_norm": 0.8121966123580933,
+      "learning_rate": 0.003,
+      "loss": 4.0476,
+      "step": 13496
+    },
+    {
+      "epoch": 0.13497,
+      "grad_norm": 0.8778128623962402,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 13497
+    },
+    {
+      "epoch": 0.13498,
+      "grad_norm": 0.7970010638237,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 13498
+    },
+    {
+      "epoch": 0.13499,
+      "grad_norm": 0.7046995759010315,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 13499
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.6215248107910156,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 13500
+    },
+    {
+      "epoch": 0.13501,
+      "grad_norm": 0.6347829699516296,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 13501
+    },
+    {
+      "epoch": 0.13502,
+      "grad_norm": 0.6442795991897583,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 13502
+    },
+    {
+      "epoch": 0.13503,
+      "grad_norm": 0.6396309733390808,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 13503
+    },
+    {
+      "epoch": 0.13504,
+      "grad_norm": 0.6402974128723145,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 13504
+    },
+    {
+      "epoch": 0.13505,
+      "grad_norm": 0.7731344103813171,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 13505
+    },
+    {
+      "epoch": 0.13506,
+      "grad_norm": 0.9753412008285522,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 13506
+    },
+    {
+      "epoch": 0.13507,
+      "grad_norm": 1.1997851133346558,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 13507
+    },
+    {
+      "epoch": 0.13508,
+      "grad_norm": 0.8190879821777344,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 13508
+    },
+    {
+      "epoch": 0.13509,
+      "grad_norm": 0.5890142321586609,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 13509
+    },
+    {
+      "epoch": 0.1351,
+      "grad_norm": 0.5968877077102661,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 13510
+    },
+    {
+      "epoch": 0.13511,
+      "grad_norm": 0.6396861672401428,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 13511
+    },
+    {
+      "epoch": 0.13512,
+      "grad_norm": 0.6868215799331665,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 13512
+    },
+    {
+      "epoch": 0.13513,
+      "grad_norm": 0.7117477059364319,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 13513
+    },
+    {
+      "epoch": 0.13514,
+      "grad_norm": 0.782675564289093,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 13514
+    },
+    {
+      "epoch": 0.13515,
+      "grad_norm": 0.8306724429130554,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 13515
+    },
+    {
+      "epoch": 0.13516,
+      "grad_norm": 0.7327220439910889,
+      "learning_rate": 0.003,
+      "loss": 3.9721,
+      "step": 13516
+    },
+    {
+      "epoch": 0.13517,
+      "grad_norm": 0.5924178957939148,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 13517
+    },
+    {
+      "epoch": 0.13518,
+      "grad_norm": 0.5788958668708801,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 13518
+    },
+    {
+      "epoch": 0.13519,
+      "grad_norm": 0.602343738079071,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 13519
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.671697199344635,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 13520
+    },
+    {
+      "epoch": 0.13521,
+      "grad_norm": 0.6452050805091858,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13521
+    },
+    {
+      "epoch": 0.13522,
+      "grad_norm": 0.7343513369560242,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 13522
+    },
+    {
+      "epoch": 0.13523,
+      "grad_norm": 0.8468841910362244,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 13523
+    },
+    {
+      "epoch": 0.13524,
+      "grad_norm": 0.941436767578125,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 13524
+    },
+    {
+      "epoch": 0.13525,
+      "grad_norm": 1.0155689716339111,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 13525
+    },
+    {
+      "epoch": 0.13526,
+      "grad_norm": 1.1248732805252075,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 13526
+    },
+    {
+      "epoch": 0.13527,
+      "grad_norm": 0.9863359332084656,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 13527
+    },
+    {
+      "epoch": 0.13528,
+      "grad_norm": 0.922791063785553,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 13528
+    },
+    {
+      "epoch": 0.13529,
+      "grad_norm": 1.0490010976791382,
+      "learning_rate": 0.003,
+      "loss": 4.0553,
+      "step": 13529
+    },
+    {
+      "epoch": 0.1353,
+      "grad_norm": 1.0780727863311768,
+      "learning_rate": 0.003,
+      "loss": 4.0473,
+      "step": 13530
+    },
+    {
+      "epoch": 0.13531,
+      "grad_norm": 0.9660634398460388,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 13531
+    },
+    {
+      "epoch": 0.13532,
+      "grad_norm": 0.9875202775001526,
+      "learning_rate": 0.003,
+      "loss": 4.0482,
+      "step": 13532
+    },
+    {
+      "epoch": 0.13533,
+      "grad_norm": 1.06731379032135,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 13533
+    },
+    {
+      "epoch": 0.13534,
+      "grad_norm": 0.8824008107185364,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 13534
+    },
+    {
+      "epoch": 0.13535,
+      "grad_norm": 0.8004950881004333,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 13535
+    },
+    {
+      "epoch": 0.13536,
+      "grad_norm": 0.7252044081687927,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 13536
+    },
+    {
+      "epoch": 0.13537,
+      "grad_norm": 0.7182325720787048,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 13537
+    },
+    {
+      "epoch": 0.13538,
+      "grad_norm": 0.6931709051132202,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 13538
+    },
+    {
+      "epoch": 0.13539,
+      "grad_norm": 0.6495668888092041,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 13539
+    },
+    {
+      "epoch": 0.1354,
+      "grad_norm": 0.6988504528999329,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 13540
+    },
+    {
+      "epoch": 0.13541,
+      "grad_norm": 0.8158469200134277,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 13541
+    },
+    {
+      "epoch": 0.13542,
+      "grad_norm": 0.989193320274353,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 13542
+    },
+    {
+      "epoch": 0.13543,
+      "grad_norm": 1.1236380338668823,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13543
+    },
+    {
+      "epoch": 0.13544,
+      "grad_norm": 0.9561525583267212,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 13544
+    },
+    {
+      "epoch": 0.13545,
+      "grad_norm": 0.9818124175071716,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 13545
+    },
+    {
+      "epoch": 0.13546,
+      "grad_norm": 1.0886470079421997,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 13546
+    },
+    {
+      "epoch": 0.13547,
+      "grad_norm": 0.9130642414093018,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 13547
+    },
+    {
+      "epoch": 0.13548,
+      "grad_norm": 0.7905890345573425,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 13548
+    },
+    {
+      "epoch": 0.13549,
+      "grad_norm": 0.7697553634643555,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 13549
+    },
+    {
+      "epoch": 0.1355,
+      "grad_norm": 0.6934955716133118,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 13550
+    },
+    {
+      "epoch": 0.13551,
+      "grad_norm": 0.6529156565666199,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 13551
+    },
+    {
+      "epoch": 0.13552,
+      "grad_norm": 0.5677207112312317,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 13552
+    },
+    {
+      "epoch": 0.13553,
+      "grad_norm": 0.6107994318008423,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 13553
+    },
+    {
+      "epoch": 0.13554,
+      "grad_norm": 0.7828263640403748,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 13554
+    },
+    {
+      "epoch": 0.13555,
+      "grad_norm": 1.0647815465927124,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 13555
+    },
+    {
+      "epoch": 0.13556,
+      "grad_norm": 1.0612136125564575,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 13556
+    },
+    {
+      "epoch": 0.13557,
+      "grad_norm": 0.8446271419525146,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 13557
+    },
+    {
+      "epoch": 0.13558,
+      "grad_norm": 0.6697544455528259,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 13558
+    },
+    {
+      "epoch": 0.13559,
+      "grad_norm": 0.6345346570014954,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 13559
+    },
+    {
+      "epoch": 0.1356,
+      "grad_norm": 0.9283767938613892,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 13560
+    },
+    {
+      "epoch": 0.13561,
+      "grad_norm": 0.9697877168655396,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 13561
+    },
+    {
+      "epoch": 0.13562,
+      "grad_norm": 0.7506445050239563,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 13562
+    },
+    {
+      "epoch": 0.13563,
+      "grad_norm": 0.6054522395133972,
+      "learning_rate": 0.003,
+      "loss": 3.9634,
+      "step": 13563
+    },
+    {
+      "epoch": 0.13564,
+      "grad_norm": 0.6194526553153992,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 13564
+    },
+    {
+      "epoch": 0.13565,
+      "grad_norm": 0.6902199983596802,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 13565
+    },
+    {
+      "epoch": 0.13566,
+      "grad_norm": 0.7207865118980408,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 13566
+    },
+    {
+      "epoch": 0.13567,
+      "grad_norm": 0.7407782077789307,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 13567
+    },
+    {
+      "epoch": 0.13568,
+      "grad_norm": 0.7367984652519226,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 13568
+    },
+    {
+      "epoch": 0.13569,
+      "grad_norm": 0.703450620174408,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 13569
+    },
+    {
+      "epoch": 0.1357,
+      "grad_norm": 0.6739119291305542,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 13570
+    },
+    {
+      "epoch": 0.13571,
+      "grad_norm": 0.5856640934944153,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 13571
+    },
+    {
+      "epoch": 0.13572,
+      "grad_norm": 0.6790838241577148,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 13572
+    },
+    {
+      "epoch": 0.13573,
+      "grad_norm": 0.6438860297203064,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 13573
+    },
+    {
+      "epoch": 0.13574,
+      "grad_norm": 0.6685054898262024,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 13574
+    },
+    {
+      "epoch": 0.13575,
+      "grad_norm": 0.8931761980056763,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 13575
+    },
+    {
+      "epoch": 0.13576,
+      "grad_norm": 1.385057806968689,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 13576
+    },
+    {
+      "epoch": 0.13577,
+      "grad_norm": 0.7037860751152039,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 13577
+    },
+    {
+      "epoch": 0.13578,
+      "grad_norm": 0.6238915324211121,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 13578
+    },
+    {
+      "epoch": 0.13579,
+      "grad_norm": 0.8868461847305298,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 13579
+    },
+    {
+      "epoch": 0.1358,
+      "grad_norm": 1.170206904411316,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 13580
+    },
+    {
+      "epoch": 0.13581,
+      "grad_norm": 0.9208158254623413,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 13581
+    },
+    {
+      "epoch": 0.13582,
+      "grad_norm": 0.778101921081543,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 13582
+    },
+    {
+      "epoch": 0.13583,
+      "grad_norm": 0.8657846450805664,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 13583
+    },
+    {
+      "epoch": 0.13584,
+      "grad_norm": 0.9851948022842407,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 13584
+    },
+    {
+      "epoch": 0.13585,
+      "grad_norm": 1.0144891738891602,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 13585
+    },
+    {
+      "epoch": 0.13586,
+      "grad_norm": 0.9492523670196533,
+      "learning_rate": 0.003,
+      "loss": 4.064,
+      "step": 13586
+    },
+    {
+      "epoch": 0.13587,
+      "grad_norm": 0.8685741424560547,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 13587
+    },
+    {
+      "epoch": 0.13588,
+      "grad_norm": 0.8987164497375488,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 13588
+    },
+    {
+      "epoch": 0.13589,
+      "grad_norm": 0.8334383368492126,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 13589
+    },
+    {
+      "epoch": 0.1359,
+      "grad_norm": 0.7396984696388245,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 13590
+    },
+    {
+      "epoch": 0.13591,
+      "grad_norm": 0.8303385972976685,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 13591
+    },
+    {
+      "epoch": 0.13592,
+      "grad_norm": 0.9425806403160095,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 13592
+    },
+    {
+      "epoch": 0.13593,
+      "grad_norm": 0.928742527961731,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 13593
+    },
+    {
+      "epoch": 0.13594,
+      "grad_norm": 0.7929375767707825,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 13594
+    },
+    {
+      "epoch": 0.13595,
+      "grad_norm": 0.7942190766334534,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 13595
+    },
+    {
+      "epoch": 0.13596,
+      "grad_norm": 0.9037315845489502,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 13596
+    },
+    {
+      "epoch": 0.13597,
+      "grad_norm": 0.718514621257782,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 13597
+    },
+    {
+      "epoch": 0.13598,
+      "grad_norm": 0.8128697276115417,
+      "learning_rate": 0.003,
+      "loss": 4.0595,
+      "step": 13598
+    },
+    {
+      "epoch": 0.13599,
+      "grad_norm": 0.9463288187980652,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 13599
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 1.0681328773498535,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 13600
+    },
+    {
+      "epoch": 0.13601,
+      "grad_norm": 0.9706233143806458,
+      "learning_rate": 0.003,
+      "loss": 4.0458,
+      "step": 13601
+    },
+    {
+      "epoch": 0.13602,
+      "grad_norm": 0.9826554656028748,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 13602
+    },
+    {
+      "epoch": 0.13603,
+      "grad_norm": 0.9344239830970764,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 13603
+    },
+    {
+      "epoch": 0.13604,
+      "grad_norm": 0.8888978958129883,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 13604
+    },
+    {
+      "epoch": 0.13605,
+      "grad_norm": 0.8250786066055298,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 13605
+    },
+    {
+      "epoch": 0.13606,
+      "grad_norm": 0.734150230884552,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 13606
+    },
+    {
+      "epoch": 0.13607,
+      "grad_norm": 0.7302347421646118,
+      "learning_rate": 0.003,
+      "loss": 4.0522,
+      "step": 13607
+    },
+    {
+      "epoch": 0.13608,
+      "grad_norm": 0.8481457829475403,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 13608
+    },
+    {
+      "epoch": 0.13609,
+      "grad_norm": 1.174586296081543,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 13609
+    },
+    {
+      "epoch": 0.1361,
+      "grad_norm": 1.094225525856018,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 13610
+    },
+    {
+      "epoch": 0.13611,
+      "grad_norm": 0.7953914403915405,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 13611
+    },
+    {
+      "epoch": 0.13612,
+      "grad_norm": 0.6411198973655701,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 13612
+    },
+    {
+      "epoch": 0.13613,
+      "grad_norm": 0.5927622318267822,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 13613
+    },
+    {
+      "epoch": 0.13614,
+      "grad_norm": 0.5709002614021301,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 13614
+    },
+    {
+      "epoch": 0.13615,
+      "grad_norm": 0.6412385702133179,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 13615
+    },
+    {
+      "epoch": 0.13616,
+      "grad_norm": 0.6297585964202881,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 13616
+    },
+    {
+      "epoch": 0.13617,
+      "grad_norm": 0.5881394147872925,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 13617
+    },
+    {
+      "epoch": 0.13618,
+      "grad_norm": 0.5443843007087708,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 13618
+    },
+    {
+      "epoch": 0.13619,
+      "grad_norm": 0.541292130947113,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 13619
+    },
+    {
+      "epoch": 0.1362,
+      "grad_norm": 0.5641786456108093,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 13620
+    },
+    {
+      "epoch": 0.13621,
+      "grad_norm": 0.6552765369415283,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 13621
+    },
+    {
+      "epoch": 0.13622,
+      "grad_norm": 0.7206015586853027,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 13622
+    },
+    {
+      "epoch": 0.13623,
+      "grad_norm": 0.7735071778297424,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 13623
+    },
+    {
+      "epoch": 0.13624,
+      "grad_norm": 0.9413561820983887,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 13624
+    },
+    {
+      "epoch": 0.13625,
+      "grad_norm": 1.1914006471633911,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 13625
+    },
+    {
+      "epoch": 0.13626,
+      "grad_norm": 0.9235355257987976,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 13626
+    },
+    {
+      "epoch": 0.13627,
+      "grad_norm": 0.8000466227531433,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 13627
+    },
+    {
+      "epoch": 0.13628,
+      "grad_norm": 0.7122133374214172,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 13628
+    },
+    {
+      "epoch": 0.13629,
+      "grad_norm": 0.8387061953544617,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 13629
+    },
+    {
+      "epoch": 0.1363,
+      "grad_norm": 0.9526289701461792,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 13630
+    },
+    {
+      "epoch": 0.13631,
+      "grad_norm": 0.9225990772247314,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 13631
+    },
+    {
+      "epoch": 0.13632,
+      "grad_norm": 0.8726639151573181,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 13632
+    },
+    {
+      "epoch": 0.13633,
+      "grad_norm": 0.7738573551177979,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 13633
+    },
+    {
+      "epoch": 0.13634,
+      "grad_norm": 0.7793677449226379,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 13634
+    },
+    {
+      "epoch": 0.13635,
+      "grad_norm": 0.9074026346206665,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 13635
+    },
+    {
+      "epoch": 0.13636,
+      "grad_norm": 1.1774895191192627,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 13636
+    },
+    {
+      "epoch": 0.13637,
+      "grad_norm": 0.9255838394165039,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 13637
+    },
+    {
+      "epoch": 0.13638,
+      "grad_norm": 0.7061364054679871,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 13638
+    },
+    {
+      "epoch": 0.13639,
+      "grad_norm": 0.6312673091888428,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 13639
+    },
+    {
+      "epoch": 0.1364,
+      "grad_norm": 0.7434985637664795,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 13640
+    },
+    {
+      "epoch": 0.13641,
+      "grad_norm": 0.6780279874801636,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 13641
+    },
+    {
+      "epoch": 0.13642,
+      "grad_norm": 0.6205322742462158,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 13642
+    },
+    {
+      "epoch": 0.13643,
+      "grad_norm": 0.7035835385322571,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 13643
+    },
+    {
+      "epoch": 0.13644,
+      "grad_norm": 0.8032388091087341,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 13644
+    },
+    {
+      "epoch": 0.13645,
+      "grad_norm": 0.8594980239868164,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 13645
+    },
+    {
+      "epoch": 0.13646,
+      "grad_norm": 0.9656379222869873,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 13646
+    },
+    {
+      "epoch": 0.13647,
+      "grad_norm": 1.038041114807129,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 13647
+    },
+    {
+      "epoch": 0.13648,
+      "grad_norm": 0.907473087310791,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 13648
+    },
+    {
+      "epoch": 0.13649,
+      "grad_norm": 0.8047930002212524,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 13649
+    },
+    {
+      "epoch": 0.1365,
+      "grad_norm": 0.7641589641571045,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 13650
+    },
+    {
+      "epoch": 0.13651,
+      "grad_norm": 0.732158362865448,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 13651
+    },
+    {
+      "epoch": 0.13652,
+      "grad_norm": 0.6533657908439636,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 13652
+    },
+    {
+      "epoch": 0.13653,
+      "grad_norm": 0.7759801149368286,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 13653
+    },
+    {
+      "epoch": 0.13654,
+      "grad_norm": 0.8410548567771912,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 13654
+    },
+    {
+      "epoch": 0.13655,
+      "grad_norm": 0.907723605632782,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 13655
+    },
+    {
+      "epoch": 0.13656,
+      "grad_norm": 0.9699224829673767,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 13656
+    },
+    {
+      "epoch": 0.13657,
+      "grad_norm": 1.0853382349014282,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 13657
+    },
+    {
+      "epoch": 0.13658,
+      "grad_norm": 1.0450563430786133,
+      "learning_rate": 0.003,
+      "loss": 4.0559,
+      "step": 13658
+    },
+    {
+      "epoch": 0.13659,
+      "grad_norm": 1.0690622329711914,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 13659
+    },
+    {
+      "epoch": 0.1366,
+      "grad_norm": 0.9868205785751343,
+      "learning_rate": 0.003,
+      "loss": 4.0622,
+      "step": 13660
+    },
+    {
+      "epoch": 0.13661,
+      "grad_norm": 1.0094572305679321,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 13661
+    },
+    {
+      "epoch": 0.13662,
+      "grad_norm": 0.900059700012207,
+      "learning_rate": 0.003,
+      "loss": 4.0425,
+      "step": 13662
+    },
+    {
+      "epoch": 0.13663,
+      "grad_norm": 0.9126373529434204,
+      "learning_rate": 0.003,
+      "loss": 4.044,
+      "step": 13663
+    },
+    {
+      "epoch": 0.13664,
+      "grad_norm": 1.091404914855957,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 13664
+    },
+    {
+      "epoch": 0.13665,
+      "grad_norm": 0.9754030108451843,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 13665
+    },
+    {
+      "epoch": 0.13666,
+      "grad_norm": 0.9318715929985046,
+      "learning_rate": 0.003,
+      "loss": 4.0519,
+      "step": 13666
+    },
+    {
+      "epoch": 0.13667,
+      "grad_norm": 0.9125456213951111,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 13667
+    },
+    {
+      "epoch": 0.13668,
+      "grad_norm": 1.003765344619751,
+      "learning_rate": 0.003,
+      "loss": 4.0539,
+      "step": 13668
+    },
+    {
+      "epoch": 0.13669,
+      "grad_norm": 0.789003312587738,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 13669
+    },
+    {
+      "epoch": 0.1367,
+      "grad_norm": 0.7328260540962219,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 13670
+    },
+    {
+      "epoch": 0.13671,
+      "grad_norm": 0.8168613314628601,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 13671
+    },
+    {
+      "epoch": 0.13672,
+      "grad_norm": 0.6888318657875061,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 13672
+    },
+    {
+      "epoch": 0.13673,
+      "grad_norm": 0.7158035039901733,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 13673
+    },
+    {
+      "epoch": 0.13674,
+      "grad_norm": 0.6708112955093384,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 13674
+    },
+    {
+      "epoch": 0.13675,
+      "grad_norm": 0.9656121134757996,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 13675
+    },
+    {
+      "epoch": 0.13676,
+      "grad_norm": 1.0979527235031128,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 13676
+    },
+    {
+      "epoch": 0.13677,
+      "grad_norm": 1.1144235134124756,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 13677
+    },
+    {
+      "epoch": 0.13678,
+      "grad_norm": 0.8657492399215698,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 13678
+    },
+    {
+      "epoch": 0.13679,
+      "grad_norm": 0.707968533039093,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 13679
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.6362190246582031,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 13680
+    },
+    {
+      "epoch": 0.13681,
+      "grad_norm": 0.7352656722068787,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 13681
+    },
+    {
+      "epoch": 0.13682,
+      "grad_norm": 0.7660700082778931,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 13682
+    },
+    {
+      "epoch": 0.13683,
+      "grad_norm": 0.8414221405982971,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 13683
+    },
+    {
+      "epoch": 0.13684,
+      "grad_norm": 0.9656125903129578,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 13684
+    },
+    {
+      "epoch": 0.13685,
+      "grad_norm": 1.0350044965744019,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 13685
+    },
+    {
+      "epoch": 0.13686,
+      "grad_norm": 0.9132952094078064,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 13686
+    },
+    {
+      "epoch": 0.13687,
+      "grad_norm": 0.6878951191902161,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 13687
+    },
+    {
+      "epoch": 0.13688,
+      "grad_norm": 0.5668317079544067,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 13688
+    },
+    {
+      "epoch": 0.13689,
+      "grad_norm": 0.6665008068084717,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 13689
+    },
+    {
+      "epoch": 0.1369,
+      "grad_norm": 0.672523558139801,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 13690
+    },
+    {
+      "epoch": 0.13691,
+      "grad_norm": 0.6515674591064453,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 13691
+    },
+    {
+      "epoch": 0.13692,
+      "grad_norm": 0.4998486042022705,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 13692
+    },
+    {
+      "epoch": 0.13693,
+      "grad_norm": 0.4743835926055908,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 13693
+    },
+    {
+      "epoch": 0.13694,
+      "grad_norm": 0.5076819658279419,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 13694
+    },
+    {
+      "epoch": 0.13695,
+      "grad_norm": 0.4468526542186737,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 13695
+    },
+    {
+      "epoch": 0.13696,
+      "grad_norm": 0.5420965552330017,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 13696
+    },
+    {
+      "epoch": 0.13697,
+      "grad_norm": 0.6205175518989563,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 13697
+    },
+    {
+      "epoch": 0.13698,
+      "grad_norm": 0.6296148896217346,
+      "learning_rate": 0.003,
+      "loss": 3.9918,
+      "step": 13698
+    },
+    {
+      "epoch": 0.13699,
+      "grad_norm": 0.6779153943061829,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 13699
+    },
+    {
+      "epoch": 0.137,
+      "grad_norm": 0.8325897455215454,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 13700
+    },
+    {
+      "epoch": 0.13701,
+      "grad_norm": 0.9757177829742432,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 13701
+    },
+    {
+      "epoch": 0.13702,
+      "grad_norm": 1.009141206741333,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 13702
+    },
+    {
+      "epoch": 0.13703,
+      "grad_norm": 1.0573129653930664,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 13703
+    },
+    {
+      "epoch": 0.13704,
+      "grad_norm": 0.9576799273490906,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 13704
+    },
+    {
+      "epoch": 0.13705,
+      "grad_norm": 0.8672870397567749,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 13705
+    },
+    {
+      "epoch": 0.13706,
+      "grad_norm": 0.799553394317627,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 13706
+    },
+    {
+      "epoch": 0.13707,
+      "grad_norm": 0.7652454376220703,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 13707
+    },
+    {
+      "epoch": 0.13708,
+      "grad_norm": 0.8123028874397278,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 13708
+    },
+    {
+      "epoch": 0.13709,
+      "grad_norm": 0.7795895338058472,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 13709
+    },
+    {
+      "epoch": 0.1371,
+      "grad_norm": 0.8851892948150635,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 13710
+    },
+    {
+      "epoch": 0.13711,
+      "grad_norm": 1.0329574346542358,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 13711
+    },
+    {
+      "epoch": 0.13712,
+      "grad_norm": 0.9941471815109253,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 13712
+    },
+    {
+      "epoch": 0.13713,
+      "grad_norm": 0.9089716672897339,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 13713
+    },
+    {
+      "epoch": 0.13714,
+      "grad_norm": 0.8332324028015137,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 13714
+    },
+    {
+      "epoch": 0.13715,
+      "grad_norm": 0.9458366632461548,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 13715
+    },
+    {
+      "epoch": 0.13716,
+      "grad_norm": 1.0176984071731567,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 13716
+    },
+    {
+      "epoch": 0.13717,
+      "grad_norm": 1.2446688413619995,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 13717
+    },
+    {
+      "epoch": 0.13718,
+      "grad_norm": 0.681408166885376,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 13718
+    },
+    {
+      "epoch": 0.13719,
+      "grad_norm": 0.7224704623222351,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 13719
+    },
+    {
+      "epoch": 0.1372,
+      "grad_norm": 0.6912409663200378,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 13720
+    },
+    {
+      "epoch": 0.13721,
+      "grad_norm": 0.7295917868614197,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 13721
+    },
+    {
+      "epoch": 0.13722,
+      "grad_norm": 0.8005015254020691,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 13722
+    },
+    {
+      "epoch": 0.13723,
+      "grad_norm": 0.8219122886657715,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 13723
+    },
+    {
+      "epoch": 0.13724,
+      "grad_norm": 0.7524207830429077,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 13724
+    },
+    {
+      "epoch": 0.13725,
+      "grad_norm": 0.6833975911140442,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 13725
+    },
+    {
+      "epoch": 0.13726,
+      "grad_norm": 0.6006239652633667,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 13726
+    },
+    {
+      "epoch": 0.13727,
+      "grad_norm": 0.6788160800933838,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 13727
+    },
+    {
+      "epoch": 0.13728,
+      "grad_norm": 0.8558680415153503,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 13728
+    },
+    {
+      "epoch": 0.13729,
+      "grad_norm": 1.0257190465927124,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 13729
+    },
+    {
+      "epoch": 0.1373,
+      "grad_norm": 0.9806662797927856,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 13730
+    },
+    {
+      "epoch": 0.13731,
+      "grad_norm": 0.8801664113998413,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 13731
+    },
+    {
+      "epoch": 0.13732,
+      "grad_norm": 0.7832850813865662,
+      "learning_rate": 0.003,
+      "loss": 3.9757,
+      "step": 13732
+    },
+    {
+      "epoch": 0.13733,
+      "grad_norm": 0.9347167611122131,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 13733
+    },
+    {
+      "epoch": 0.13734,
+      "grad_norm": 0.9872972369194031,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 13734
+    },
+    {
+      "epoch": 0.13735,
+      "grad_norm": 0.8403304219245911,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 13735
+    },
+    {
+      "epoch": 0.13736,
+      "grad_norm": 0.8516517877578735,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 13736
+    },
+    {
+      "epoch": 0.13737,
+      "grad_norm": 0.9545196294784546,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 13737
+    },
+    {
+      "epoch": 0.13738,
+      "grad_norm": 1.3185926675796509,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 13738
+    },
+    {
+      "epoch": 0.13739,
+      "grad_norm": 0.832253634929657,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 13739
+    },
+    {
+      "epoch": 0.1374,
+      "grad_norm": 0.6808013916015625,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 13740
+    },
+    {
+      "epoch": 0.13741,
+      "grad_norm": 0.6344847679138184,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 13741
+    },
+    {
+      "epoch": 0.13742,
+      "grad_norm": 0.6468703746795654,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 13742
+    },
+    {
+      "epoch": 0.13743,
+      "grad_norm": 0.5937367677688599,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 13743
+    },
+    {
+      "epoch": 0.13744,
+      "grad_norm": 0.6512554883956909,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 13744
+    },
+    {
+      "epoch": 0.13745,
+      "grad_norm": 0.7392717003822327,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 13745
+    },
+    {
+      "epoch": 0.13746,
+      "grad_norm": 0.7809626460075378,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 13746
+    },
+    {
+      "epoch": 0.13747,
+      "grad_norm": 0.8470854163169861,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 13747
+    },
+    {
+      "epoch": 0.13748,
+      "grad_norm": 0.8827667832374573,
+      "learning_rate": 0.003,
+      "loss": 4.0478,
+      "step": 13748
+    },
+    {
+      "epoch": 0.13749,
+      "grad_norm": 0.7677375078201294,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 13749
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.6403930187225342,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 13750
+    },
+    {
+      "epoch": 0.13751,
+      "grad_norm": 0.5701520442962646,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 13751
+    },
+    {
+      "epoch": 0.13752,
+      "grad_norm": 0.6364880204200745,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 13752
+    },
+    {
+      "epoch": 0.13753,
+      "grad_norm": 0.7677833437919617,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 13753
+    },
+    {
+      "epoch": 0.13754,
+      "grad_norm": 0.8433175683021545,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 13754
+    },
+    {
+      "epoch": 0.13755,
+      "grad_norm": 0.8671547770500183,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 13755
+    },
+    {
+      "epoch": 0.13756,
+      "grad_norm": 0.7540720701217651,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 13756
+    },
+    {
+      "epoch": 0.13757,
+      "grad_norm": 0.7149525284767151,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 13757
+    },
+    {
+      "epoch": 0.13758,
+      "grad_norm": 0.658466100692749,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 13758
+    },
+    {
+      "epoch": 0.13759,
+      "grad_norm": 0.6632937788963318,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 13759
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.7641159892082214,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 13760
+    },
+    {
+      "epoch": 0.13761,
+      "grad_norm": 0.8422225117683411,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 13761
+    },
+    {
+      "epoch": 0.13762,
+      "grad_norm": 0.9423866868019104,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 13762
+    },
+    {
+      "epoch": 0.13763,
+      "grad_norm": 1.0230610370635986,
+      "learning_rate": 0.003,
+      "loss": 4.0546,
+      "step": 13763
+    },
+    {
+      "epoch": 0.13764,
+      "grad_norm": 1.2755424976348877,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 13764
+    },
+    {
+      "epoch": 0.13765,
+      "grad_norm": 0.8471209406852722,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 13765
+    },
+    {
+      "epoch": 0.13766,
+      "grad_norm": 0.8734785318374634,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 13766
+    },
+    {
+      "epoch": 0.13767,
+      "grad_norm": 0.7789347171783447,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 13767
+    },
+    {
+      "epoch": 0.13768,
+      "grad_norm": 0.9051489233970642,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 13768
+    },
+    {
+      "epoch": 0.13769,
+      "grad_norm": 0.7554457187652588,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 13769
+    },
+    {
+      "epoch": 0.1377,
+      "grad_norm": 0.7369177937507629,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 13770
+    },
+    {
+      "epoch": 0.13771,
+      "grad_norm": 1.0584901571273804,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 13771
+    },
+    {
+      "epoch": 0.13772,
+      "grad_norm": 1.3210872411727905,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 13772
+    },
+    {
+      "epoch": 0.13773,
+      "grad_norm": 0.5406081676483154,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 13773
+    },
+    {
+      "epoch": 0.13774,
+      "grad_norm": 0.6872454881668091,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 13774
+    },
+    {
+      "epoch": 0.13775,
+      "grad_norm": 0.7570769190788269,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 13775
+    },
+    {
+      "epoch": 0.13776,
+      "grad_norm": 0.9164384603500366,
+      "learning_rate": 0.003,
+      "loss": 3.9673,
+      "step": 13776
+    },
+    {
+      "epoch": 0.13777,
+      "grad_norm": 1.1320087909698486,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 13777
+    },
+    {
+      "epoch": 0.13778,
+      "grad_norm": 0.8633060455322266,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 13778
+    },
+    {
+      "epoch": 0.13779,
+      "grad_norm": 0.7784515023231506,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 13779
+    },
+    {
+      "epoch": 0.1378,
+      "grad_norm": 0.7161611318588257,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 13780
+    },
+    {
+      "epoch": 0.13781,
+      "grad_norm": 0.7441836595535278,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 13781
+    },
+    {
+      "epoch": 0.13782,
+      "grad_norm": 0.711620032787323,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 13782
+    },
+    {
+      "epoch": 0.13783,
+      "grad_norm": 0.862492024898529,
+      "learning_rate": 0.003,
+      "loss": 4.0646,
+      "step": 13783
+    },
+    {
+      "epoch": 0.13784,
+      "grad_norm": 0.9315939545631409,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 13784
+    },
+    {
+      "epoch": 0.13785,
+      "grad_norm": 1.0295634269714355,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 13785
+    },
+    {
+      "epoch": 0.13786,
+      "grad_norm": 0.981230616569519,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 13786
+    },
+    {
+      "epoch": 0.13787,
+      "grad_norm": 0.9886899590492249,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 13787
+    },
+    {
+      "epoch": 0.13788,
+      "grad_norm": 0.9177408218383789,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 13788
+    },
+    {
+      "epoch": 0.13789,
+      "grad_norm": 0.8614755868911743,
+      "learning_rate": 0.003,
+      "loss": 4.0509,
+      "step": 13789
+    },
+    {
+      "epoch": 0.1379,
+      "grad_norm": 0.7738600373268127,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13790
+    },
+    {
+      "epoch": 0.13791,
+      "grad_norm": 0.7703483700752258,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 13791
+    },
+    {
+      "epoch": 0.13792,
+      "grad_norm": 0.7503597736358643,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 13792
+    },
+    {
+      "epoch": 0.13793,
+      "grad_norm": 0.8256163001060486,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 13793
+    },
+    {
+      "epoch": 0.13794,
+      "grad_norm": 0.7746387720108032,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 13794
+    },
+    {
+      "epoch": 0.13795,
+      "grad_norm": 0.7241289615631104,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 13795
+    },
+    {
+      "epoch": 0.13796,
+      "grad_norm": 0.6929647326469421,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 13796
+    },
+    {
+      "epoch": 0.13797,
+      "grad_norm": 0.7155910730361938,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 13797
+    },
+    {
+      "epoch": 0.13798,
+      "grad_norm": 0.8154659867286682,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13798
+    },
+    {
+      "epoch": 0.13799,
+      "grad_norm": 0.9948515892028809,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 13799
+    },
+    {
+      "epoch": 0.138,
+      "grad_norm": 1.161720633506775,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 13800
+    },
+    {
+      "epoch": 0.13801,
+      "grad_norm": 0.8052655458450317,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 13801
+    },
+    {
+      "epoch": 0.13802,
+      "grad_norm": 0.7105312347412109,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 13802
+    },
+    {
+      "epoch": 0.13803,
+      "grad_norm": 0.7463851571083069,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 13803
+    },
+    {
+      "epoch": 0.13804,
+      "grad_norm": 0.8518344163894653,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 13804
+    },
+    {
+      "epoch": 0.13805,
+      "grad_norm": 0.926234781742096,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 13805
+    },
+    {
+      "epoch": 0.13806,
+      "grad_norm": 1.0436285734176636,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 13806
+    },
+    {
+      "epoch": 0.13807,
+      "grad_norm": 1.0926405191421509,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 13807
+    },
+    {
+      "epoch": 0.13808,
+      "grad_norm": 0.9309552311897278,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 13808
+    },
+    {
+      "epoch": 0.13809,
+      "grad_norm": 0.7687803506851196,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 13809
+    },
+    {
+      "epoch": 0.1381,
+      "grad_norm": 0.6450755000114441,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 13810
+    },
+    {
+      "epoch": 0.13811,
+      "grad_norm": 0.6558342576026917,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 13811
+    },
+    {
+      "epoch": 0.13812,
+      "grad_norm": 0.7857155203819275,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 13812
+    },
+    {
+      "epoch": 0.13813,
+      "grad_norm": 0.9008277058601379,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 13813
+    },
+    {
+      "epoch": 0.13814,
+      "grad_norm": 1.1033518314361572,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 13814
+    },
+    {
+      "epoch": 0.13815,
+      "grad_norm": 0.8633435964584351,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 13815
+    },
+    {
+      "epoch": 0.13816,
+      "grad_norm": 0.5964910984039307,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 13816
+    },
+    {
+      "epoch": 0.13817,
+      "grad_norm": 0.5598768591880798,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 13817
+    },
+    {
+      "epoch": 0.13818,
+      "grad_norm": 0.5990583896636963,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 13818
+    },
+    {
+      "epoch": 0.13819,
+      "grad_norm": 0.5614003539085388,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 13819
+    },
+    {
+      "epoch": 0.1382,
+      "grad_norm": 0.5601891279220581,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 13820
+    },
+    {
+      "epoch": 0.13821,
+      "grad_norm": 0.6764342784881592,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 13821
+    },
+    {
+      "epoch": 0.13822,
+      "grad_norm": 0.7970566153526306,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 13822
+    },
+    {
+      "epoch": 0.13823,
+      "grad_norm": 1.0505768060684204,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 13823
+    },
+    {
+      "epoch": 0.13824,
+      "grad_norm": 1.0456247329711914,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 13824
+    },
+    {
+      "epoch": 0.13825,
+      "grad_norm": 0.8650031685829163,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 13825
+    },
+    {
+      "epoch": 0.13826,
+      "grad_norm": 0.7379238605499268,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 13826
+    },
+    {
+      "epoch": 0.13827,
+      "grad_norm": 0.7937077283859253,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 13827
+    },
+    {
+      "epoch": 0.13828,
+      "grad_norm": 0.8907710909843445,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 13828
+    },
+    {
+      "epoch": 0.13829,
+      "grad_norm": 0.9939072728157043,
+      "learning_rate": 0.003,
+      "loss": 3.9794,
+      "step": 13829
+    },
+    {
+      "epoch": 0.1383,
+      "grad_norm": 1.0405255556106567,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 13830
+    },
+    {
+      "epoch": 0.13831,
+      "grad_norm": 0.8047432899475098,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 13831
+    },
+    {
+      "epoch": 0.13832,
+      "grad_norm": 0.7326861619949341,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 13832
+    },
+    {
+      "epoch": 0.13833,
+      "grad_norm": 0.8576661944389343,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 13833
+    },
+    {
+      "epoch": 0.13834,
+      "grad_norm": 0.8865624666213989,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 13834
+    },
+    {
+      "epoch": 0.13835,
+      "grad_norm": 1.0433152914047241,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 13835
+    },
+    {
+      "epoch": 0.13836,
+      "grad_norm": 1.1211081743240356,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 13836
+    },
+    {
+      "epoch": 0.13837,
+      "grad_norm": 0.8115255236625671,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 13837
+    },
+    {
+      "epoch": 0.13838,
+      "grad_norm": 0.7494043707847595,
+      "learning_rate": 0.003,
+      "loss": 3.9729,
+      "step": 13838
+    },
+    {
+      "epoch": 0.13839,
+      "grad_norm": 0.7975366115570068,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 13839
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.9419084191322327,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 13840
+    },
+    {
+      "epoch": 0.13841,
+      "grad_norm": 1.0205785036087036,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 13841
+    },
+    {
+      "epoch": 0.13842,
+      "grad_norm": 0.9009459018707275,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 13842
+    },
+    {
+      "epoch": 0.13843,
+      "grad_norm": 0.8871148228645325,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 13843
+    },
+    {
+      "epoch": 0.13844,
+      "grad_norm": 0.8316652774810791,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 13844
+    },
+    {
+      "epoch": 0.13845,
+      "grad_norm": 0.8803573250770569,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 13845
+    },
+    {
+      "epoch": 0.13846,
+      "grad_norm": 0.9513351917266846,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 13846
+    },
+    {
+      "epoch": 0.13847,
+      "grad_norm": 0.9584240317344666,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 13847
+    },
+    {
+      "epoch": 0.13848,
+      "grad_norm": 0.9925170540809631,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 13848
+    },
+    {
+      "epoch": 0.13849,
+      "grad_norm": 1.1117236614227295,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 13849
+    },
+    {
+      "epoch": 0.1385,
+      "grad_norm": 0.8875993490219116,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 13850
+    },
+    {
+      "epoch": 0.13851,
+      "grad_norm": 1.1059731245040894,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 13851
+    },
+    {
+      "epoch": 0.13852,
+      "grad_norm": 1.0340826511383057,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 13852
+    },
+    {
+      "epoch": 0.13853,
+      "grad_norm": 0.8967835307121277,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 13853
+    },
+    {
+      "epoch": 0.13854,
+      "grad_norm": 0.7612320184707642,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 13854
+    },
+    {
+      "epoch": 0.13855,
+      "grad_norm": 0.7583867907524109,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 13855
+    },
+    {
+      "epoch": 0.13856,
+      "grad_norm": 0.8329528570175171,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 13856
+    },
+    {
+      "epoch": 0.13857,
+      "grad_norm": 0.8736461997032166,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 13857
+    },
+    {
+      "epoch": 0.13858,
+      "grad_norm": 1.0216503143310547,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 13858
+    },
+    {
+      "epoch": 0.13859,
+      "grad_norm": 0.9922423958778381,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 13859
+    },
+    {
+      "epoch": 0.1386,
+      "grad_norm": 0.7935214042663574,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 13860
+    },
+    {
+      "epoch": 0.13861,
+      "grad_norm": 0.7321152687072754,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 13861
+    },
+    {
+      "epoch": 0.13862,
+      "grad_norm": 0.6173765659332275,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 13862
+    },
+    {
+      "epoch": 0.13863,
+      "grad_norm": 0.5426373481750488,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 13863
+    },
+    {
+      "epoch": 0.13864,
+      "grad_norm": 0.5262988209724426,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 13864
+    },
+    {
+      "epoch": 0.13865,
+      "grad_norm": 0.49531736969947815,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 13865
+    },
+    {
+      "epoch": 0.13866,
+      "grad_norm": 0.6078068017959595,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 13866
+    },
+    {
+      "epoch": 0.13867,
+      "grad_norm": 0.7791035175323486,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 13867
+    },
+    {
+      "epoch": 0.13868,
+      "grad_norm": 1.068627953529358,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 13868
+    },
+    {
+      "epoch": 0.13869,
+      "grad_norm": 1.1645622253417969,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 13869
+    },
+    {
+      "epoch": 0.1387,
+      "grad_norm": 0.6107794642448425,
+      "learning_rate": 0.003,
+      "loss": 3.9806,
+      "step": 13870
+    },
+    {
+      "epoch": 0.13871,
+      "grad_norm": 0.6094905734062195,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 13871
+    },
+    {
+      "epoch": 0.13872,
+      "grad_norm": 0.7466140985488892,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 13872
+    },
+    {
+      "epoch": 0.13873,
+      "grad_norm": 0.7020432949066162,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 13873
+    },
+    {
+      "epoch": 0.13874,
+      "grad_norm": 0.7007889747619629,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 13874
+    },
+    {
+      "epoch": 0.13875,
+      "grad_norm": 0.6750438809394836,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 13875
+    },
+    {
+      "epoch": 0.13876,
+      "grad_norm": 0.5738099217414856,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 13876
+    },
+    {
+      "epoch": 0.13877,
+      "grad_norm": 0.6238150596618652,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 13877
+    },
+    {
+      "epoch": 0.13878,
+      "grad_norm": 0.6997177600860596,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 13878
+    },
+    {
+      "epoch": 0.13879,
+      "grad_norm": 0.7597888708114624,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 13879
+    },
+    {
+      "epoch": 0.1388,
+      "grad_norm": 0.7653550505638123,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 13880
+    },
+    {
+      "epoch": 0.13881,
+      "grad_norm": 0.7531750798225403,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 13881
+    },
+    {
+      "epoch": 0.13882,
+      "grad_norm": 0.7508596181869507,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 13882
+    },
+    {
+      "epoch": 0.13883,
+      "grad_norm": 0.762292742729187,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 13883
+    },
+    {
+      "epoch": 0.13884,
+      "grad_norm": 0.8650839328765869,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 13884
+    },
+    {
+      "epoch": 0.13885,
+      "grad_norm": 0.9082847237586975,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 13885
+    },
+    {
+      "epoch": 0.13886,
+      "grad_norm": 1.0475541353225708,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 13886
+    },
+    {
+      "epoch": 0.13887,
+      "grad_norm": 0.8007717728614807,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 13887
+    },
+    {
+      "epoch": 0.13888,
+      "grad_norm": 0.5885525941848755,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 13888
+    },
+    {
+      "epoch": 0.13889,
+      "grad_norm": 0.5717440247535706,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 13889
+    },
+    {
+      "epoch": 0.1389,
+      "grad_norm": 0.5407517552375793,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 13890
+    },
+    {
+      "epoch": 0.13891,
+      "grad_norm": 0.5301905274391174,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 13891
+    },
+    {
+      "epoch": 0.13892,
+      "grad_norm": 0.6643052697181702,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 13892
+    },
+    {
+      "epoch": 0.13893,
+      "grad_norm": 0.7942160367965698,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 13893
+    },
+    {
+      "epoch": 0.13894,
+      "grad_norm": 1.1539918184280396,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 13894
+    },
+    {
+      "epoch": 0.13895,
+      "grad_norm": 1.0380985736846924,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 13895
+    },
+    {
+      "epoch": 0.13896,
+      "grad_norm": 1.0376297235488892,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 13896
+    },
+    {
+      "epoch": 0.13897,
+      "grad_norm": 0.9031848311424255,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 13897
+    },
+    {
+      "epoch": 0.13898,
+      "grad_norm": 0.782201886177063,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 13898
+    },
+    {
+      "epoch": 0.13899,
+      "grad_norm": 0.8607664108276367,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 13899
+    },
+    {
+      "epoch": 0.139,
+      "grad_norm": 0.9175698161125183,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 13900
+    },
+    {
+      "epoch": 0.13901,
+      "grad_norm": 0.8617496490478516,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 13901
+    },
+    {
+      "epoch": 0.13902,
+      "grad_norm": 1.0018264055252075,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 13902
+    },
+    {
+      "epoch": 0.13903,
+      "grad_norm": 1.1453460454940796,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 13903
+    },
+    {
+      "epoch": 0.13904,
+      "grad_norm": 0.8639815449714661,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 13904
+    },
+    {
+      "epoch": 0.13905,
+      "grad_norm": 0.874984860420227,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 13905
+    },
+    {
+      "epoch": 0.13906,
+      "grad_norm": 0.9996791481971741,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 13906
+    },
+    {
+      "epoch": 0.13907,
+      "grad_norm": 1.052925944328308,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 13907
+    },
+    {
+      "epoch": 0.13908,
+      "grad_norm": 0.8876622915267944,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 13908
+    },
+    {
+      "epoch": 0.13909,
+      "grad_norm": 0.859562873840332,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 13909
+    },
+    {
+      "epoch": 0.1391,
+      "grad_norm": 0.8640323877334595,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 13910
+    },
+    {
+      "epoch": 0.13911,
+      "grad_norm": 0.8964762091636658,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 13911
+    },
+    {
+      "epoch": 0.13912,
+      "grad_norm": 0.9361962676048279,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 13912
+    },
+    {
+      "epoch": 0.13913,
+      "grad_norm": 0.9901366829872131,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 13913
+    },
+    {
+      "epoch": 0.13914,
+      "grad_norm": 0.8960790634155273,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 13914
+    },
+    {
+      "epoch": 0.13915,
+      "grad_norm": 0.8571736812591553,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 13915
+    },
+    {
+      "epoch": 0.13916,
+      "grad_norm": 0.7628933191299438,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 13916
+    },
+    {
+      "epoch": 0.13917,
+      "grad_norm": 0.6896006464958191,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 13917
+    },
+    {
+      "epoch": 0.13918,
+      "grad_norm": 0.7465510368347168,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 13918
+    },
+    {
+      "epoch": 0.13919,
+      "grad_norm": 0.6486213803291321,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 13919
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.6836336851119995,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 13920
+    },
+    {
+      "epoch": 0.13921,
+      "grad_norm": 0.7179591655731201,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 13921
+    },
+    {
+      "epoch": 0.13922,
+      "grad_norm": 0.9003704190254211,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 13922
+    },
+    {
+      "epoch": 0.13923,
+      "grad_norm": 1.1060301065444946,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 13923
+    },
+    {
+      "epoch": 0.13924,
+      "grad_norm": 0.8880802989006042,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 13924
+    },
+    {
+      "epoch": 0.13925,
+      "grad_norm": 0.6416323184967041,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 13925
+    },
+    {
+      "epoch": 0.13926,
+      "grad_norm": 0.5431221127510071,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 13926
+    },
+    {
+      "epoch": 0.13927,
+      "grad_norm": 0.6355161666870117,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 13927
+    },
+    {
+      "epoch": 0.13928,
+      "grad_norm": 0.7978395819664001,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 13928
+    },
+    {
+      "epoch": 0.13929,
+      "grad_norm": 1.029797077178955,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 13929
+    },
+    {
+      "epoch": 0.1393,
+      "grad_norm": 1.049689531326294,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 13930
+    },
+    {
+      "epoch": 0.13931,
+      "grad_norm": 0.8194695115089417,
+      "learning_rate": 0.003,
+      "loss": 3.9472,
+      "step": 13931
+    },
+    {
+      "epoch": 0.13932,
+      "grad_norm": 0.7146439552307129,
+      "learning_rate": 0.003,
+      "loss": 3.9691,
+      "step": 13932
+    },
+    {
+      "epoch": 0.13933,
+      "grad_norm": 0.7814133167266846,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 13933
+    },
+    {
+      "epoch": 0.13934,
+      "grad_norm": 0.9935269951820374,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 13934
+    },
+    {
+      "epoch": 0.13935,
+      "grad_norm": 0.9926442503929138,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 13935
+    },
+    {
+      "epoch": 0.13936,
+      "grad_norm": 0.7810591459274292,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 13936
+    },
+    {
+      "epoch": 0.13937,
+      "grad_norm": 0.6529386043548584,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 13937
+    },
+    {
+      "epoch": 0.13938,
+      "grad_norm": 0.6165884137153625,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 13938
+    },
+    {
+      "epoch": 0.13939,
+      "grad_norm": 0.6040536165237427,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 13939
+    },
+    {
+      "epoch": 0.1394,
+      "grad_norm": 0.6759775280952454,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 13940
+    },
+    {
+      "epoch": 0.13941,
+      "grad_norm": 0.7123100161552429,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 13941
+    },
+    {
+      "epoch": 0.13942,
+      "grad_norm": 0.7399691939353943,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 13942
+    },
+    {
+      "epoch": 0.13943,
+      "grad_norm": 0.8146520853042603,
+      "learning_rate": 0.003,
+      "loss": 4.0588,
+      "step": 13943
+    },
+    {
+      "epoch": 0.13944,
+      "grad_norm": 0.7907037138938904,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 13944
+    },
+    {
+      "epoch": 0.13945,
+      "grad_norm": 0.8921141624450684,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 13945
+    },
+    {
+      "epoch": 0.13946,
+      "grad_norm": 0.9739939570426941,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 13946
+    },
+    {
+      "epoch": 0.13947,
+      "grad_norm": 0.9629883170127869,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 13947
+    },
+    {
+      "epoch": 0.13948,
+      "grad_norm": 0.9390729665756226,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 13948
+    },
+    {
+      "epoch": 0.13949,
+      "grad_norm": 0.9858075380325317,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 13949
+    },
+    {
+      "epoch": 0.1395,
+      "grad_norm": 0.9481609463691711,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 13950
+    },
+    {
+      "epoch": 0.13951,
+      "grad_norm": 1.255966305732727,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 13951
+    },
+    {
+      "epoch": 0.13952,
+      "grad_norm": 1.0052669048309326,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 13952
+    },
+    {
+      "epoch": 0.13953,
+      "grad_norm": 1.0262274742126465,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 13953
+    },
+    {
+      "epoch": 0.13954,
+      "grad_norm": 1.0653098821640015,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 13954
+    },
+    {
+      "epoch": 0.13955,
+      "grad_norm": 0.9458314180374146,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 13955
+    },
+    {
+      "epoch": 0.13956,
+      "grad_norm": 0.7262864112854004,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 13956
+    },
+    {
+      "epoch": 0.13957,
+      "grad_norm": 0.6911898851394653,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 13957
+    },
+    {
+      "epoch": 0.13958,
+      "grad_norm": 0.7391590476036072,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 13958
+    },
+    {
+      "epoch": 0.13959,
+      "grad_norm": 0.7500927448272705,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 13959
+    },
+    {
+      "epoch": 0.1396,
+      "grad_norm": 0.7304272055625916,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 13960
+    },
+    {
+      "epoch": 0.13961,
+      "grad_norm": 0.786148726940155,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 13961
+    },
+    {
+      "epoch": 0.13962,
+      "grad_norm": 0.7240628600120544,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 13962
+    },
+    {
+      "epoch": 0.13963,
+      "grad_norm": 0.7036923170089722,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 13963
+    },
+    {
+      "epoch": 0.13964,
+      "grad_norm": 0.7554377913475037,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 13964
+    },
+    {
+      "epoch": 0.13965,
+      "grad_norm": 0.8028139472007751,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 13965
+    },
+    {
+      "epoch": 0.13966,
+      "grad_norm": 0.7784460783004761,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 13966
+    },
+    {
+      "epoch": 0.13967,
+      "grad_norm": 0.6501883268356323,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 13967
+    },
+    {
+      "epoch": 0.13968,
+      "grad_norm": 0.6861026883125305,
+      "learning_rate": 0.003,
+      "loss": 3.9758,
+      "step": 13968
+    },
+    {
+      "epoch": 0.13969,
+      "grad_norm": 0.7006701231002808,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 13969
+    },
+    {
+      "epoch": 0.1397,
+      "grad_norm": 0.5880072712898254,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 13970
+    },
+    {
+      "epoch": 0.13971,
+      "grad_norm": 0.6021777391433716,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 13971
+    },
+    {
+      "epoch": 0.13972,
+      "grad_norm": 0.6727344393730164,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 13972
+    },
+    {
+      "epoch": 0.13973,
+      "grad_norm": 0.8657001256942749,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 13973
+    },
+    {
+      "epoch": 0.13974,
+      "grad_norm": 1.2038829326629639,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 13974
+    },
+    {
+      "epoch": 0.13975,
+      "grad_norm": 1.0898510217666626,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 13975
+    },
+    {
+      "epoch": 0.13976,
+      "grad_norm": 0.7038004994392395,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 13976
+    },
+    {
+      "epoch": 0.13977,
+      "grad_norm": 0.5771560072898865,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 13977
+    },
+    {
+      "epoch": 0.13978,
+      "grad_norm": 0.7867242693901062,
+      "learning_rate": 0.003,
+      "loss": 4.0625,
+      "step": 13978
+    },
+    {
+      "epoch": 0.13979,
+      "grad_norm": 0.9191820621490479,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 13979
+    },
+    {
+      "epoch": 0.1398,
+      "grad_norm": 1.0388456583023071,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 13980
+    },
+    {
+      "epoch": 0.13981,
+      "grad_norm": 0.8392547369003296,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 13981
+    },
+    {
+      "epoch": 0.13982,
+      "grad_norm": 0.7875909805297852,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 13982
+    },
+    {
+      "epoch": 0.13983,
+      "grad_norm": 0.9248232245445251,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 13983
+    },
+    {
+      "epoch": 0.13984,
+      "grad_norm": 0.9973124861717224,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 13984
+    },
+    {
+      "epoch": 0.13985,
+      "grad_norm": 0.9626933932304382,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 13985
+    },
+    {
+      "epoch": 0.13986,
+      "grad_norm": 0.8206491470336914,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 13986
+    },
+    {
+      "epoch": 0.13987,
+      "grad_norm": 0.8485277891159058,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 13987
+    },
+    {
+      "epoch": 0.13988,
+      "grad_norm": 1.0558282136917114,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 13988
+    },
+    {
+      "epoch": 0.13989,
+      "grad_norm": 1.1129555702209473,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 13989
+    },
+    {
+      "epoch": 0.1399,
+      "grad_norm": 0.8321244120597839,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 13990
+    },
+    {
+      "epoch": 0.13991,
+      "grad_norm": 0.7786542773246765,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 13991
+    },
+    {
+      "epoch": 0.13992,
+      "grad_norm": 0.6897526383399963,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 13992
+    },
+    {
+      "epoch": 0.13993,
+      "grad_norm": 0.7743588089942932,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 13993
+    },
+    {
+      "epoch": 0.13994,
+      "grad_norm": 0.8591880798339844,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 13994
+    },
+    {
+      "epoch": 0.13995,
+      "grad_norm": 0.8250160813331604,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 13995
+    },
+    {
+      "epoch": 0.13996,
+      "grad_norm": 0.7697608470916748,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 13996
+    },
+    {
+      "epoch": 0.13997,
+      "grad_norm": 0.7532920241355896,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 13997
+    },
+    {
+      "epoch": 0.13998,
+      "grad_norm": 0.7935050129890442,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 13998
+    },
+    {
+      "epoch": 0.13999,
+      "grad_norm": 0.8093975186347961,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 13999
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6865020990371704,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 14000
+    },
+    {
+      "epoch": 0.14001,
+      "grad_norm": 0.6700130105018616,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 14001
+    },
+    {
+      "epoch": 0.14002,
+      "grad_norm": 0.6797581911087036,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 14002
+    },
+    {
+      "epoch": 0.14003,
+      "grad_norm": 0.6515060663223267,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 14003
+    },
+    {
+      "epoch": 0.14004,
+      "grad_norm": 0.8096653819084167,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 14004
+    },
+    {
+      "epoch": 0.14005,
+      "grad_norm": 0.997570276260376,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 14005
+    },
+    {
+      "epoch": 0.14006,
+      "grad_norm": 1.2135899066925049,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 14006
+    },
+    {
+      "epoch": 0.14007,
+      "grad_norm": 0.7099087834358215,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 14007
+    },
+    {
+      "epoch": 0.14008,
+      "grad_norm": 0.7432337403297424,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 14008
+    },
+    {
+      "epoch": 0.14009,
+      "grad_norm": 0.8485451936721802,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 14009
+    },
+    {
+      "epoch": 0.1401,
+      "grad_norm": 0.7696965932846069,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 14010
+    },
+    {
+      "epoch": 0.14011,
+      "grad_norm": 0.7383472323417664,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 14011
+    },
+    {
+      "epoch": 0.14012,
+      "grad_norm": 0.8596447110176086,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 14012
+    },
+    {
+      "epoch": 0.14013,
+      "grad_norm": 1.019231915473938,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 14013
+    },
+    {
+      "epoch": 0.14014,
+      "grad_norm": 1.083799123764038,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 14014
+    },
+    {
+      "epoch": 0.14015,
+      "grad_norm": 0.7324127554893494,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 14015
+    },
+    {
+      "epoch": 0.14016,
+      "grad_norm": 0.6100013852119446,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 14016
+    },
+    {
+      "epoch": 0.14017,
+      "grad_norm": 0.5659640431404114,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 14017
+    },
+    {
+      "epoch": 0.14018,
+      "grad_norm": 0.5738541483879089,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 14018
+    },
+    {
+      "epoch": 0.14019,
+      "grad_norm": 0.5823083519935608,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 14019
+    },
+    {
+      "epoch": 0.1402,
+      "grad_norm": 0.575903594493866,
+      "learning_rate": 0.003,
+      "loss": 3.9731,
+      "step": 14020
+    },
+    {
+      "epoch": 0.14021,
+      "grad_norm": 0.585667610168457,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 14021
+    },
+    {
+      "epoch": 0.14022,
+      "grad_norm": 0.6648114323616028,
+      "learning_rate": 0.003,
+      "loss": 3.9828,
+      "step": 14022
+    },
+    {
+      "epoch": 0.14023,
+      "grad_norm": 0.6566023826599121,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 14023
+    },
+    {
+      "epoch": 0.14024,
+      "grad_norm": 0.7724025249481201,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 14024
+    },
+    {
+      "epoch": 0.14025,
+      "grad_norm": 0.9104265570640564,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 14025
+    },
+    {
+      "epoch": 0.14026,
+      "grad_norm": 0.9596461057662964,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14026
+    },
+    {
+      "epoch": 0.14027,
+      "grad_norm": 1.0541963577270508,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 14027
+    },
+    {
+      "epoch": 0.14028,
+      "grad_norm": 0.9626950025558472,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 14028
+    },
+    {
+      "epoch": 0.14029,
+      "grad_norm": 0.9601374268531799,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 14029
+    },
+    {
+      "epoch": 0.1403,
+      "grad_norm": 0.92144376039505,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 14030
+    },
+    {
+      "epoch": 0.14031,
+      "grad_norm": 0.8757461905479431,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 14031
+    },
+    {
+      "epoch": 0.14032,
+      "grad_norm": 0.8189901113510132,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 14032
+    },
+    {
+      "epoch": 0.14033,
+      "grad_norm": 0.8232151865959167,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 14033
+    },
+    {
+      "epoch": 0.14034,
+      "grad_norm": 0.8041811585426331,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 14034
+    },
+    {
+      "epoch": 0.14035,
+      "grad_norm": 0.895202100276947,
+      "learning_rate": 0.003,
+      "loss": 4.0484,
+      "step": 14035
+    },
+    {
+      "epoch": 0.14036,
+      "grad_norm": 0.7802772521972656,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 14036
+    },
+    {
+      "epoch": 0.14037,
+      "grad_norm": 0.8878588676452637,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 14037
+    },
+    {
+      "epoch": 0.14038,
+      "grad_norm": 0.8776265382766724,
+      "learning_rate": 0.003,
+      "loss": 4.0574,
+      "step": 14038
+    },
+    {
+      "epoch": 0.14039,
+      "grad_norm": 0.7140911221504211,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 14039
+    },
+    {
+      "epoch": 0.1404,
+      "grad_norm": 0.7636114358901978,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 14040
+    },
+    {
+      "epoch": 0.14041,
+      "grad_norm": 0.8802103400230408,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 14041
+    },
+    {
+      "epoch": 0.14042,
+      "grad_norm": 0.8500376343727112,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 14042
+    },
+    {
+      "epoch": 0.14043,
+      "grad_norm": 0.9757203459739685,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 14043
+    },
+    {
+      "epoch": 0.14044,
+      "grad_norm": 1.1028854846954346,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 14044
+    },
+    {
+      "epoch": 0.14045,
+      "grad_norm": 1.1399049758911133,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 14045
+    },
+    {
+      "epoch": 0.14046,
+      "grad_norm": 0.7455863356590271,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 14046
+    },
+    {
+      "epoch": 0.14047,
+      "grad_norm": 0.6432068347930908,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 14047
+    },
+    {
+      "epoch": 0.14048,
+      "grad_norm": 0.6292878985404968,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 14048
+    },
+    {
+      "epoch": 0.14049,
+      "grad_norm": 0.6780155897140503,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 14049
+    },
+    {
+      "epoch": 0.1405,
+      "grad_norm": 0.8050853610038757,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 14050
+    },
+    {
+      "epoch": 0.14051,
+      "grad_norm": 0.8014886975288391,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 14051
+    },
+    {
+      "epoch": 0.14052,
+      "grad_norm": 0.8241047859191895,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 14052
+    },
+    {
+      "epoch": 0.14053,
+      "grad_norm": 0.8780802488327026,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 14053
+    },
+    {
+      "epoch": 0.14054,
+      "grad_norm": 0.9953709840774536,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 14054
+    },
+    {
+      "epoch": 0.14055,
+      "grad_norm": 1.020336389541626,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 14055
+    },
+    {
+      "epoch": 0.14056,
+      "grad_norm": 0.9320582151412964,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 14056
+    },
+    {
+      "epoch": 0.14057,
+      "grad_norm": 0.7158468961715698,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 14057
+    },
+    {
+      "epoch": 0.14058,
+      "grad_norm": 0.6542931795120239,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 14058
+    },
+    {
+      "epoch": 0.14059,
+      "grad_norm": 0.5777246952056885,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 14059
+    },
+    {
+      "epoch": 0.1406,
+      "grad_norm": 0.5632390379905701,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 14060
+    },
+    {
+      "epoch": 0.14061,
+      "grad_norm": 0.6434266567230225,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 14061
+    },
+    {
+      "epoch": 0.14062,
+      "grad_norm": 0.6584779024124146,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 14062
+    },
+    {
+      "epoch": 0.14063,
+      "grad_norm": 0.8279368281364441,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 14063
+    },
+    {
+      "epoch": 0.14064,
+      "grad_norm": 1.1137088537216187,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 14064
+    },
+    {
+      "epoch": 0.14065,
+      "grad_norm": 0.9908633828163147,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 14065
+    },
+    {
+      "epoch": 0.14066,
+      "grad_norm": 0.814708411693573,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 14066
+    },
+    {
+      "epoch": 0.14067,
+      "grad_norm": 0.6335097551345825,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 14067
+    },
+    {
+      "epoch": 0.14068,
+      "grad_norm": 0.6922863721847534,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 14068
+    },
+    {
+      "epoch": 0.14069,
+      "grad_norm": 0.8013014197349548,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 14069
+    },
+    {
+      "epoch": 0.1407,
+      "grad_norm": 0.8348655700683594,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 14070
+    },
+    {
+      "epoch": 0.14071,
+      "grad_norm": 0.8111964464187622,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 14071
+    },
+    {
+      "epoch": 0.14072,
+      "grad_norm": 0.8467102646827698,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 14072
+    },
+    {
+      "epoch": 0.14073,
+      "grad_norm": 1.0120962858200073,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 14073
+    },
+    {
+      "epoch": 0.14074,
+      "grad_norm": 1.0943655967712402,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 14074
+    },
+    {
+      "epoch": 0.14075,
+      "grad_norm": 1.0920785665512085,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 14075
+    },
+    {
+      "epoch": 0.14076,
+      "grad_norm": 0.9106650948524475,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 14076
+    },
+    {
+      "epoch": 0.14077,
+      "grad_norm": 0.8137316107749939,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 14077
+    },
+    {
+      "epoch": 0.14078,
+      "grad_norm": 0.8589572310447693,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 14078
+    },
+    {
+      "epoch": 0.14079,
+      "grad_norm": 0.8999586701393127,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 14079
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.6625589728355408,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 14080
+    },
+    {
+      "epoch": 0.14081,
+      "grad_norm": 0.6258305311203003,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 14081
+    },
+    {
+      "epoch": 0.14082,
+      "grad_norm": 0.653458297252655,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 14082
+    },
+    {
+      "epoch": 0.14083,
+      "grad_norm": 0.6369495987892151,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 14083
+    },
+    {
+      "epoch": 0.14084,
+      "grad_norm": 0.6380630731582642,
+      "learning_rate": 0.003,
+      "loss": 3.9717,
+      "step": 14084
+    },
+    {
+      "epoch": 0.14085,
+      "grad_norm": 0.6367437839508057,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 14085
+    },
+    {
+      "epoch": 0.14086,
+      "grad_norm": 0.6577797532081604,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 14086
+    },
+    {
+      "epoch": 0.14087,
+      "grad_norm": 0.6576122641563416,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 14087
+    },
+    {
+      "epoch": 0.14088,
+      "grad_norm": 0.6562979817390442,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 14088
+    },
+    {
+      "epoch": 0.14089,
+      "grad_norm": 0.7040120959281921,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 14089
+    },
+    {
+      "epoch": 0.1409,
+      "grad_norm": 0.7400263547897339,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 14090
+    },
+    {
+      "epoch": 0.14091,
+      "grad_norm": 1.0668511390686035,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 14091
+    },
+    {
+      "epoch": 0.14092,
+      "grad_norm": 1.3164247274398804,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 14092
+    },
+    {
+      "epoch": 0.14093,
+      "grad_norm": 0.7127968072891235,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 14093
+    },
+    {
+      "epoch": 0.14094,
+      "grad_norm": 0.5614868998527527,
+      "learning_rate": 0.003,
+      "loss": 3.9759,
+      "step": 14094
+    },
+    {
+      "epoch": 0.14095,
+      "grad_norm": 0.603585958480835,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 14095
+    },
+    {
+      "epoch": 0.14096,
+      "grad_norm": 0.6809159517288208,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 14096
+    },
+    {
+      "epoch": 0.14097,
+      "grad_norm": 0.8328123688697815,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 14097
+    },
+    {
+      "epoch": 0.14098,
+      "grad_norm": 0.9252586364746094,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 14098
+    },
+    {
+      "epoch": 0.14099,
+      "grad_norm": 0.9418660402297974,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 14099
+    },
+    {
+      "epoch": 0.141,
+      "grad_norm": 0.8653552532196045,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 14100
+    },
+    {
+      "epoch": 0.14101,
+      "grad_norm": 0.9184288382530212,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 14101
+    },
+    {
+      "epoch": 0.14102,
+      "grad_norm": 1.0402677059173584,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 14102
+    },
+    {
+      "epoch": 0.14103,
+      "grad_norm": 1.1628098487854004,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 14103
+    },
+    {
+      "epoch": 0.14104,
+      "grad_norm": 0.7976023554801941,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 14104
+    },
+    {
+      "epoch": 0.14105,
+      "grad_norm": 0.6668494343757629,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 14105
+    },
+    {
+      "epoch": 0.14106,
+      "grad_norm": 0.6816210746765137,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 14106
+    },
+    {
+      "epoch": 0.14107,
+      "grad_norm": 0.7886625528335571,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 14107
+    },
+    {
+      "epoch": 0.14108,
+      "grad_norm": 0.7702372670173645,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14108
+    },
+    {
+      "epoch": 0.14109,
+      "grad_norm": 0.8172313570976257,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 14109
+    },
+    {
+      "epoch": 0.1411,
+      "grad_norm": 0.8660810589790344,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 14110
+    },
+    {
+      "epoch": 0.14111,
+      "grad_norm": 0.9497108459472656,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 14111
+    },
+    {
+      "epoch": 0.14112,
+      "grad_norm": 1.007104516029358,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 14112
+    },
+    {
+      "epoch": 0.14113,
+      "grad_norm": 1.1830443143844604,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 14113
+    },
+    {
+      "epoch": 0.14114,
+      "grad_norm": 0.8512828350067139,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 14114
+    },
+    {
+      "epoch": 0.14115,
+      "grad_norm": 0.6526487469673157,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 14115
+    },
+    {
+      "epoch": 0.14116,
+      "grad_norm": 0.6798277497291565,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 14116
+    },
+    {
+      "epoch": 0.14117,
+      "grad_norm": 0.7448130249977112,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 14117
+    },
+    {
+      "epoch": 0.14118,
+      "grad_norm": 0.7307347655296326,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 14118
+    },
+    {
+      "epoch": 0.14119,
+      "grad_norm": 0.7131176590919495,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 14119
+    },
+    {
+      "epoch": 0.1412,
+      "grad_norm": 0.7631412744522095,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 14120
+    },
+    {
+      "epoch": 0.14121,
+      "grad_norm": 0.9671663641929626,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 14121
+    },
+    {
+      "epoch": 0.14122,
+      "grad_norm": 1.2928788661956787,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 14122
+    },
+    {
+      "epoch": 0.14123,
+      "grad_norm": 0.661189615726471,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 14123
+    },
+    {
+      "epoch": 0.14124,
+      "grad_norm": 0.6652116179466248,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 14124
+    },
+    {
+      "epoch": 0.14125,
+      "grad_norm": 0.8989524245262146,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 14125
+    },
+    {
+      "epoch": 0.14126,
+      "grad_norm": 1.0922940969467163,
+      "learning_rate": 0.003,
+      "loss": 3.97,
+      "step": 14126
+    },
+    {
+      "epoch": 0.14127,
+      "grad_norm": 1.0371108055114746,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 14127
+    },
+    {
+      "epoch": 0.14128,
+      "grad_norm": 1.0985851287841797,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 14128
+    },
+    {
+      "epoch": 0.14129,
+      "grad_norm": 0.8594481945037842,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 14129
+    },
+    {
+      "epoch": 0.1413,
+      "grad_norm": 0.8762556910514832,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 14130
+    },
+    {
+      "epoch": 0.14131,
+      "grad_norm": 1.2780417203903198,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 14131
+    },
+    {
+      "epoch": 0.14132,
+      "grad_norm": 0.8962858319282532,
+      "learning_rate": 0.003,
+      "loss": 4.0481,
+      "step": 14132
+    },
+    {
+      "epoch": 0.14133,
+      "grad_norm": 0.6861137747764587,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 14133
+    },
+    {
+      "epoch": 0.14134,
+      "grad_norm": 0.637040913105011,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 14134
+    },
+    {
+      "epoch": 0.14135,
+      "grad_norm": 0.6423055529594421,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 14135
+    },
+    {
+      "epoch": 0.14136,
+      "grad_norm": 0.619636058807373,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 14136
+    },
+    {
+      "epoch": 0.14137,
+      "grad_norm": 0.5906887650489807,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 14137
+    },
+    {
+      "epoch": 0.14138,
+      "grad_norm": 0.5638257265090942,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 14138
+    },
+    {
+      "epoch": 0.14139,
+      "grad_norm": 0.5778006315231323,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 14139
+    },
+    {
+      "epoch": 0.1414,
+      "grad_norm": 0.5948111414909363,
+      "learning_rate": 0.003,
+      "loss": 3.9806,
+      "step": 14140
+    },
+    {
+      "epoch": 0.14141,
+      "grad_norm": 0.6248180866241455,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 14141
+    },
+    {
+      "epoch": 0.14142,
+      "grad_norm": 0.7527813911437988,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 14142
+    },
+    {
+      "epoch": 0.14143,
+      "grad_norm": 0.7687675952911377,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 14143
+    },
+    {
+      "epoch": 0.14144,
+      "grad_norm": 0.7792863249778748,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 14144
+    },
+    {
+      "epoch": 0.14145,
+      "grad_norm": 0.9133181571960449,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 14145
+    },
+    {
+      "epoch": 0.14146,
+      "grad_norm": 1.0635042190551758,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 14146
+    },
+    {
+      "epoch": 0.14147,
+      "grad_norm": 0.9550064206123352,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 14147
+    },
+    {
+      "epoch": 0.14148,
+      "grad_norm": 0.9956814646720886,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 14148
+    },
+    {
+      "epoch": 0.14149,
+      "grad_norm": 0.9905039072036743,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 14149
+    },
+    {
+      "epoch": 0.1415,
+      "grad_norm": 0.981390118598938,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 14150
+    },
+    {
+      "epoch": 0.14151,
+      "grad_norm": 0.8436192274093628,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 14151
+    },
+    {
+      "epoch": 0.14152,
+      "grad_norm": 0.8707951307296753,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 14152
+    },
+    {
+      "epoch": 0.14153,
+      "grad_norm": 0.9241832494735718,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 14153
+    },
+    {
+      "epoch": 0.14154,
+      "grad_norm": 0.9518395066261292,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 14154
+    },
+    {
+      "epoch": 0.14155,
+      "grad_norm": 1.0943546295166016,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 14155
+    },
+    {
+      "epoch": 0.14156,
+      "grad_norm": 0.8065584897994995,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 14156
+    },
+    {
+      "epoch": 0.14157,
+      "grad_norm": 0.7145382165908813,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 14157
+    },
+    {
+      "epoch": 0.14158,
+      "grad_norm": 0.8641098141670227,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 14158
+    },
+    {
+      "epoch": 0.14159,
+      "grad_norm": 0.7735960483551025,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 14159
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.7756565809249878,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 14160
+    },
+    {
+      "epoch": 0.14161,
+      "grad_norm": 0.7883841395378113,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 14161
+    },
+    {
+      "epoch": 0.14162,
+      "grad_norm": 0.7832257747650146,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 14162
+    },
+    {
+      "epoch": 0.14163,
+      "grad_norm": 0.8378828167915344,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 14163
+    },
+    {
+      "epoch": 0.14164,
+      "grad_norm": 0.8510462641716003,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 14164
+    },
+    {
+      "epoch": 0.14165,
+      "grad_norm": 0.9078649878501892,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 14165
+    },
+    {
+      "epoch": 0.14166,
+      "grad_norm": 0.8352487683296204,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 14166
+    },
+    {
+      "epoch": 0.14167,
+      "grad_norm": 0.7029122710227966,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 14167
+    },
+    {
+      "epoch": 0.14168,
+      "grad_norm": 0.7005764842033386,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 14168
+    },
+    {
+      "epoch": 0.14169,
+      "grad_norm": 0.699022650718689,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 14169
+    },
+    {
+      "epoch": 0.1417,
+      "grad_norm": 0.6358689069747925,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 14170
+    },
+    {
+      "epoch": 0.14171,
+      "grad_norm": 0.8033818006515503,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14171
+    },
+    {
+      "epoch": 0.14172,
+      "grad_norm": 1.1550252437591553,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 14172
+    },
+    {
+      "epoch": 0.14173,
+      "grad_norm": 1.2971549034118652,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 14173
+    },
+    {
+      "epoch": 0.14174,
+      "grad_norm": 0.7195215821266174,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 14174
+    },
+    {
+      "epoch": 0.14175,
+      "grad_norm": 0.5937075614929199,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 14175
+    },
+    {
+      "epoch": 0.14176,
+      "grad_norm": 0.7166873216629028,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 14176
+    },
+    {
+      "epoch": 0.14177,
+      "grad_norm": 0.7563587427139282,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 14177
+    },
+    {
+      "epoch": 0.14178,
+      "grad_norm": 0.9330248832702637,
+      "learning_rate": 0.003,
+      "loss": 3.9759,
+      "step": 14178
+    },
+    {
+      "epoch": 0.14179,
+      "grad_norm": 0.9753488302230835,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 14179
+    },
+    {
+      "epoch": 0.1418,
+      "grad_norm": 0.8488276600837708,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 14180
+    },
+    {
+      "epoch": 0.14181,
+      "grad_norm": 0.6426655650138855,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 14181
+    },
+    {
+      "epoch": 0.14182,
+      "grad_norm": 0.6621089577674866,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 14182
+    },
+    {
+      "epoch": 0.14183,
+      "grad_norm": 0.8314296007156372,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 14183
+    },
+    {
+      "epoch": 0.14184,
+      "grad_norm": 1.0920621156692505,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 14184
+    },
+    {
+      "epoch": 0.14185,
+      "grad_norm": 1.1957831382751465,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 14185
+    },
+    {
+      "epoch": 0.14186,
+      "grad_norm": 0.726463794708252,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 14186
+    },
+    {
+      "epoch": 0.14187,
+      "grad_norm": 0.7537030577659607,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 14187
+    },
+    {
+      "epoch": 0.14188,
+      "grad_norm": 0.7797044515609741,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 14188
+    },
+    {
+      "epoch": 0.14189,
+      "grad_norm": 0.7619060277938843,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 14189
+    },
+    {
+      "epoch": 0.1419,
+      "grad_norm": 0.7681478261947632,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 14190
+    },
+    {
+      "epoch": 0.14191,
+      "grad_norm": 0.737553060054779,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 14191
+    },
+    {
+      "epoch": 0.14192,
+      "grad_norm": 0.5535001754760742,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 14192
+    },
+    {
+      "epoch": 0.14193,
+      "grad_norm": 0.6274293661117554,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 14193
+    },
+    {
+      "epoch": 0.14194,
+      "grad_norm": 0.640678882598877,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 14194
+    },
+    {
+      "epoch": 0.14195,
+      "grad_norm": 0.6661068201065063,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 14195
+    },
+    {
+      "epoch": 0.14196,
+      "grad_norm": 0.7455634474754333,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 14196
+    },
+    {
+      "epoch": 0.14197,
+      "grad_norm": 0.8102667331695557,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 14197
+    },
+    {
+      "epoch": 0.14198,
+      "grad_norm": 0.8579203486442566,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 14198
+    },
+    {
+      "epoch": 0.14199,
+      "grad_norm": 1.149722695350647,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 14199
+    },
+    {
+      "epoch": 0.142,
+      "grad_norm": 1.1549619436264038,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 14200
+    },
+    {
+      "epoch": 0.14201,
+      "grad_norm": 0.8290915489196777,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 14201
+    },
+    {
+      "epoch": 0.14202,
+      "grad_norm": 0.706985354423523,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 14202
+    },
+    {
+      "epoch": 0.14203,
+      "grad_norm": 0.7158949971199036,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 14203
+    },
+    {
+      "epoch": 0.14204,
+      "grad_norm": 0.7190188765525818,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 14204
+    },
+    {
+      "epoch": 0.14205,
+      "grad_norm": 0.7959105968475342,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 14205
+    },
+    {
+      "epoch": 0.14206,
+      "grad_norm": 1.0919005870819092,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 14206
+    },
+    {
+      "epoch": 0.14207,
+      "grad_norm": 1.1565816402435303,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 14207
+    },
+    {
+      "epoch": 0.14208,
+      "grad_norm": 0.8859974145889282,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 14208
+    },
+    {
+      "epoch": 0.14209,
+      "grad_norm": 0.8769922852516174,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 14209
+    },
+    {
+      "epoch": 0.1421,
+      "grad_norm": 0.7491868138313293,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 14210
+    },
+    {
+      "epoch": 0.14211,
+      "grad_norm": 0.7215003371238708,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 14211
+    },
+    {
+      "epoch": 0.14212,
+      "grad_norm": 0.7080946564674377,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 14212
+    },
+    {
+      "epoch": 0.14213,
+      "grad_norm": 0.7395340800285339,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 14213
+    },
+    {
+      "epoch": 0.14214,
+      "grad_norm": 0.7478938698768616,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 14214
+    },
+    {
+      "epoch": 0.14215,
+      "grad_norm": 0.6703423261642456,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 14215
+    },
+    {
+      "epoch": 0.14216,
+      "grad_norm": 0.7728683352470398,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 14216
+    },
+    {
+      "epoch": 0.14217,
+      "grad_norm": 0.8863434791564941,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 14217
+    },
+    {
+      "epoch": 0.14218,
+      "grad_norm": 0.9171063899993896,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 14218
+    },
+    {
+      "epoch": 0.14219,
+      "grad_norm": 0.8939700126647949,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 14219
+    },
+    {
+      "epoch": 0.1422,
+      "grad_norm": 0.9751589298248291,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 14220
+    },
+    {
+      "epoch": 0.14221,
+      "grad_norm": 1.1756740808486938,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 14221
+    },
+    {
+      "epoch": 0.14222,
+      "grad_norm": 0.7842356562614441,
+      "learning_rate": 0.003,
+      "loss": 4.0459,
+      "step": 14222
+    },
+    {
+      "epoch": 0.14223,
+      "grad_norm": 0.8838635087013245,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 14223
+    },
+    {
+      "epoch": 0.14224,
+      "grad_norm": 1.165834665298462,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 14224
+    },
+    {
+      "epoch": 0.14225,
+      "grad_norm": 1.0083396434783936,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 14225
+    },
+    {
+      "epoch": 0.14226,
+      "grad_norm": 1.2527788877487183,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 14226
+    },
+    {
+      "epoch": 0.14227,
+      "grad_norm": 0.6788728833198547,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 14227
+    },
+    {
+      "epoch": 0.14228,
+      "grad_norm": 0.6345340013504028,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 14228
+    },
+    {
+      "epoch": 0.14229,
+      "grad_norm": 0.7083902955055237,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 14229
+    },
+    {
+      "epoch": 0.1423,
+      "grad_norm": 0.6958598494529724,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 14230
+    },
+    {
+      "epoch": 0.14231,
+      "grad_norm": 0.6795510649681091,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 14231
+    },
+    {
+      "epoch": 0.14232,
+      "grad_norm": 0.5360814332962036,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 14232
+    },
+    {
+      "epoch": 0.14233,
+      "grad_norm": 0.5661422610282898,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 14233
+    },
+    {
+      "epoch": 0.14234,
+      "grad_norm": 0.5672509670257568,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 14234
+    },
+    {
+      "epoch": 0.14235,
+      "grad_norm": 0.6105342507362366,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 14235
+    },
+    {
+      "epoch": 0.14236,
+      "grad_norm": 0.6176836490631104,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 14236
+    },
+    {
+      "epoch": 0.14237,
+      "grad_norm": 0.6107972860336304,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 14237
+    },
+    {
+      "epoch": 0.14238,
+      "grad_norm": 0.7120127081871033,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 14238
+    },
+    {
+      "epoch": 0.14239,
+      "grad_norm": 0.9503802061080933,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 14239
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 1.435011863708496,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 14240
+    },
+    {
+      "epoch": 0.14241,
+      "grad_norm": 0.6943506598472595,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 14241
+    },
+    {
+      "epoch": 0.14242,
+      "grad_norm": 0.7207216620445251,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 14242
+    },
+    {
+      "epoch": 0.14243,
+      "grad_norm": 0.8685964941978455,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 14243
+    },
+    {
+      "epoch": 0.14244,
+      "grad_norm": 0.8263289332389832,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 14244
+    },
+    {
+      "epoch": 0.14245,
+      "grad_norm": 0.8320058584213257,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 14245
+    },
+    {
+      "epoch": 0.14246,
+      "grad_norm": 0.8826883435249329,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 14246
+    },
+    {
+      "epoch": 0.14247,
+      "grad_norm": 0.9846150279045105,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 14247
+    },
+    {
+      "epoch": 0.14248,
+      "grad_norm": 0.9439970850944519,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 14248
+    },
+    {
+      "epoch": 0.14249,
+      "grad_norm": 0.7696685194969177,
+      "learning_rate": 0.003,
+      "loss": 3.9711,
+      "step": 14249
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.631126344203949,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 14250
+    },
+    {
+      "epoch": 0.14251,
+      "grad_norm": 0.7179323434829712,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 14251
+    },
+    {
+      "epoch": 0.14252,
+      "grad_norm": 0.8308407068252563,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 14252
+    },
+    {
+      "epoch": 0.14253,
+      "grad_norm": 0.8950212597846985,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 14253
+    },
+    {
+      "epoch": 0.14254,
+      "grad_norm": 1.186442494392395,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 14254
+    },
+    {
+      "epoch": 0.14255,
+      "grad_norm": 1.0353965759277344,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 14255
+    },
+    {
+      "epoch": 0.14256,
+      "grad_norm": 1.0124056339263916,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 14256
+    },
+    {
+      "epoch": 0.14257,
+      "grad_norm": 0.9364923238754272,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 14257
+    },
+    {
+      "epoch": 0.14258,
+      "grad_norm": 0.7490666508674622,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 14258
+    },
+    {
+      "epoch": 0.14259,
+      "grad_norm": 0.6310158967971802,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 14259
+    },
+    {
+      "epoch": 0.1426,
+      "grad_norm": 0.6345019936561584,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 14260
+    },
+    {
+      "epoch": 0.14261,
+      "grad_norm": 0.6502352952957153,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 14261
+    },
+    {
+      "epoch": 0.14262,
+      "grad_norm": 0.7433469295501709,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 14262
+    },
+    {
+      "epoch": 0.14263,
+      "grad_norm": 0.9879776835441589,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 14263
+    },
+    {
+      "epoch": 0.14264,
+      "grad_norm": 1.222913384437561,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 14264
+    },
+    {
+      "epoch": 0.14265,
+      "grad_norm": 0.7845995426177979,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 14265
+    },
+    {
+      "epoch": 0.14266,
+      "grad_norm": 0.7524864077568054,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 14266
+    },
+    {
+      "epoch": 0.14267,
+      "grad_norm": 0.757966160774231,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 14267
+    },
+    {
+      "epoch": 0.14268,
+      "grad_norm": 0.7970731258392334,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 14268
+    },
+    {
+      "epoch": 0.14269,
+      "grad_norm": 0.7954204082489014,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 14269
+    },
+    {
+      "epoch": 0.1427,
+      "grad_norm": 0.864578366279602,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 14270
+    },
+    {
+      "epoch": 0.14271,
+      "grad_norm": 0.9321130514144897,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 14271
+    },
+    {
+      "epoch": 0.14272,
+      "grad_norm": 0.9318530559539795,
+      "learning_rate": 0.003,
+      "loss": 4.0376,
+      "step": 14272
+    },
+    {
+      "epoch": 0.14273,
+      "grad_norm": 0.9119703769683838,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 14273
+    },
+    {
+      "epoch": 0.14274,
+      "grad_norm": 0.9916678071022034,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 14274
+    },
+    {
+      "epoch": 0.14275,
+      "grad_norm": 0.8041375875473022,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 14275
+    },
+    {
+      "epoch": 0.14276,
+      "grad_norm": 0.6699801683425903,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 14276
+    },
+    {
+      "epoch": 0.14277,
+      "grad_norm": 0.7691702842712402,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 14277
+    },
+    {
+      "epoch": 0.14278,
+      "grad_norm": 0.9321637749671936,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 14278
+    },
+    {
+      "epoch": 0.14279,
+      "grad_norm": 1.0950120687484741,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 14279
+    },
+    {
+      "epoch": 0.1428,
+      "grad_norm": 0.8870145678520203,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 14280
+    },
+    {
+      "epoch": 0.14281,
+      "grad_norm": 0.7608538866043091,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 14281
+    },
+    {
+      "epoch": 0.14282,
+      "grad_norm": 0.7831116914749146,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 14282
+    },
+    {
+      "epoch": 0.14283,
+      "grad_norm": 0.7721794247627258,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 14283
+    },
+    {
+      "epoch": 0.14284,
+      "grad_norm": 0.7339931726455688,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 14284
+    },
+    {
+      "epoch": 0.14285,
+      "grad_norm": 0.7375816702842712,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 14285
+    },
+    {
+      "epoch": 0.14286,
+      "grad_norm": 0.7514878511428833,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 14286
+    },
+    {
+      "epoch": 0.14287,
+      "grad_norm": 0.7763694524765015,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 14287
+    },
+    {
+      "epoch": 0.14288,
+      "grad_norm": 0.7558686137199402,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 14288
+    },
+    {
+      "epoch": 0.14289,
+      "grad_norm": 0.6009389162063599,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 14289
+    },
+    {
+      "epoch": 0.1429,
+      "grad_norm": 0.6581655144691467,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 14290
+    },
+    {
+      "epoch": 0.14291,
+      "grad_norm": 0.9600313901901245,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 14291
+    },
+    {
+      "epoch": 0.14292,
+      "grad_norm": 1.3686339855194092,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 14292
+    },
+    {
+      "epoch": 0.14293,
+      "grad_norm": 0.6834939122200012,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 14293
+    },
+    {
+      "epoch": 0.14294,
+      "grad_norm": 0.7326520681381226,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 14294
+    },
+    {
+      "epoch": 0.14295,
+      "grad_norm": 0.863531231880188,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 14295
+    },
+    {
+      "epoch": 0.14296,
+      "grad_norm": 0.99330735206604,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 14296
+    },
+    {
+      "epoch": 0.14297,
+      "grad_norm": 1.054336667060852,
+      "learning_rate": 0.003,
+      "loss": 4.0441,
+      "step": 14297
+    },
+    {
+      "epoch": 0.14298,
+      "grad_norm": 1.0437896251678467,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 14298
+    },
+    {
+      "epoch": 0.14299,
+      "grad_norm": 0.9987184405326843,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 14299
+    },
+    {
+      "epoch": 0.143,
+      "grad_norm": 1.0736325979232788,
+      "learning_rate": 0.003,
+      "loss": 4.0535,
+      "step": 14300
+    },
+    {
+      "epoch": 0.14301,
+      "grad_norm": 1.1124069690704346,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 14301
+    },
+    {
+      "epoch": 0.14302,
+      "grad_norm": 1.0295443534851074,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 14302
+    },
+    {
+      "epoch": 0.14303,
+      "grad_norm": 1.1448150873184204,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 14303
+    },
+    {
+      "epoch": 0.14304,
+      "grad_norm": 0.9082779884338379,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 14304
+    },
+    {
+      "epoch": 0.14305,
+      "grad_norm": 1.0699198246002197,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 14305
+    },
+    {
+      "epoch": 0.14306,
+      "grad_norm": 0.8107674717903137,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 14306
+    },
+    {
+      "epoch": 0.14307,
+      "grad_norm": 0.7893140316009521,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 14307
+    },
+    {
+      "epoch": 0.14308,
+      "grad_norm": 0.9483770132064819,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 14308
+    },
+    {
+      "epoch": 0.14309,
+      "grad_norm": 1.1574307680130005,
+      "learning_rate": 0.003,
+      "loss": 4.0552,
+      "step": 14309
+    },
+    {
+      "epoch": 0.1431,
+      "grad_norm": 0.8233171105384827,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 14310
+    },
+    {
+      "epoch": 0.14311,
+      "grad_norm": 0.7954649925231934,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 14311
+    },
+    {
+      "epoch": 0.14312,
+      "grad_norm": 0.7902868390083313,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 14312
+    },
+    {
+      "epoch": 0.14313,
+      "grad_norm": 0.8275642395019531,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 14313
+    },
+    {
+      "epoch": 0.14314,
+      "grad_norm": 0.9443146586418152,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 14314
+    },
+    {
+      "epoch": 0.14315,
+      "grad_norm": 0.9062085747718811,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 14315
+    },
+    {
+      "epoch": 0.14316,
+      "grad_norm": 0.8800166249275208,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 14316
+    },
+    {
+      "epoch": 0.14317,
+      "grad_norm": 0.7463732361793518,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 14317
+    },
+    {
+      "epoch": 0.14318,
+      "grad_norm": 0.6335029602050781,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 14318
+    },
+    {
+      "epoch": 0.14319,
+      "grad_norm": 0.6070913672447205,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 14319
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.6634745597839355,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 14320
+    },
+    {
+      "epoch": 0.14321,
+      "grad_norm": 0.5926946997642517,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 14321
+    },
+    {
+      "epoch": 0.14322,
+      "grad_norm": 0.5566388964653015,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 14322
+    },
+    {
+      "epoch": 0.14323,
+      "grad_norm": 0.5370780825614929,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 14323
+    },
+    {
+      "epoch": 0.14324,
+      "grad_norm": 0.5878376364707947,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 14324
+    },
+    {
+      "epoch": 0.14325,
+      "grad_norm": 0.6978987455368042,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 14325
+    },
+    {
+      "epoch": 0.14326,
+      "grad_norm": 0.8885076642036438,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 14326
+    },
+    {
+      "epoch": 0.14327,
+      "grad_norm": 1.2482155561447144,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 14327
+    },
+    {
+      "epoch": 0.14328,
+      "grad_norm": 0.8148013949394226,
+      "learning_rate": 0.003,
+      "loss": 4.0474,
+      "step": 14328
+    },
+    {
+      "epoch": 0.14329,
+      "grad_norm": 0.5906567573547363,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 14329
+    },
+    {
+      "epoch": 0.1433,
+      "grad_norm": 0.5495211482048035,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 14330
+    },
+    {
+      "epoch": 0.14331,
+      "grad_norm": 0.5542047023773193,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 14331
+    },
+    {
+      "epoch": 0.14332,
+      "grad_norm": 0.6755203008651733,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 14332
+    },
+    {
+      "epoch": 0.14333,
+      "grad_norm": 0.8951558470726013,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 14333
+    },
+    {
+      "epoch": 0.14334,
+      "grad_norm": 0.9805142283439636,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 14334
+    },
+    {
+      "epoch": 0.14335,
+      "grad_norm": 0.8546414375305176,
+      "learning_rate": 0.003,
+      "loss": 3.9904,
+      "step": 14335
+    },
+    {
+      "epoch": 0.14336,
+      "grad_norm": 0.6646180748939514,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 14336
+    },
+    {
+      "epoch": 0.14337,
+      "grad_norm": 0.6930346488952637,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 14337
+    },
+    {
+      "epoch": 0.14338,
+      "grad_norm": 0.8854844570159912,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 14338
+    },
+    {
+      "epoch": 0.14339,
+      "grad_norm": 1.0480695962905884,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 14339
+    },
+    {
+      "epoch": 0.1434,
+      "grad_norm": 0.8314961791038513,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 14340
+    },
+    {
+      "epoch": 0.14341,
+      "grad_norm": 0.6699071526527405,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 14341
+    },
+    {
+      "epoch": 0.14342,
+      "grad_norm": 0.6851991415023804,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 14342
+    },
+    {
+      "epoch": 0.14343,
+      "grad_norm": 0.8025005459785461,
+      "learning_rate": 0.003,
+      "loss": 4.048,
+      "step": 14343
+    },
+    {
+      "epoch": 0.14344,
+      "grad_norm": 0.7941917181015015,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 14344
+    },
+    {
+      "epoch": 0.14345,
+      "grad_norm": 0.8650415539741516,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 14345
+    },
+    {
+      "epoch": 0.14346,
+      "grad_norm": 1.114258050918579,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 14346
+    },
+    {
+      "epoch": 0.14347,
+      "grad_norm": 0.9780334234237671,
+      "learning_rate": 0.003,
+      "loss": 4.0378,
+      "step": 14347
+    },
+    {
+      "epoch": 0.14348,
+      "grad_norm": 0.8963751196861267,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 14348
+    },
+    {
+      "epoch": 0.14349,
+      "grad_norm": 1.0181679725646973,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 14349
+    },
+    {
+      "epoch": 0.1435,
+      "grad_norm": 0.9988883137702942,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14350
+    },
+    {
+      "epoch": 0.14351,
+      "grad_norm": 0.9089468717575073,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 14351
+    },
+    {
+      "epoch": 0.14352,
+      "grad_norm": 0.77008455991745,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 14352
+    },
+    {
+      "epoch": 0.14353,
+      "grad_norm": 0.8111112117767334,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 14353
+    },
+    {
+      "epoch": 0.14354,
+      "grad_norm": 0.9026668667793274,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 14354
+    },
+    {
+      "epoch": 0.14355,
+      "grad_norm": 1.0534589290618896,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 14355
+    },
+    {
+      "epoch": 0.14356,
+      "grad_norm": 1.1436132192611694,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 14356
+    },
+    {
+      "epoch": 0.14357,
+      "grad_norm": 0.9831972718238831,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 14357
+    },
+    {
+      "epoch": 0.14358,
+      "grad_norm": 1.1651567220687866,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 14358
+    },
+    {
+      "epoch": 0.14359,
+      "grad_norm": 0.7918037176132202,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 14359
+    },
+    {
+      "epoch": 0.1436,
+      "grad_norm": 0.6700632572174072,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 14360
+    },
+    {
+      "epoch": 0.14361,
+      "grad_norm": 0.5784865021705627,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 14361
+    },
+    {
+      "epoch": 0.14362,
+      "grad_norm": 0.5571222901344299,
+      "learning_rate": 0.003,
+      "loss": 4.051,
+      "step": 14362
+    },
+    {
+      "epoch": 0.14363,
+      "grad_norm": 0.5615730285644531,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 14363
+    },
+    {
+      "epoch": 0.14364,
+      "grad_norm": 0.5923025012016296,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 14364
+    },
+    {
+      "epoch": 0.14365,
+      "grad_norm": 0.6931462287902832,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 14365
+    },
+    {
+      "epoch": 0.14366,
+      "grad_norm": 0.718263566493988,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 14366
+    },
+    {
+      "epoch": 0.14367,
+      "grad_norm": 0.8923155665397644,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 14367
+    },
+    {
+      "epoch": 0.14368,
+      "grad_norm": 1.072655439376831,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 14368
+    },
+    {
+      "epoch": 0.14369,
+      "grad_norm": 0.9154521822929382,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 14369
+    },
+    {
+      "epoch": 0.1437,
+      "grad_norm": 0.7606717348098755,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 14370
+    },
+    {
+      "epoch": 0.14371,
+      "grad_norm": 0.6659698486328125,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 14371
+    },
+    {
+      "epoch": 0.14372,
+      "grad_norm": 0.770995557308197,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 14372
+    },
+    {
+      "epoch": 0.14373,
+      "grad_norm": 0.865330696105957,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 14373
+    },
+    {
+      "epoch": 0.14374,
+      "grad_norm": 0.9684982895851135,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 14374
+    },
+    {
+      "epoch": 0.14375,
+      "grad_norm": 0.9319028258323669,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 14375
+    },
+    {
+      "epoch": 0.14376,
+      "grad_norm": 0.7594645619392395,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 14376
+    },
+    {
+      "epoch": 0.14377,
+      "grad_norm": 0.7884442210197449,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 14377
+    },
+    {
+      "epoch": 0.14378,
+      "grad_norm": 0.8902500867843628,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 14378
+    },
+    {
+      "epoch": 0.14379,
+      "grad_norm": 1.033056378364563,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 14379
+    },
+    {
+      "epoch": 0.1438,
+      "grad_norm": 0.8748496174812317,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 14380
+    },
+    {
+      "epoch": 0.14381,
+      "grad_norm": 0.9056290984153748,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 14381
+    },
+    {
+      "epoch": 0.14382,
+      "grad_norm": 1.0608599185943604,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 14382
+    },
+    {
+      "epoch": 0.14383,
+      "grad_norm": 1.0192004442214966,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 14383
+    },
+    {
+      "epoch": 0.14384,
+      "grad_norm": 0.8027071952819824,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 14384
+    },
+    {
+      "epoch": 0.14385,
+      "grad_norm": 0.7244923710823059,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 14385
+    },
+    {
+      "epoch": 0.14386,
+      "grad_norm": 0.7831423878669739,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 14386
+    },
+    {
+      "epoch": 0.14387,
+      "grad_norm": 0.7867714762687683,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 14387
+    },
+    {
+      "epoch": 0.14388,
+      "grad_norm": 0.917858362197876,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 14388
+    },
+    {
+      "epoch": 0.14389,
+      "grad_norm": 1.0930110216140747,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 14389
+    },
+    {
+      "epoch": 0.1439,
+      "grad_norm": 0.9966136813163757,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14390
+    },
+    {
+      "epoch": 0.14391,
+      "grad_norm": 0.8886557817459106,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 14391
+    },
+    {
+      "epoch": 0.14392,
+      "grad_norm": 0.7261216044425964,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 14392
+    },
+    {
+      "epoch": 0.14393,
+      "grad_norm": 0.6868066787719727,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 14393
+    },
+    {
+      "epoch": 0.14394,
+      "grad_norm": 0.6649624705314636,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 14394
+    },
+    {
+      "epoch": 0.14395,
+      "grad_norm": 0.7228597402572632,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 14395
+    },
+    {
+      "epoch": 0.14396,
+      "grad_norm": 0.7414789795875549,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 14396
+    },
+    {
+      "epoch": 0.14397,
+      "grad_norm": 0.7033632397651672,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 14397
+    },
+    {
+      "epoch": 0.14398,
+      "grad_norm": 0.6831393241882324,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 14398
+    },
+    {
+      "epoch": 0.14399,
+      "grad_norm": 0.6953455209732056,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 14399
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.6803028583526611,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 14400
+    },
+    {
+      "epoch": 0.14401,
+      "grad_norm": 0.7951777577400208,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 14401
+    },
+    {
+      "epoch": 0.14402,
+      "grad_norm": 0.8658850789070129,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 14402
+    },
+    {
+      "epoch": 0.14403,
+      "grad_norm": 0.9823598861694336,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 14403
+    },
+    {
+      "epoch": 0.14404,
+      "grad_norm": 1.1010268926620483,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 14404
+    },
+    {
+      "epoch": 0.14405,
+      "grad_norm": 0.9394732117652893,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 14405
+    },
+    {
+      "epoch": 0.14406,
+      "grad_norm": 0.840592622756958,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 14406
+    },
+    {
+      "epoch": 0.14407,
+      "grad_norm": 0.7738531231880188,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 14407
+    },
+    {
+      "epoch": 0.14408,
+      "grad_norm": 0.7063372731208801,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 14408
+    },
+    {
+      "epoch": 0.14409,
+      "grad_norm": 0.6962958574295044,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 14409
+    },
+    {
+      "epoch": 0.1441,
+      "grad_norm": 0.6501365303993225,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 14410
+    },
+    {
+      "epoch": 0.14411,
+      "grad_norm": 0.5795419216156006,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 14411
+    },
+    {
+      "epoch": 0.14412,
+      "grad_norm": 0.6202126145362854,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 14412
+    },
+    {
+      "epoch": 0.14413,
+      "grad_norm": 0.8158420920372009,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 14413
+    },
+    {
+      "epoch": 0.14414,
+      "grad_norm": 0.8209929466247559,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 14414
+    },
+    {
+      "epoch": 0.14415,
+      "grad_norm": 0.8057379126548767,
+      "learning_rate": 0.003,
+      "loss": 3.9689,
+      "step": 14415
+    },
+    {
+      "epoch": 0.14416,
+      "grad_norm": 0.8939820528030396,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 14416
+    },
+    {
+      "epoch": 0.14417,
+      "grad_norm": 1.0734862089157104,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 14417
+    },
+    {
+      "epoch": 0.14418,
+      "grad_norm": 0.9874979257583618,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 14418
+    },
+    {
+      "epoch": 0.14419,
+      "grad_norm": 1.05538809299469,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 14419
+    },
+    {
+      "epoch": 0.1442,
+      "grad_norm": 0.8185986280441284,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 14420
+    },
+    {
+      "epoch": 0.14421,
+      "grad_norm": 0.6607561111450195,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 14421
+    },
+    {
+      "epoch": 0.14422,
+      "grad_norm": 0.5887096524238586,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 14422
+    },
+    {
+      "epoch": 0.14423,
+      "grad_norm": 0.5062503218650818,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 14423
+    },
+    {
+      "epoch": 0.14424,
+      "grad_norm": 0.5057479739189148,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 14424
+    },
+    {
+      "epoch": 0.14425,
+      "grad_norm": 0.5376363396644592,
+      "learning_rate": 0.003,
+      "loss": 3.9781,
+      "step": 14425
+    },
+    {
+      "epoch": 0.14426,
+      "grad_norm": 0.647079586982727,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 14426
+    },
+    {
+      "epoch": 0.14427,
+      "grad_norm": 0.7828883528709412,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 14427
+    },
+    {
+      "epoch": 0.14428,
+      "grad_norm": 0.7896987795829773,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 14428
+    },
+    {
+      "epoch": 0.14429,
+      "grad_norm": 0.7318724989891052,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 14429
+    },
+    {
+      "epoch": 0.1443,
+      "grad_norm": 0.7879105806350708,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 14430
+    },
+    {
+      "epoch": 0.14431,
+      "grad_norm": 0.8891569972038269,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 14431
+    },
+    {
+      "epoch": 0.14432,
+      "grad_norm": 1.0650385618209839,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 14432
+    },
+    {
+      "epoch": 0.14433,
+      "grad_norm": 0.9432113170623779,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 14433
+    },
+    {
+      "epoch": 0.14434,
+      "grad_norm": 0.8969363570213318,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 14434
+    },
+    {
+      "epoch": 0.14435,
+      "grad_norm": 0.855061948299408,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 14435
+    },
+    {
+      "epoch": 0.14436,
+      "grad_norm": 0.7581377029418945,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 14436
+    },
+    {
+      "epoch": 0.14437,
+      "grad_norm": 0.7667735815048218,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 14437
+    },
+    {
+      "epoch": 0.14438,
+      "grad_norm": 0.8322454690933228,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 14438
+    },
+    {
+      "epoch": 0.14439,
+      "grad_norm": 0.914036214351654,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 14439
+    },
+    {
+      "epoch": 0.1444,
+      "grad_norm": 1.0237014293670654,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 14440
+    },
+    {
+      "epoch": 0.14441,
+      "grad_norm": 1.2672991752624512,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 14441
+    },
+    {
+      "epoch": 0.14442,
+      "grad_norm": 0.8671845197677612,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 14442
+    },
+    {
+      "epoch": 0.14443,
+      "grad_norm": 0.6996704339981079,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 14443
+    },
+    {
+      "epoch": 0.14444,
+      "grad_norm": 0.6526155471801758,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 14444
+    },
+    {
+      "epoch": 0.14445,
+      "grad_norm": 0.6436783075332642,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 14445
+    },
+    {
+      "epoch": 0.14446,
+      "grad_norm": 0.600752592086792,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 14446
+    },
+    {
+      "epoch": 0.14447,
+      "grad_norm": 0.6957166194915771,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 14447
+    },
+    {
+      "epoch": 0.14448,
+      "grad_norm": 0.9329416155815125,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 14448
+    },
+    {
+      "epoch": 0.14449,
+      "grad_norm": 1.3451831340789795,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 14449
+    },
+    {
+      "epoch": 0.1445,
+      "grad_norm": 0.9525917172431946,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 14450
+    },
+    {
+      "epoch": 0.14451,
+      "grad_norm": 0.9183449745178223,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 14451
+    },
+    {
+      "epoch": 0.14452,
+      "grad_norm": 0.8432508707046509,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 14452
+    },
+    {
+      "epoch": 0.14453,
+      "grad_norm": 0.7553740739822388,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 14453
+    },
+    {
+      "epoch": 0.14454,
+      "grad_norm": 0.7473167777061462,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 14454
+    },
+    {
+      "epoch": 0.14455,
+      "grad_norm": 0.7682895660400391,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 14455
+    },
+    {
+      "epoch": 0.14456,
+      "grad_norm": 0.7811801433563232,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 14456
+    },
+    {
+      "epoch": 0.14457,
+      "grad_norm": 0.6911787986755371,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 14457
+    },
+    {
+      "epoch": 0.14458,
+      "grad_norm": 0.6579571962356567,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 14458
+    },
+    {
+      "epoch": 0.14459,
+      "grad_norm": 0.7433202266693115,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 14459
+    },
+    {
+      "epoch": 0.1446,
+      "grad_norm": 0.8381054401397705,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 14460
+    },
+    {
+      "epoch": 0.14461,
+      "grad_norm": 1.0219905376434326,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 14461
+    },
+    {
+      "epoch": 0.14462,
+      "grad_norm": 1.1736960411071777,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 14462
+    },
+    {
+      "epoch": 0.14463,
+      "grad_norm": 0.8547998666763306,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 14463
+    },
+    {
+      "epoch": 0.14464,
+      "grad_norm": 0.7964674234390259,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 14464
+    },
+    {
+      "epoch": 0.14465,
+      "grad_norm": 0.7108013033866882,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 14465
+    },
+    {
+      "epoch": 0.14466,
+      "grad_norm": 0.7695088982582092,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 14466
+    },
+    {
+      "epoch": 0.14467,
+      "grad_norm": 0.7569155693054199,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 14467
+    },
+    {
+      "epoch": 0.14468,
+      "grad_norm": 0.7204357981681824,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 14468
+    },
+    {
+      "epoch": 0.14469,
+      "grad_norm": 0.7813732624053955,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 14469
+    },
+    {
+      "epoch": 0.1447,
+      "grad_norm": 0.8003991842269897,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 14470
+    },
+    {
+      "epoch": 0.14471,
+      "grad_norm": 0.8164728283882141,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 14471
+    },
+    {
+      "epoch": 0.14472,
+      "grad_norm": 0.696689248085022,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 14472
+    },
+    {
+      "epoch": 0.14473,
+      "grad_norm": 0.7590498328208923,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 14473
+    },
+    {
+      "epoch": 0.14474,
+      "grad_norm": 0.7519974708557129,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 14474
+    },
+    {
+      "epoch": 0.14475,
+      "grad_norm": 0.7030192613601685,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 14475
+    },
+    {
+      "epoch": 0.14476,
+      "grad_norm": 0.7600857019424438,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 14476
+    },
+    {
+      "epoch": 0.14477,
+      "grad_norm": 0.7709656953811646,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 14477
+    },
+    {
+      "epoch": 0.14478,
+      "grad_norm": 0.8543557524681091,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 14478
+    },
+    {
+      "epoch": 0.14479,
+      "grad_norm": 1.0360087156295776,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 14479
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 1.1634830236434937,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 14480
+    },
+    {
+      "epoch": 0.14481,
+      "grad_norm": 0.8620505332946777,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 14481
+    },
+    {
+      "epoch": 0.14482,
+      "grad_norm": 0.9569733738899231,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 14482
+    },
+    {
+      "epoch": 0.14483,
+      "grad_norm": 1.3167060613632202,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 14483
+    },
+    {
+      "epoch": 0.14484,
+      "grad_norm": 0.745167076587677,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 14484
+    },
+    {
+      "epoch": 0.14485,
+      "grad_norm": 0.7294960021972656,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 14485
+    },
+    {
+      "epoch": 0.14486,
+      "grad_norm": 0.7164823412895203,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 14486
+    },
+    {
+      "epoch": 0.14487,
+      "grad_norm": 0.752485990524292,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 14487
+    },
+    {
+      "epoch": 0.14488,
+      "grad_norm": 0.6875482201576233,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 14488
+    },
+    {
+      "epoch": 0.14489,
+      "grad_norm": 0.5566693544387817,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 14489
+    },
+    {
+      "epoch": 0.1449,
+      "grad_norm": 0.579632043838501,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 14490
+    },
+    {
+      "epoch": 0.14491,
+      "grad_norm": 0.6094452738761902,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 14491
+    },
+    {
+      "epoch": 0.14492,
+      "grad_norm": 0.6271584630012512,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 14492
+    },
+    {
+      "epoch": 0.14493,
+      "grad_norm": 0.5886869430541992,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 14493
+    },
+    {
+      "epoch": 0.14494,
+      "grad_norm": 0.7035474181175232,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 14494
+    },
+    {
+      "epoch": 0.14495,
+      "grad_norm": 0.880418062210083,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 14495
+    },
+    {
+      "epoch": 0.14496,
+      "grad_norm": 0.9390040040016174,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 14496
+    },
+    {
+      "epoch": 0.14497,
+      "grad_norm": 1.0518993139266968,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 14497
+    },
+    {
+      "epoch": 0.14498,
+      "grad_norm": 1.1795552968978882,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 14498
+    },
+    {
+      "epoch": 0.14499,
+      "grad_norm": 0.9480694532394409,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 14499
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.9626632928848267,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 14500
+    },
+    {
+      "epoch": 0.14501,
+      "grad_norm": 0.9182538390159607,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 14501
+    },
+    {
+      "epoch": 0.14502,
+      "grad_norm": 0.7662861943244934,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 14502
+    },
+    {
+      "epoch": 0.14503,
+      "grad_norm": 0.6444974541664124,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 14503
+    },
+    {
+      "epoch": 0.14504,
+      "grad_norm": 0.6466870903968811,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 14504
+    },
+    {
+      "epoch": 0.14505,
+      "grad_norm": 0.6860358715057373,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 14505
+    },
+    {
+      "epoch": 0.14506,
+      "grad_norm": 0.777827262878418,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 14506
+    },
+    {
+      "epoch": 0.14507,
+      "grad_norm": 0.8074339032173157,
+      "learning_rate": 0.003,
+      "loss": 3.9816,
+      "step": 14507
+    },
+    {
+      "epoch": 0.14508,
+      "grad_norm": 0.7027100920677185,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 14508
+    },
+    {
+      "epoch": 0.14509,
+      "grad_norm": 0.690327525138855,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 14509
+    },
+    {
+      "epoch": 0.1451,
+      "grad_norm": 0.8751059770584106,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 14510
+    },
+    {
+      "epoch": 0.14511,
+      "grad_norm": 1.0037294626235962,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 14511
+    },
+    {
+      "epoch": 0.14512,
+      "grad_norm": 1.1385389566421509,
+      "learning_rate": 0.003,
+      "loss": 4.0537,
+      "step": 14512
+    },
+    {
+      "epoch": 0.14513,
+      "grad_norm": 0.8328064680099487,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 14513
+    },
+    {
+      "epoch": 0.14514,
+      "grad_norm": 0.750587522983551,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 14514
+    },
+    {
+      "epoch": 0.14515,
+      "grad_norm": 0.780562162399292,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 14515
+    },
+    {
+      "epoch": 0.14516,
+      "grad_norm": 0.8911523222923279,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 14516
+    },
+    {
+      "epoch": 0.14517,
+      "grad_norm": 0.8570557832717896,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 14517
+    },
+    {
+      "epoch": 0.14518,
+      "grad_norm": 0.8679510354995728,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 14518
+    },
+    {
+      "epoch": 0.14519,
+      "grad_norm": 0.9858636856079102,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 14519
+    },
+    {
+      "epoch": 0.1452,
+      "grad_norm": 1.074411392211914,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 14520
+    },
+    {
+      "epoch": 0.14521,
+      "grad_norm": 0.8335927128791809,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 14521
+    },
+    {
+      "epoch": 0.14522,
+      "grad_norm": 0.780289888381958,
+      "learning_rate": 0.003,
+      "loss": 3.9785,
+      "step": 14522
+    },
+    {
+      "epoch": 0.14523,
+      "grad_norm": 0.843913197517395,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 14523
+    },
+    {
+      "epoch": 0.14524,
+      "grad_norm": 0.8714582920074463,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 14524
+    },
+    {
+      "epoch": 0.14525,
+      "grad_norm": 0.8275744318962097,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 14525
+    },
+    {
+      "epoch": 0.14526,
+      "grad_norm": 0.9582111239433289,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 14526
+    },
+    {
+      "epoch": 0.14527,
+      "grad_norm": 0.9758346676826477,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 14527
+    },
+    {
+      "epoch": 0.14528,
+      "grad_norm": 1.0356850624084473,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 14528
+    },
+    {
+      "epoch": 0.14529,
+      "grad_norm": 0.9981045722961426,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 14529
+    },
+    {
+      "epoch": 0.1453,
+      "grad_norm": 0.9965315461158752,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 14530
+    },
+    {
+      "epoch": 0.14531,
+      "grad_norm": 0.9737066626548767,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 14531
+    },
+    {
+      "epoch": 0.14532,
+      "grad_norm": 0.8926593065261841,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 14532
+    },
+    {
+      "epoch": 0.14533,
+      "grad_norm": 0.7353343367576599,
+      "learning_rate": 0.003,
+      "loss": 4.0428,
+      "step": 14533
+    },
+    {
+      "epoch": 0.14534,
+      "grad_norm": 0.817808985710144,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 14534
+    },
+    {
+      "epoch": 0.14535,
+      "grad_norm": 0.8736345767974854,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 14535
+    },
+    {
+      "epoch": 0.14536,
+      "grad_norm": 0.9836347103118896,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 14536
+    },
+    {
+      "epoch": 0.14537,
+      "grad_norm": 1.0643627643585205,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 14537
+    },
+    {
+      "epoch": 0.14538,
+      "grad_norm": 1.0713415145874023,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 14538
+    },
+    {
+      "epoch": 0.14539,
+      "grad_norm": 0.8724493980407715,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 14539
+    },
+    {
+      "epoch": 0.1454,
+      "grad_norm": 0.7486632466316223,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 14540
+    },
+    {
+      "epoch": 0.14541,
+      "grad_norm": 0.7692328095436096,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14541
+    },
+    {
+      "epoch": 0.14542,
+      "grad_norm": 0.7887241840362549,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 14542
+    },
+    {
+      "epoch": 0.14543,
+      "grad_norm": 0.7101812362670898,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 14543
+    },
+    {
+      "epoch": 0.14544,
+      "grad_norm": 0.6259065270423889,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 14544
+    },
+    {
+      "epoch": 0.14545,
+      "grad_norm": 0.6179724931716919,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 14545
+    },
+    {
+      "epoch": 0.14546,
+      "grad_norm": 0.6964648962020874,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 14546
+    },
+    {
+      "epoch": 0.14547,
+      "grad_norm": 0.736311674118042,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 14547
+    },
+    {
+      "epoch": 0.14548,
+      "grad_norm": 0.7651875615119934,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 14548
+    },
+    {
+      "epoch": 0.14549,
+      "grad_norm": 0.6706742644309998,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 14549
+    },
+    {
+      "epoch": 0.1455,
+      "grad_norm": 0.6436061859130859,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 14550
+    },
+    {
+      "epoch": 0.14551,
+      "grad_norm": 0.8034910559654236,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 14551
+    },
+    {
+      "epoch": 0.14552,
+      "grad_norm": 1.0899924039840698,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 14552
+    },
+    {
+      "epoch": 0.14553,
+      "grad_norm": 1.0492063760757446,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 14553
+    },
+    {
+      "epoch": 0.14554,
+      "grad_norm": 0.9474644064903259,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 14554
+    },
+    {
+      "epoch": 0.14555,
+      "grad_norm": 0.8426516652107239,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 14555
+    },
+    {
+      "epoch": 0.14556,
+      "grad_norm": 0.7306226491928101,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 14556
+    },
+    {
+      "epoch": 0.14557,
+      "grad_norm": 0.8061223030090332,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 14557
+    },
+    {
+      "epoch": 0.14558,
+      "grad_norm": 1.023594617843628,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 14558
+    },
+    {
+      "epoch": 0.14559,
+      "grad_norm": 1.2255849838256836,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 14559
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.7095961570739746,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 14560
+    },
+    {
+      "epoch": 0.14561,
+      "grad_norm": 0.6970982551574707,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 14561
+    },
+    {
+      "epoch": 0.14562,
+      "grad_norm": 0.830156683921814,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 14562
+    },
+    {
+      "epoch": 0.14563,
+      "grad_norm": 0.7684354186058044,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 14563
+    },
+    {
+      "epoch": 0.14564,
+      "grad_norm": 0.8628973364830017,
+      "learning_rate": 0.003,
+      "loss": 3.9784,
+      "step": 14564
+    },
+    {
+      "epoch": 0.14565,
+      "grad_norm": 0.9758632183074951,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 14565
+    },
+    {
+      "epoch": 0.14566,
+      "grad_norm": 0.9080876708030701,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 14566
+    },
+    {
+      "epoch": 0.14567,
+      "grad_norm": 0.8077095746994019,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 14567
+    },
+    {
+      "epoch": 0.14568,
+      "grad_norm": 0.7247617244720459,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 14568
+    },
+    {
+      "epoch": 0.14569,
+      "grad_norm": 0.6292919516563416,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 14569
+    },
+    {
+      "epoch": 0.1457,
+      "grad_norm": 0.7047914266586304,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 14570
+    },
+    {
+      "epoch": 0.14571,
+      "grad_norm": 0.8658979535102844,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 14571
+    },
+    {
+      "epoch": 0.14572,
+      "grad_norm": 0.9502711296081543,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 14572
+    },
+    {
+      "epoch": 0.14573,
+      "grad_norm": 1.1412993669509888,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 14573
+    },
+    {
+      "epoch": 0.14574,
+      "grad_norm": 0.8256980776786804,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 14574
+    },
+    {
+      "epoch": 0.14575,
+      "grad_norm": 0.745071530342102,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 14575
+    },
+    {
+      "epoch": 0.14576,
+      "grad_norm": 0.6846836805343628,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 14576
+    },
+    {
+      "epoch": 0.14577,
+      "grad_norm": 0.6639923453330994,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 14577
+    },
+    {
+      "epoch": 0.14578,
+      "grad_norm": 0.6641462445259094,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 14578
+    },
+    {
+      "epoch": 0.14579,
+      "grad_norm": 0.6584463715553284,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 14579
+    },
+    {
+      "epoch": 0.1458,
+      "grad_norm": 0.7167730927467346,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 14580
+    },
+    {
+      "epoch": 0.14581,
+      "grad_norm": 0.7999163866043091,
+      "learning_rate": 0.003,
+      "loss": 3.9781,
+      "step": 14581
+    },
+    {
+      "epoch": 0.14582,
+      "grad_norm": 0.7912095785140991,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 14582
+    },
+    {
+      "epoch": 0.14583,
+      "grad_norm": 0.8575613498687744,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 14583
+    },
+    {
+      "epoch": 0.14584,
+      "grad_norm": 1.1373587846755981,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 14584
+    },
+    {
+      "epoch": 0.14585,
+      "grad_norm": 1.114978313446045,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 14585
+    },
+    {
+      "epoch": 0.14586,
+      "grad_norm": 0.8075107336044312,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 14586
+    },
+    {
+      "epoch": 0.14587,
+      "grad_norm": 0.7308522462844849,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 14587
+    },
+    {
+      "epoch": 0.14588,
+      "grad_norm": 0.8998515009880066,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 14588
+    },
+    {
+      "epoch": 0.14589,
+      "grad_norm": 1.0372451543807983,
+      "learning_rate": 0.003,
+      "loss": 3.9769,
+      "step": 14589
+    },
+    {
+      "epoch": 0.1459,
+      "grad_norm": 1.1466875076293945,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 14590
+    },
+    {
+      "epoch": 0.14591,
+      "grad_norm": 0.7109951972961426,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 14591
+    },
+    {
+      "epoch": 0.14592,
+      "grad_norm": 0.612815797328949,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 14592
+    },
+    {
+      "epoch": 0.14593,
+      "grad_norm": 0.6005706787109375,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 14593
+    },
+    {
+      "epoch": 0.14594,
+      "grad_norm": 0.6634933948516846,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 14594
+    },
+    {
+      "epoch": 0.14595,
+      "grad_norm": 0.6079758405685425,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 14595
+    },
+    {
+      "epoch": 0.14596,
+      "grad_norm": 0.6029581427574158,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 14596
+    },
+    {
+      "epoch": 0.14597,
+      "grad_norm": 0.6305979490280151,
+      "learning_rate": 0.003,
+      "loss": 4.0432,
+      "step": 14597
+    },
+    {
+      "epoch": 0.14598,
+      "grad_norm": 0.6574081182479858,
+      "learning_rate": 0.003,
+      "loss": 4.0533,
+      "step": 14598
+    },
+    {
+      "epoch": 0.14599,
+      "grad_norm": 0.6735368371009827,
+      "learning_rate": 0.003,
+      "loss": 3.9792,
+      "step": 14599
+    },
+    {
+      "epoch": 0.146,
+      "grad_norm": 0.651794970035553,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 14600
+    },
+    {
+      "epoch": 0.14601,
+      "grad_norm": 0.6393558382987976,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 14601
+    },
+    {
+      "epoch": 0.14602,
+      "grad_norm": 0.7819718718528748,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 14602
+    },
+    {
+      "epoch": 0.14603,
+      "grad_norm": 0.9885872006416321,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 14603
+    },
+    {
+      "epoch": 0.14604,
+      "grad_norm": 1.2221794128417969,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 14604
+    },
+    {
+      "epoch": 0.14605,
+      "grad_norm": 0.6630940437316895,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 14605
+    },
+    {
+      "epoch": 0.14606,
+      "grad_norm": 0.5590291023254395,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 14606
+    },
+    {
+      "epoch": 0.14607,
+      "grad_norm": 0.6604478359222412,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 14607
+    },
+    {
+      "epoch": 0.14608,
+      "grad_norm": 0.7741831541061401,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 14608
+    },
+    {
+      "epoch": 0.14609,
+      "grad_norm": 0.9608388543128967,
+      "learning_rate": 0.003,
+      "loss": 3.9739,
+      "step": 14609
+    },
+    {
+      "epoch": 0.1461,
+      "grad_norm": 1.0947613716125488,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 14610
+    },
+    {
+      "epoch": 0.14611,
+      "grad_norm": 0.9668039679527283,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 14611
+    },
+    {
+      "epoch": 0.14612,
+      "grad_norm": 1.0010789632797241,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 14612
+    },
+    {
+      "epoch": 0.14613,
+      "grad_norm": 1.0474330186843872,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 14613
+    },
+    {
+      "epoch": 0.14614,
+      "grad_norm": 0.8920873999595642,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 14614
+    },
+    {
+      "epoch": 0.14615,
+      "grad_norm": 0.8349394798278809,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 14615
+    },
+    {
+      "epoch": 0.14616,
+      "grad_norm": 0.883679211139679,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 14616
+    },
+    {
+      "epoch": 0.14617,
+      "grad_norm": 0.8766427040100098,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 14617
+    },
+    {
+      "epoch": 0.14618,
+      "grad_norm": 1.103816032409668,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 14618
+    },
+    {
+      "epoch": 0.14619,
+      "grad_norm": 1.0742919445037842,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 14619
+    },
+    {
+      "epoch": 0.1462,
+      "grad_norm": 1.0519556999206543,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 14620
+    },
+    {
+      "epoch": 0.14621,
+      "grad_norm": 1.0646867752075195,
+      "learning_rate": 0.003,
+      "loss": 4.0614,
+      "step": 14621
+    },
+    {
+      "epoch": 0.14622,
+      "grad_norm": 1.0145447254180908,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 14622
+    },
+    {
+      "epoch": 0.14623,
+      "grad_norm": 0.9972551465034485,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 14623
+    },
+    {
+      "epoch": 0.14624,
+      "grad_norm": 1.0853716135025024,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 14624
+    },
+    {
+      "epoch": 0.14625,
+      "grad_norm": 0.8484166264533997,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 14625
+    },
+    {
+      "epoch": 0.14626,
+      "grad_norm": 0.7008185386657715,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 14626
+    },
+    {
+      "epoch": 0.14627,
+      "grad_norm": 0.7344235181808472,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 14627
+    },
+    {
+      "epoch": 0.14628,
+      "grad_norm": 0.7429527640342712,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 14628
+    },
+    {
+      "epoch": 0.14629,
+      "grad_norm": 0.6254894137382507,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 14629
+    },
+    {
+      "epoch": 0.1463,
+      "grad_norm": 0.486186683177948,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 14630
+    },
+    {
+      "epoch": 0.14631,
+      "grad_norm": 0.5246158838272095,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 14631
+    },
+    {
+      "epoch": 0.14632,
+      "grad_norm": 0.5575757026672363,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 14632
+    },
+    {
+      "epoch": 0.14633,
+      "grad_norm": 0.6978687644004822,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 14633
+    },
+    {
+      "epoch": 0.14634,
+      "grad_norm": 0.8760482668876648,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 14634
+    },
+    {
+      "epoch": 0.14635,
+      "grad_norm": 1.2433359622955322,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 14635
+    },
+    {
+      "epoch": 0.14636,
+      "grad_norm": 0.7502328753471375,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 14636
+    },
+    {
+      "epoch": 0.14637,
+      "grad_norm": 0.6649960279464722,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 14637
+    },
+    {
+      "epoch": 0.14638,
+      "grad_norm": 0.7799578309059143,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 14638
+    },
+    {
+      "epoch": 0.14639,
+      "grad_norm": 0.857184886932373,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 14639
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 1.1021485328674316,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 14640
+    },
+    {
+      "epoch": 0.14641,
+      "grad_norm": 0.9773330688476562,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 14641
+    },
+    {
+      "epoch": 0.14642,
+      "grad_norm": 0.8717167973518372,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 14642
+    },
+    {
+      "epoch": 0.14643,
+      "grad_norm": 0.8049965500831604,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 14643
+    },
+    {
+      "epoch": 0.14644,
+      "grad_norm": 0.8108642101287842,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 14644
+    },
+    {
+      "epoch": 0.14645,
+      "grad_norm": 0.8317939639091492,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 14645
+    },
+    {
+      "epoch": 0.14646,
+      "grad_norm": 0.8724066019058228,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 14646
+    },
+    {
+      "epoch": 0.14647,
+      "grad_norm": 0.9462814331054688,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 14647
+    },
+    {
+      "epoch": 0.14648,
+      "grad_norm": 0.9973888993263245,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 14648
+    },
+    {
+      "epoch": 0.14649,
+      "grad_norm": 0.8211222887039185,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 14649
+    },
+    {
+      "epoch": 0.1465,
+      "grad_norm": 0.7676868438720703,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 14650
+    },
+    {
+      "epoch": 0.14651,
+      "grad_norm": 0.7277308106422424,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 14651
+    },
+    {
+      "epoch": 0.14652,
+      "grad_norm": 0.7895736694335938,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 14652
+    },
+    {
+      "epoch": 0.14653,
+      "grad_norm": 0.7760275602340698,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 14653
+    },
+    {
+      "epoch": 0.14654,
+      "grad_norm": 0.9125950336456299,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 14654
+    },
+    {
+      "epoch": 0.14655,
+      "grad_norm": 1.0902222394943237,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 14655
+    },
+    {
+      "epoch": 0.14656,
+      "grad_norm": 0.9675227999687195,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 14656
+    },
+    {
+      "epoch": 0.14657,
+      "grad_norm": 0.8446677923202515,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 14657
+    },
+    {
+      "epoch": 0.14658,
+      "grad_norm": 0.7147864699363708,
+      "learning_rate": 0.003,
+      "loss": 3.9794,
+      "step": 14658
+    },
+    {
+      "epoch": 0.14659,
+      "grad_norm": 0.72620689868927,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 14659
+    },
+    {
+      "epoch": 0.1466,
+      "grad_norm": 0.6665274500846863,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 14660
+    },
+    {
+      "epoch": 0.14661,
+      "grad_norm": 0.6929038763046265,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 14661
+    },
+    {
+      "epoch": 0.14662,
+      "grad_norm": 0.6481709480285645,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 14662
+    },
+    {
+      "epoch": 0.14663,
+      "grad_norm": 0.5627266764640808,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 14663
+    },
+    {
+      "epoch": 0.14664,
+      "grad_norm": 0.5991783738136292,
+      "learning_rate": 0.003,
+      "loss": 3.9753,
+      "step": 14664
+    },
+    {
+      "epoch": 0.14665,
+      "grad_norm": 0.5894344449043274,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 14665
+    },
+    {
+      "epoch": 0.14666,
+      "grad_norm": 0.5698339939117432,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 14666
+    },
+    {
+      "epoch": 0.14667,
+      "grad_norm": 0.6349946856498718,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 14667
+    },
+    {
+      "epoch": 0.14668,
+      "grad_norm": 0.7414714097976685,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 14668
+    },
+    {
+      "epoch": 0.14669,
+      "grad_norm": 0.7325112223625183,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 14669
+    },
+    {
+      "epoch": 0.1467,
+      "grad_norm": 0.7556993365287781,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 14670
+    },
+    {
+      "epoch": 0.14671,
+      "grad_norm": 0.8011714816093445,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 14671
+    },
+    {
+      "epoch": 0.14672,
+      "grad_norm": 0.8728906512260437,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 14672
+    },
+    {
+      "epoch": 0.14673,
+      "grad_norm": 0.9702844023704529,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 14673
+    },
+    {
+      "epoch": 0.14674,
+      "grad_norm": 1.0082310438156128,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 14674
+    },
+    {
+      "epoch": 0.14675,
+      "grad_norm": 1.2419564723968506,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 14675
+    },
+    {
+      "epoch": 0.14676,
+      "grad_norm": 0.9880674481391907,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 14676
+    },
+    {
+      "epoch": 0.14677,
+      "grad_norm": 0.8718649744987488,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 14677
+    },
+    {
+      "epoch": 0.14678,
+      "grad_norm": 0.8813468217849731,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 14678
+    },
+    {
+      "epoch": 0.14679,
+      "grad_norm": 0.7749958634376526,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 14679
+    },
+    {
+      "epoch": 0.1468,
+      "grad_norm": 0.7995460033416748,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 14680
+    },
+    {
+      "epoch": 0.14681,
+      "grad_norm": 0.6909242272377014,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 14681
+    },
+    {
+      "epoch": 0.14682,
+      "grad_norm": 0.736655592918396,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 14682
+    },
+    {
+      "epoch": 0.14683,
+      "grad_norm": 0.7802330851554871,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 14683
+    },
+    {
+      "epoch": 0.14684,
+      "grad_norm": 0.8775361180305481,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 14684
+    },
+    {
+      "epoch": 0.14685,
+      "grad_norm": 1.0075017213821411,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 14685
+    },
+    {
+      "epoch": 0.14686,
+      "grad_norm": 1.010310173034668,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 14686
+    },
+    {
+      "epoch": 0.14687,
+      "grad_norm": 0.7436519265174866,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 14687
+    },
+    {
+      "epoch": 0.14688,
+      "grad_norm": 0.788584291934967,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 14688
+    },
+    {
+      "epoch": 0.14689,
+      "grad_norm": 0.776448130607605,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 14689
+    },
+    {
+      "epoch": 0.1469,
+      "grad_norm": 0.7722798585891724,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 14690
+    },
+    {
+      "epoch": 0.14691,
+      "grad_norm": 0.710889995098114,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 14691
+    },
+    {
+      "epoch": 0.14692,
+      "grad_norm": 0.8781268000602722,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 14692
+    },
+    {
+      "epoch": 0.14693,
+      "grad_norm": 1.0924203395843506,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 14693
+    },
+    {
+      "epoch": 0.14694,
+      "grad_norm": 1.026205062866211,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 14694
+    },
+    {
+      "epoch": 0.14695,
+      "grad_norm": 0.9521628022193909,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 14695
+    },
+    {
+      "epoch": 0.14696,
+      "grad_norm": 0.9316068887710571,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 14696
+    },
+    {
+      "epoch": 0.14697,
+      "grad_norm": 0.9264803528785706,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 14697
+    },
+    {
+      "epoch": 0.14698,
+      "grad_norm": 1.1187646389007568,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 14698
+    },
+    {
+      "epoch": 0.14699,
+      "grad_norm": 0.8024663329124451,
+      "learning_rate": 0.003,
+      "loss": 3.9805,
+      "step": 14699
+    },
+    {
+      "epoch": 0.147,
+      "grad_norm": 0.7368111610412598,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 14700
+    },
+    {
+      "epoch": 0.14701,
+      "grad_norm": 0.7123501300811768,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 14701
+    },
+    {
+      "epoch": 0.14702,
+      "grad_norm": 0.7903427481651306,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 14702
+    },
+    {
+      "epoch": 0.14703,
+      "grad_norm": 0.8391876220703125,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 14703
+    },
+    {
+      "epoch": 0.14704,
+      "grad_norm": 0.9227834939956665,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 14704
+    },
+    {
+      "epoch": 0.14705,
+      "grad_norm": 1.0087929964065552,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 14705
+    },
+    {
+      "epoch": 0.14706,
+      "grad_norm": 0.8531105518341064,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 14706
+    },
+    {
+      "epoch": 0.14707,
+      "grad_norm": 0.7724258899688721,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 14707
+    },
+    {
+      "epoch": 0.14708,
+      "grad_norm": 0.7460688352584839,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 14708
+    },
+    {
+      "epoch": 0.14709,
+      "grad_norm": 0.7925257086753845,
+      "learning_rate": 0.003,
+      "loss": 3.9746,
+      "step": 14709
+    },
+    {
+      "epoch": 0.1471,
+      "grad_norm": 0.8145542740821838,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 14710
+    },
+    {
+      "epoch": 0.14711,
+      "grad_norm": 0.8683342933654785,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 14711
+    },
+    {
+      "epoch": 0.14712,
+      "grad_norm": 1.0504648685455322,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 14712
+    },
+    {
+      "epoch": 0.14713,
+      "grad_norm": 1.0538825988769531,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 14713
+    },
+    {
+      "epoch": 0.14714,
+      "grad_norm": 0.9523059129714966,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 14714
+    },
+    {
+      "epoch": 0.14715,
+      "grad_norm": 0.9349806308746338,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 14715
+    },
+    {
+      "epoch": 0.14716,
+      "grad_norm": 0.9929547309875488,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 14716
+    },
+    {
+      "epoch": 0.14717,
+      "grad_norm": 0.9924283027648926,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 14717
+    },
+    {
+      "epoch": 0.14718,
+      "grad_norm": 0.9050933718681335,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 14718
+    },
+    {
+      "epoch": 0.14719,
+      "grad_norm": 0.8614309430122375,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 14719
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.8637762665748596,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 14720
+    },
+    {
+      "epoch": 0.14721,
+      "grad_norm": 0.7075213193893433,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 14721
+    },
+    {
+      "epoch": 0.14722,
+      "grad_norm": 0.6924384832382202,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 14722
+    },
+    {
+      "epoch": 0.14723,
+      "grad_norm": 0.7364385724067688,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 14723
+    },
+    {
+      "epoch": 0.14724,
+      "grad_norm": 0.8216361403465271,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 14724
+    },
+    {
+      "epoch": 0.14725,
+      "grad_norm": 0.8571340441703796,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 14725
+    },
+    {
+      "epoch": 0.14726,
+      "grad_norm": 0.947347104549408,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 14726
+    },
+    {
+      "epoch": 0.14727,
+      "grad_norm": 1.083392858505249,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 14727
+    },
+    {
+      "epoch": 0.14728,
+      "grad_norm": 1.0357502698898315,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 14728
+    },
+    {
+      "epoch": 0.14729,
+      "grad_norm": 0.7941818237304688,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 14729
+    },
+    {
+      "epoch": 0.1473,
+      "grad_norm": 0.692725658416748,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 14730
+    },
+    {
+      "epoch": 0.14731,
+      "grad_norm": 0.6918818354606628,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 14731
+    },
+    {
+      "epoch": 0.14732,
+      "grad_norm": 0.7324870228767395,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 14732
+    },
+    {
+      "epoch": 0.14733,
+      "grad_norm": 0.8361362218856812,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 14733
+    },
+    {
+      "epoch": 0.14734,
+      "grad_norm": 0.8729264736175537,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 14734
+    },
+    {
+      "epoch": 0.14735,
+      "grad_norm": 0.653428316116333,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 14735
+    },
+    {
+      "epoch": 0.14736,
+      "grad_norm": 0.6323630809783936,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 14736
+    },
+    {
+      "epoch": 0.14737,
+      "grad_norm": 0.6049655079841614,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 14737
+    },
+    {
+      "epoch": 0.14738,
+      "grad_norm": 0.5639153718948364,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 14738
+    },
+    {
+      "epoch": 0.14739,
+      "grad_norm": 0.6139823794364929,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 14739
+    },
+    {
+      "epoch": 0.1474,
+      "grad_norm": 0.7757695913314819,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 14740
+    },
+    {
+      "epoch": 0.14741,
+      "grad_norm": 0.9068511128425598,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 14741
+    },
+    {
+      "epoch": 0.14742,
+      "grad_norm": 1.1613571643829346,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 14742
+    },
+    {
+      "epoch": 0.14743,
+      "grad_norm": 0.7322368025779724,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 14743
+    },
+    {
+      "epoch": 0.14744,
+      "grad_norm": 0.5892424583435059,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 14744
+    },
+    {
+      "epoch": 0.14745,
+      "grad_norm": 0.7783081531524658,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 14745
+    },
+    {
+      "epoch": 0.14746,
+      "grad_norm": 0.8132655620574951,
+      "learning_rate": 0.003,
+      "loss": 3.9751,
+      "step": 14746
+    },
+    {
+      "epoch": 0.14747,
+      "grad_norm": 0.8056081533432007,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 14747
+    },
+    {
+      "epoch": 0.14748,
+      "grad_norm": 0.8169584274291992,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 14748
+    },
+    {
+      "epoch": 0.14749,
+      "grad_norm": 0.6788761019706726,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 14749
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.6912890076637268,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 14750
+    },
+    {
+      "epoch": 0.14751,
+      "grad_norm": 0.791756272315979,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 14751
+    },
+    {
+      "epoch": 0.14752,
+      "grad_norm": 0.6415024995803833,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 14752
+    },
+    {
+      "epoch": 0.14753,
+      "grad_norm": 0.5531638264656067,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 14753
+    },
+    {
+      "epoch": 0.14754,
+      "grad_norm": 0.6161421537399292,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 14754
+    },
+    {
+      "epoch": 0.14755,
+      "grad_norm": 0.7031818628311157,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 14755
+    },
+    {
+      "epoch": 0.14756,
+      "grad_norm": 0.7122676968574524,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 14756
+    },
+    {
+      "epoch": 0.14757,
+      "grad_norm": 0.7026879191398621,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 14757
+    },
+    {
+      "epoch": 0.14758,
+      "grad_norm": 0.847398579120636,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 14758
+    },
+    {
+      "epoch": 0.14759,
+      "grad_norm": 1.0359034538269043,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 14759
+    },
+    {
+      "epoch": 0.1476,
+      "grad_norm": 1.1722480058670044,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 14760
+    },
+    {
+      "epoch": 0.14761,
+      "grad_norm": 1.0708212852478027,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 14761
+    },
+    {
+      "epoch": 0.14762,
+      "grad_norm": 1.000401496887207,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 14762
+    },
+    {
+      "epoch": 0.14763,
+      "grad_norm": 0.9598147869110107,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 14763
+    },
+    {
+      "epoch": 0.14764,
+      "grad_norm": 0.9209848046302795,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 14764
+    },
+    {
+      "epoch": 0.14765,
+      "grad_norm": 0.834210216999054,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 14765
+    },
+    {
+      "epoch": 0.14766,
+      "grad_norm": 0.902065098285675,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 14766
+    },
+    {
+      "epoch": 0.14767,
+      "grad_norm": 0.9380612969398499,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 14767
+    },
+    {
+      "epoch": 0.14768,
+      "grad_norm": 1.0841375589370728,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 14768
+    },
+    {
+      "epoch": 0.14769,
+      "grad_norm": 0.9712322950363159,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 14769
+    },
+    {
+      "epoch": 0.1477,
+      "grad_norm": 0.988662600517273,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 14770
+    },
+    {
+      "epoch": 0.14771,
+      "grad_norm": 1.0296447277069092,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 14771
+    },
+    {
+      "epoch": 0.14772,
+      "grad_norm": 0.9799277186393738,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 14772
+    },
+    {
+      "epoch": 0.14773,
+      "grad_norm": 0.9216949343681335,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 14773
+    },
+    {
+      "epoch": 0.14774,
+      "grad_norm": 0.9277490377426147,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 14774
+    },
+    {
+      "epoch": 0.14775,
+      "grad_norm": 1.1422898769378662,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 14775
+    },
+    {
+      "epoch": 0.14776,
+      "grad_norm": 0.8414909839630127,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 14776
+    },
+    {
+      "epoch": 0.14777,
+      "grad_norm": 0.7519655823707581,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 14777
+    },
+    {
+      "epoch": 0.14778,
+      "grad_norm": 0.6935589909553528,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 14778
+    },
+    {
+      "epoch": 0.14779,
+      "grad_norm": 0.632209300994873,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 14779
+    },
+    {
+      "epoch": 0.1478,
+      "grad_norm": 0.6803045272827148,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 14780
+    },
+    {
+      "epoch": 0.14781,
+      "grad_norm": 0.7468591928482056,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 14781
+    },
+    {
+      "epoch": 0.14782,
+      "grad_norm": 0.7592949271202087,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 14782
+    },
+    {
+      "epoch": 0.14783,
+      "grad_norm": 0.8823490142822266,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 14783
+    },
+    {
+      "epoch": 0.14784,
+      "grad_norm": 0.8962957262992859,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 14784
+    },
+    {
+      "epoch": 0.14785,
+      "grad_norm": 0.9850898385047913,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 14785
+    },
+    {
+      "epoch": 0.14786,
+      "grad_norm": 1.0215193033218384,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 14786
+    },
+    {
+      "epoch": 0.14787,
+      "grad_norm": 0.8413836359977722,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 14787
+    },
+    {
+      "epoch": 0.14788,
+      "grad_norm": 0.740068256855011,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 14788
+    },
+    {
+      "epoch": 0.14789,
+      "grad_norm": 0.7006083130836487,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 14789
+    },
+    {
+      "epoch": 0.1479,
+      "grad_norm": 0.7278753519058228,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 14790
+    },
+    {
+      "epoch": 0.14791,
+      "grad_norm": 0.8917559385299683,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 14791
+    },
+    {
+      "epoch": 0.14792,
+      "grad_norm": 0.9600374102592468,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 14792
+    },
+    {
+      "epoch": 0.14793,
+      "grad_norm": 0.9399484992027283,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 14793
+    },
+    {
+      "epoch": 0.14794,
+      "grad_norm": 0.9212305545806885,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 14794
+    },
+    {
+      "epoch": 0.14795,
+      "grad_norm": 0.8454551696777344,
+      "learning_rate": 0.003,
+      "loss": 4.0592,
+      "step": 14795
+    },
+    {
+      "epoch": 0.14796,
+      "grad_norm": 0.8320006728172302,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 14796
+    },
+    {
+      "epoch": 0.14797,
+      "grad_norm": 0.8420716524124146,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 14797
+    },
+    {
+      "epoch": 0.14798,
+      "grad_norm": 0.8744964003562927,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 14798
+    },
+    {
+      "epoch": 0.14799,
+      "grad_norm": 0.9001518487930298,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 14799
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.9593254327774048,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 14800
+    },
+    {
+      "epoch": 0.14801,
+      "grad_norm": 1.0270856618881226,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 14801
+    },
+    {
+      "epoch": 0.14802,
+      "grad_norm": 0.909136950969696,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 14802
+    },
+    {
+      "epoch": 0.14803,
+      "grad_norm": 0.8870131969451904,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 14803
+    },
+    {
+      "epoch": 0.14804,
+      "grad_norm": 0.972500741481781,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 14804
+    },
+    {
+      "epoch": 0.14805,
+      "grad_norm": 1.041571855545044,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 14805
+    },
+    {
+      "epoch": 0.14806,
+      "grad_norm": 0.8508731722831726,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 14806
+    },
+    {
+      "epoch": 0.14807,
+      "grad_norm": 0.7221386432647705,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 14807
+    },
+    {
+      "epoch": 0.14808,
+      "grad_norm": 0.6646813750267029,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 14808
+    },
+    {
+      "epoch": 0.14809,
+      "grad_norm": 0.5410613417625427,
+      "learning_rate": 0.003,
+      "loss": 3.9764,
+      "step": 14809
+    },
+    {
+      "epoch": 0.1481,
+      "grad_norm": 0.5607827305793762,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 14810
+    },
+    {
+      "epoch": 0.14811,
+      "grad_norm": 0.5264371037483215,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 14811
+    },
+    {
+      "epoch": 0.14812,
+      "grad_norm": 0.507453203201294,
+      "learning_rate": 0.003,
+      "loss": 3.9561,
+      "step": 14812
+    },
+    {
+      "epoch": 0.14813,
+      "grad_norm": 0.5576320290565491,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 14813
+    },
+    {
+      "epoch": 0.14814,
+      "grad_norm": 0.6028345227241516,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 14814
+    },
+    {
+      "epoch": 0.14815,
+      "grad_norm": 0.5697914361953735,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 14815
+    },
+    {
+      "epoch": 0.14816,
+      "grad_norm": 0.6025079488754272,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 14816
+    },
+    {
+      "epoch": 0.14817,
+      "grad_norm": 0.6084979772567749,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 14817
+    },
+    {
+      "epoch": 0.14818,
+      "grad_norm": 0.7108429074287415,
+      "learning_rate": 0.003,
+      "loss": 3.9693,
+      "step": 14818
+    },
+    {
+      "epoch": 0.14819,
+      "grad_norm": 0.8598563075065613,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 14819
+    },
+    {
+      "epoch": 0.1482,
+      "grad_norm": 1.1866936683654785,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 14820
+    },
+    {
+      "epoch": 0.14821,
+      "grad_norm": 1.1063987016677856,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 14821
+    },
+    {
+      "epoch": 0.14822,
+      "grad_norm": 0.8805261850357056,
+      "learning_rate": 0.003,
+      "loss": 3.9748,
+      "step": 14822
+    },
+    {
+      "epoch": 0.14823,
+      "grad_norm": 0.7659177780151367,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 14823
+    },
+    {
+      "epoch": 0.14824,
+      "grad_norm": 0.7803784012794495,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 14824
+    },
+    {
+      "epoch": 0.14825,
+      "grad_norm": 0.8470391035079956,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 14825
+    },
+    {
+      "epoch": 0.14826,
+      "grad_norm": 0.8442140817642212,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 14826
+    },
+    {
+      "epoch": 0.14827,
+      "grad_norm": 0.9312122464179993,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 14827
+    },
+    {
+      "epoch": 0.14828,
+      "grad_norm": 1.031015157699585,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 14828
+    },
+    {
+      "epoch": 0.14829,
+      "grad_norm": 1.0510855913162231,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 14829
+    },
+    {
+      "epoch": 0.1483,
+      "grad_norm": 1.0041213035583496,
+      "learning_rate": 0.003,
+      "loss": 3.9771,
+      "step": 14830
+    },
+    {
+      "epoch": 0.14831,
+      "grad_norm": 1.0243691205978394,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 14831
+    },
+    {
+      "epoch": 0.14832,
+      "grad_norm": 0.8999918103218079,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 14832
+    },
+    {
+      "epoch": 0.14833,
+      "grad_norm": 0.7126042246818542,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 14833
+    },
+    {
+      "epoch": 0.14834,
+      "grad_norm": 0.6217748522758484,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 14834
+    },
+    {
+      "epoch": 0.14835,
+      "grad_norm": 0.5969050526618958,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 14835
+    },
+    {
+      "epoch": 0.14836,
+      "grad_norm": 0.6038386821746826,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 14836
+    },
+    {
+      "epoch": 0.14837,
+      "grad_norm": 0.5933994054794312,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 14837
+    },
+    {
+      "epoch": 0.14838,
+      "grad_norm": 0.4869672954082489,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 14838
+    },
+    {
+      "epoch": 0.14839,
+      "grad_norm": 0.5288945436477661,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 14839
+    },
+    {
+      "epoch": 0.1484,
+      "grad_norm": 0.5412532687187195,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 14840
+    },
+    {
+      "epoch": 0.14841,
+      "grad_norm": 0.6282064318656921,
+      "learning_rate": 0.003,
+      "loss": 3.9765,
+      "step": 14841
+    },
+    {
+      "epoch": 0.14842,
+      "grad_norm": 0.7172026634216309,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 14842
+    },
+    {
+      "epoch": 0.14843,
+      "grad_norm": 0.7796453833580017,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 14843
+    },
+    {
+      "epoch": 0.14844,
+      "grad_norm": 0.8600711226463318,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 14844
+    },
+    {
+      "epoch": 0.14845,
+      "grad_norm": 0.8411175608634949,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 14845
+    },
+    {
+      "epoch": 0.14846,
+      "grad_norm": 1.10649836063385,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 14846
+    },
+    {
+      "epoch": 0.14847,
+      "grad_norm": 1.4016904830932617,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 14847
+    },
+    {
+      "epoch": 0.14848,
+      "grad_norm": 0.7915355563163757,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 14848
+    },
+    {
+      "epoch": 0.14849,
+      "grad_norm": 0.7058914303779602,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 14849
+    },
+    {
+      "epoch": 0.1485,
+      "grad_norm": 0.7597223520278931,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 14850
+    },
+    {
+      "epoch": 0.14851,
+      "grad_norm": 0.8670496940612793,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 14851
+    },
+    {
+      "epoch": 0.14852,
+      "grad_norm": 0.7875985503196716,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 14852
+    },
+    {
+      "epoch": 0.14853,
+      "grad_norm": 0.8555645942687988,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 14853
+    },
+    {
+      "epoch": 0.14854,
+      "grad_norm": 1.1159355640411377,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 14854
+    },
+    {
+      "epoch": 0.14855,
+      "grad_norm": 1.0782500505447388,
+      "learning_rate": 0.003,
+      "loss": 4.0398,
+      "step": 14855
+    },
+    {
+      "epoch": 0.14856,
+      "grad_norm": 0.8411812782287598,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 14856
+    },
+    {
+      "epoch": 0.14857,
+      "grad_norm": 0.6487085223197937,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 14857
+    },
+    {
+      "epoch": 0.14858,
+      "grad_norm": 0.6623062491416931,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 14858
+    },
+    {
+      "epoch": 0.14859,
+      "grad_norm": 0.7163119316101074,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 14859
+    },
+    {
+      "epoch": 0.1486,
+      "grad_norm": 0.6611106991767883,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 14860
+    },
+    {
+      "epoch": 0.14861,
+      "grad_norm": 0.6478426456451416,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 14861
+    },
+    {
+      "epoch": 0.14862,
+      "grad_norm": 0.7446329593658447,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 14862
+    },
+    {
+      "epoch": 0.14863,
+      "grad_norm": 0.9106122851371765,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 14863
+    },
+    {
+      "epoch": 0.14864,
+      "grad_norm": 1.1429930925369263,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 14864
+    },
+    {
+      "epoch": 0.14865,
+      "grad_norm": 0.8995509743690491,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 14865
+    },
+    {
+      "epoch": 0.14866,
+      "grad_norm": 0.761750340461731,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 14866
+    },
+    {
+      "epoch": 0.14867,
+      "grad_norm": 0.732969343662262,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 14867
+    },
+    {
+      "epoch": 0.14868,
+      "grad_norm": 0.7871980667114258,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 14868
+    },
+    {
+      "epoch": 0.14869,
+      "grad_norm": 0.7820833921432495,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 14869
+    },
+    {
+      "epoch": 0.1487,
+      "grad_norm": 0.8342604637145996,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 14870
+    },
+    {
+      "epoch": 0.14871,
+      "grad_norm": 0.9378725290298462,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 14871
+    },
+    {
+      "epoch": 0.14872,
+      "grad_norm": 1.1117609739303589,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 14872
+    },
+    {
+      "epoch": 0.14873,
+      "grad_norm": 0.8830320835113525,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 14873
+    },
+    {
+      "epoch": 0.14874,
+      "grad_norm": 0.9355973601341248,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 14874
+    },
+    {
+      "epoch": 0.14875,
+      "grad_norm": 0.9096114635467529,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 14875
+    },
+    {
+      "epoch": 0.14876,
+      "grad_norm": 0.902562141418457,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 14876
+    },
+    {
+      "epoch": 0.14877,
+      "grad_norm": 0.868760883808136,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 14877
+    },
+    {
+      "epoch": 0.14878,
+      "grad_norm": 0.8453792929649353,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 14878
+    },
+    {
+      "epoch": 0.14879,
+      "grad_norm": 0.9315659403800964,
+      "learning_rate": 0.003,
+      "loss": 4.0446,
+      "step": 14879
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.824686586856842,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 14880
+    },
+    {
+      "epoch": 0.14881,
+      "grad_norm": 0.7330214381217957,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 14881
+    },
+    {
+      "epoch": 0.14882,
+      "grad_norm": 0.7923099994659424,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 14882
+    },
+    {
+      "epoch": 0.14883,
+      "grad_norm": 0.8784119486808777,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 14883
+    },
+    {
+      "epoch": 0.14884,
+      "grad_norm": 1.2640788555145264,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 14884
+    },
+    {
+      "epoch": 0.14885,
+      "grad_norm": 0.874704897403717,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 14885
+    },
+    {
+      "epoch": 0.14886,
+      "grad_norm": 0.8032217025756836,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 14886
+    },
+    {
+      "epoch": 0.14887,
+      "grad_norm": 0.8753021955490112,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 14887
+    },
+    {
+      "epoch": 0.14888,
+      "grad_norm": 0.8627346158027649,
+      "learning_rate": 0.003,
+      "loss": 4.0494,
+      "step": 14888
+    },
+    {
+      "epoch": 0.14889,
+      "grad_norm": 0.821229875087738,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 14889
+    },
+    {
+      "epoch": 0.1489,
+      "grad_norm": 0.6824096441268921,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 14890
+    },
+    {
+      "epoch": 0.14891,
+      "grad_norm": 0.6843130588531494,
+      "learning_rate": 0.003,
+      "loss": 3.9889,
+      "step": 14891
+    },
+    {
+      "epoch": 0.14892,
+      "grad_norm": 0.8635883927345276,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 14892
+    },
+    {
+      "epoch": 0.14893,
+      "grad_norm": 0.8831451535224915,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 14893
+    },
+    {
+      "epoch": 0.14894,
+      "grad_norm": 0.9102339744567871,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 14894
+    },
+    {
+      "epoch": 0.14895,
+      "grad_norm": 0.9062256217002869,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 14895
+    },
+    {
+      "epoch": 0.14896,
+      "grad_norm": 0.7547155618667603,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 14896
+    },
+    {
+      "epoch": 0.14897,
+      "grad_norm": 0.8735215067863464,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 14897
+    },
+    {
+      "epoch": 0.14898,
+      "grad_norm": 1.1400668621063232,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 14898
+    },
+    {
+      "epoch": 0.14899,
+      "grad_norm": 0.9181110858917236,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 14899
+    },
+    {
+      "epoch": 0.149,
+      "grad_norm": 0.7998493313789368,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 14900
+    },
+    {
+      "epoch": 0.14901,
+      "grad_norm": 0.8677207231521606,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 14901
+    },
+    {
+      "epoch": 0.14902,
+      "grad_norm": 0.833134651184082,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 14902
+    },
+    {
+      "epoch": 0.14903,
+      "grad_norm": 0.7749120593070984,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 14903
+    },
+    {
+      "epoch": 0.14904,
+      "grad_norm": 0.9325809478759766,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 14904
+    },
+    {
+      "epoch": 0.14905,
+      "grad_norm": 1.0706454515457153,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 14905
+    },
+    {
+      "epoch": 0.14906,
+      "grad_norm": 0.9616243839263916,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 14906
+    },
+    {
+      "epoch": 0.14907,
+      "grad_norm": 0.8771450519561768,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 14907
+    },
+    {
+      "epoch": 0.14908,
+      "grad_norm": 0.824774444103241,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 14908
+    },
+    {
+      "epoch": 0.14909,
+      "grad_norm": 0.8146010041236877,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 14909
+    },
+    {
+      "epoch": 0.1491,
+      "grad_norm": 0.9311041235923767,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 14910
+    },
+    {
+      "epoch": 0.14911,
+      "grad_norm": 0.9459924697875977,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 14911
+    },
+    {
+      "epoch": 0.14912,
+      "grad_norm": 0.9101096391677856,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 14912
+    },
+    {
+      "epoch": 0.14913,
+      "grad_norm": 0.9328768849372864,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 14913
+    },
+    {
+      "epoch": 0.14914,
+      "grad_norm": 0.8582000732421875,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 14914
+    },
+    {
+      "epoch": 0.14915,
+      "grad_norm": 0.8020802736282349,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 14915
+    },
+    {
+      "epoch": 0.14916,
+      "grad_norm": 0.6678415536880493,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 14916
+    },
+    {
+      "epoch": 0.14917,
+      "grad_norm": 0.58196622133255,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 14917
+    },
+    {
+      "epoch": 0.14918,
+      "grad_norm": 0.5718095302581787,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 14918
+    },
+    {
+      "epoch": 0.14919,
+      "grad_norm": 0.5286774039268494,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 14919
+    },
+    {
+      "epoch": 0.1492,
+      "grad_norm": 0.5560582280158997,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 14920
+    },
+    {
+      "epoch": 0.14921,
+      "grad_norm": 0.5656267404556274,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 14921
+    },
+    {
+      "epoch": 0.14922,
+      "grad_norm": 0.583981454372406,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 14922
+    },
+    {
+      "epoch": 0.14923,
+      "grad_norm": 0.6026471257209778,
+      "learning_rate": 0.003,
+      "loss": 3.9761,
+      "step": 14923
+    },
+    {
+      "epoch": 0.14924,
+      "grad_norm": 0.6006498336791992,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 14924
+    },
+    {
+      "epoch": 0.14925,
+      "grad_norm": 0.7609928846359253,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 14925
+    },
+    {
+      "epoch": 0.14926,
+      "grad_norm": 1.1219854354858398,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 14926
+    },
+    {
+      "epoch": 0.14927,
+      "grad_norm": 1.2104982137680054,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 14927
+    },
+    {
+      "epoch": 0.14928,
+      "grad_norm": 0.5168392062187195,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 14928
+    },
+    {
+      "epoch": 0.14929,
+      "grad_norm": 0.7282378077507019,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 14929
+    },
+    {
+      "epoch": 0.1493,
+      "grad_norm": 1.0722233057022095,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 14930
+    },
+    {
+      "epoch": 0.14931,
+      "grad_norm": 1.0388180017471313,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 14931
+    },
+    {
+      "epoch": 0.14932,
+      "grad_norm": 0.8689343929290771,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 14932
+    },
+    {
+      "epoch": 0.14933,
+      "grad_norm": 0.8487166166305542,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 14933
+    },
+    {
+      "epoch": 0.14934,
+      "grad_norm": 0.8658933043479919,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 14934
+    },
+    {
+      "epoch": 0.14935,
+      "grad_norm": 0.9383997321128845,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 14935
+    },
+    {
+      "epoch": 0.14936,
+      "grad_norm": 0.9382808208465576,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 14936
+    },
+    {
+      "epoch": 0.14937,
+      "grad_norm": 0.9391462206840515,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 14937
+    },
+    {
+      "epoch": 0.14938,
+      "grad_norm": 0.8973220586776733,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 14938
+    },
+    {
+      "epoch": 0.14939,
+      "grad_norm": 0.8659802675247192,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 14939
+    },
+    {
+      "epoch": 0.1494,
+      "grad_norm": 0.8968918919563293,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 14940
+    },
+    {
+      "epoch": 0.14941,
+      "grad_norm": 0.8753356337547302,
+      "learning_rate": 0.003,
+      "loss": 4.0548,
+      "step": 14941
+    },
+    {
+      "epoch": 0.14942,
+      "grad_norm": 0.8973208069801331,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 14942
+    },
+    {
+      "epoch": 0.14943,
+      "grad_norm": 0.7997957468032837,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 14943
+    },
+    {
+      "epoch": 0.14944,
+      "grad_norm": 0.6905195116996765,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 14944
+    },
+    {
+      "epoch": 0.14945,
+      "grad_norm": 0.6598191261291504,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 14945
+    },
+    {
+      "epoch": 0.14946,
+      "grad_norm": 0.7957265973091125,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 14946
+    },
+    {
+      "epoch": 0.14947,
+      "grad_norm": 0.7551441788673401,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 14947
+    },
+    {
+      "epoch": 0.14948,
+      "grad_norm": 0.6483982801437378,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 14948
+    },
+    {
+      "epoch": 0.14949,
+      "grad_norm": 0.7106829285621643,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 14949
+    },
+    {
+      "epoch": 0.1495,
+      "grad_norm": 0.8401058316230774,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 14950
+    },
+    {
+      "epoch": 0.14951,
+      "grad_norm": 0.8834985494613647,
+      "learning_rate": 0.003,
+      "loss": 3.9778,
+      "step": 14951
+    },
+    {
+      "epoch": 0.14952,
+      "grad_norm": 1.0601893663406372,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 14952
+    },
+    {
+      "epoch": 0.14953,
+      "grad_norm": 1.2167599201202393,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 14953
+    },
+    {
+      "epoch": 0.14954,
+      "grad_norm": 0.8047480583190918,
+      "learning_rate": 0.003,
+      "loss": 4.0495,
+      "step": 14954
+    },
+    {
+      "epoch": 0.14955,
+      "grad_norm": 0.6802546381950378,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 14955
+    },
+    {
+      "epoch": 0.14956,
+      "grad_norm": 0.6390823721885681,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 14956
+    },
+    {
+      "epoch": 0.14957,
+      "grad_norm": 0.7274983525276184,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 14957
+    },
+    {
+      "epoch": 0.14958,
+      "grad_norm": 0.7369818687438965,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 14958
+    },
+    {
+      "epoch": 0.14959,
+      "grad_norm": 0.7644141912460327,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 14959
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.674062192440033,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 14960
+    },
+    {
+      "epoch": 0.14961,
+      "grad_norm": 0.8109387755393982,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 14961
+    },
+    {
+      "epoch": 0.14962,
+      "grad_norm": 0.8895452618598938,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 14962
+    },
+    {
+      "epoch": 0.14963,
+      "grad_norm": 1.0448538064956665,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 14963
+    },
+    {
+      "epoch": 0.14964,
+      "grad_norm": 1.0570226907730103,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 14964
+    },
+    {
+      "epoch": 0.14965,
+      "grad_norm": 1.0361334085464478,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 14965
+    },
+    {
+      "epoch": 0.14966,
+      "grad_norm": 1.0691248178482056,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 14966
+    },
+    {
+      "epoch": 0.14967,
+      "grad_norm": 0.9369614124298096,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 14967
+    },
+    {
+      "epoch": 0.14968,
+      "grad_norm": 0.911221981048584,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 14968
+    },
+    {
+      "epoch": 0.14969,
+      "grad_norm": 0.8358284831047058,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 14969
+    },
+    {
+      "epoch": 0.1497,
+      "grad_norm": 0.7841851115226746,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 14970
+    },
+    {
+      "epoch": 0.14971,
+      "grad_norm": 0.8406400084495544,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 14971
+    },
+    {
+      "epoch": 0.14972,
+      "grad_norm": 0.9173609614372253,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 14972
+    },
+    {
+      "epoch": 0.14973,
+      "grad_norm": 0.8612219095230103,
+      "learning_rate": 0.003,
+      "loss": 3.9713,
+      "step": 14973
+    },
+    {
+      "epoch": 0.14974,
+      "grad_norm": 1.0527944564819336,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 14974
+    },
+    {
+      "epoch": 0.14975,
+      "grad_norm": 1.1495803594589233,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 14975
+    },
+    {
+      "epoch": 0.14976,
+      "grad_norm": 0.8519269227981567,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 14976
+    },
+    {
+      "epoch": 0.14977,
+      "grad_norm": 0.6940068006515503,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 14977
+    },
+    {
+      "epoch": 0.14978,
+      "grad_norm": 0.6137828826904297,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 14978
+    },
+    {
+      "epoch": 0.14979,
+      "grad_norm": 0.7167901396751404,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 14979
+    },
+    {
+      "epoch": 0.1498,
+      "grad_norm": 0.736061692237854,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 14980
+    },
+    {
+      "epoch": 0.14981,
+      "grad_norm": 0.7785318493843079,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 14981
+    },
+    {
+      "epoch": 0.14982,
+      "grad_norm": 0.7325731515884399,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 14982
+    },
+    {
+      "epoch": 0.14983,
+      "grad_norm": 0.7698650360107422,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 14983
+    },
+    {
+      "epoch": 0.14984,
+      "grad_norm": 0.9054916501045227,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 14984
+    },
+    {
+      "epoch": 0.14985,
+      "grad_norm": 0.8448771834373474,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 14985
+    },
+    {
+      "epoch": 0.14986,
+      "grad_norm": 0.7488740682601929,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 14986
+    },
+    {
+      "epoch": 0.14987,
+      "grad_norm": 0.7946826219558716,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 14987
+    },
+    {
+      "epoch": 0.14988,
+      "grad_norm": 0.8177905082702637,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 14988
+    },
+    {
+      "epoch": 0.14989,
+      "grad_norm": 0.9731899499893188,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 14989
+    },
+    {
+      "epoch": 0.1499,
+      "grad_norm": 1.1797256469726562,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 14990
+    },
+    {
+      "epoch": 0.14991,
+      "grad_norm": 0.6459851264953613,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 14991
+    },
+    {
+      "epoch": 0.14992,
+      "grad_norm": 0.4921308755874634,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 14992
+    },
+    {
+      "epoch": 0.14993,
+      "grad_norm": 0.5904116630554199,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 14993
+    },
+    {
+      "epoch": 0.14994,
+      "grad_norm": 0.6307472586631775,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 14994
+    },
+    {
+      "epoch": 0.14995,
+      "grad_norm": 0.6794448494911194,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 14995
+    },
+    {
+      "epoch": 0.14996,
+      "grad_norm": 0.8052518963813782,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 14996
+    },
+    {
+      "epoch": 0.14997,
+      "grad_norm": 0.9949678778648376,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 14997
+    },
+    {
+      "epoch": 0.14998,
+      "grad_norm": 1.0166358947753906,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 14998
+    },
+    {
+      "epoch": 0.14999,
+      "grad_norm": 0.9890971779823303,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 14999
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.0871403217315674,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 15000
+    },
+    {
+      "epoch": 0.15001,
+      "grad_norm": 0.8645810484886169,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 15001
+    },
+    {
+      "epoch": 0.15002,
+      "grad_norm": 0.702522873878479,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 15002
+    },
+    {
+      "epoch": 0.15003,
+      "grad_norm": 0.7483938932418823,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 15003
+    },
+    {
+      "epoch": 0.15004,
+      "grad_norm": 1.0146819353103638,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 15004
+    },
+    {
+      "epoch": 0.15005,
+      "grad_norm": 1.0388811826705933,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 15005
+    },
+    {
+      "epoch": 0.15006,
+      "grad_norm": 0.8578092455863953,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 15006
+    },
+    {
+      "epoch": 0.15007,
+      "grad_norm": 0.8312777876853943,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 15007
+    },
+    {
+      "epoch": 0.15008,
+      "grad_norm": 0.7816850543022156,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 15008
+    },
+    {
+      "epoch": 0.15009,
+      "grad_norm": 0.6976773142814636,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 15009
+    },
+    {
+      "epoch": 0.1501,
+      "grad_norm": 0.6751989126205444,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 15010
+    },
+    {
+      "epoch": 0.15011,
+      "grad_norm": 0.7083759307861328,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 15011
+    },
+    {
+      "epoch": 0.15012,
+      "grad_norm": 0.6603441834449768,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 15012
+    },
+    {
+      "epoch": 0.15013,
+      "grad_norm": 0.6130689978599548,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 15013
+    },
+    {
+      "epoch": 0.15014,
+      "grad_norm": 0.6485565900802612,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 15014
+    },
+    {
+      "epoch": 0.15015,
+      "grad_norm": 0.7764594554901123,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 15015
+    },
+    {
+      "epoch": 0.15016,
+      "grad_norm": 0.9251732230186462,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 15016
+    },
+    {
+      "epoch": 0.15017,
+      "grad_norm": 0.9680980443954468,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 15017
+    },
+    {
+      "epoch": 0.15018,
+      "grad_norm": 0.89886474609375,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 15018
+    },
+    {
+      "epoch": 0.15019,
+      "grad_norm": 0.8506682515144348,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 15019
+    },
+    {
+      "epoch": 0.1502,
+      "grad_norm": 0.9167236089706421,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 15020
+    },
+    {
+      "epoch": 0.15021,
+      "grad_norm": 0.9710371494293213,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 15021
+    },
+    {
+      "epoch": 0.15022,
+      "grad_norm": 0.9589751958847046,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 15022
+    },
+    {
+      "epoch": 0.15023,
+      "grad_norm": 0.9774499535560608,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 15023
+    },
+    {
+      "epoch": 0.15024,
+      "grad_norm": 0.9559521079063416,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 15024
+    },
+    {
+      "epoch": 0.15025,
+      "grad_norm": 1.0851106643676758,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 15025
+    },
+    {
+      "epoch": 0.15026,
+      "grad_norm": 0.9278112649917603,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 15026
+    },
+    {
+      "epoch": 0.15027,
+      "grad_norm": 0.900368332862854,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 15027
+    },
+    {
+      "epoch": 0.15028,
+      "grad_norm": 0.7963383793830872,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 15028
+    },
+    {
+      "epoch": 0.15029,
+      "grad_norm": 0.7205212116241455,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 15029
+    },
+    {
+      "epoch": 0.1503,
+      "grad_norm": 0.7486795783042908,
+      "learning_rate": 0.003,
+      "loss": 3.9759,
+      "step": 15030
+    },
+    {
+      "epoch": 0.15031,
+      "grad_norm": 0.6926259398460388,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 15031
+    },
+    {
+      "epoch": 0.15032,
+      "grad_norm": 0.6792514324188232,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 15032
+    },
+    {
+      "epoch": 0.15033,
+      "grad_norm": 0.5921057462692261,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 15033
+    },
+    {
+      "epoch": 0.15034,
+      "grad_norm": 0.7386751770973206,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 15034
+    },
+    {
+      "epoch": 0.15035,
+      "grad_norm": 0.725128710269928,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 15035
+    },
+    {
+      "epoch": 0.15036,
+      "grad_norm": 0.8471537828445435,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 15036
+    },
+    {
+      "epoch": 0.15037,
+      "grad_norm": 0.8906320333480835,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 15037
+    },
+    {
+      "epoch": 0.15038,
+      "grad_norm": 0.9164702296257019,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 15038
+    },
+    {
+      "epoch": 0.15039,
+      "grad_norm": 0.9383320212364197,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 15039
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 1.0033183097839355,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 15040
+    },
+    {
+      "epoch": 0.15041,
+      "grad_norm": 1.1808382272720337,
+      "learning_rate": 0.003,
+      "loss": 4.0674,
+      "step": 15041
+    },
+    {
+      "epoch": 0.15042,
+      "grad_norm": 0.9112935066223145,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 15042
+    },
+    {
+      "epoch": 0.15043,
+      "grad_norm": 0.8786137104034424,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 15043
+    },
+    {
+      "epoch": 0.15044,
+      "grad_norm": 0.9137707948684692,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15044
+    },
+    {
+      "epoch": 0.15045,
+      "grad_norm": 0.8492369055747986,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 15045
+    },
+    {
+      "epoch": 0.15046,
+      "grad_norm": 0.8525924682617188,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 15046
+    },
+    {
+      "epoch": 0.15047,
+      "grad_norm": 0.8138546347618103,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 15047
+    },
+    {
+      "epoch": 0.15048,
+      "grad_norm": 0.8754577040672302,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 15048
+    },
+    {
+      "epoch": 0.15049,
+      "grad_norm": 1.0250927209854126,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 15049
+    },
+    {
+      "epoch": 0.1505,
+      "grad_norm": 1.0228712558746338,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 15050
+    },
+    {
+      "epoch": 0.15051,
+      "grad_norm": 0.8266756534576416,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 15051
+    },
+    {
+      "epoch": 0.15052,
+      "grad_norm": 0.6580228209495544,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 15052
+    },
+    {
+      "epoch": 0.15053,
+      "grad_norm": 0.5499899983406067,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 15053
+    },
+    {
+      "epoch": 0.15054,
+      "grad_norm": 0.7129097580909729,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 15054
+    },
+    {
+      "epoch": 0.15055,
+      "grad_norm": 0.7538021802902222,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 15055
+    },
+    {
+      "epoch": 0.15056,
+      "grad_norm": 0.7968039512634277,
+      "learning_rate": 0.003,
+      "loss": 3.9663,
+      "step": 15056
+    },
+    {
+      "epoch": 0.15057,
+      "grad_norm": 0.8896482586860657,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 15057
+    },
+    {
+      "epoch": 0.15058,
+      "grad_norm": 0.9867523908615112,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 15058
+    },
+    {
+      "epoch": 0.15059,
+      "grad_norm": 1.057076334953308,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 15059
+    },
+    {
+      "epoch": 0.1506,
+      "grad_norm": 0.9541208148002625,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 15060
+    },
+    {
+      "epoch": 0.15061,
+      "grad_norm": 0.9236926436424255,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 15061
+    },
+    {
+      "epoch": 0.15062,
+      "grad_norm": 0.8116821050643921,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 15062
+    },
+    {
+      "epoch": 0.15063,
+      "grad_norm": 0.7856393456459045,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 15063
+    },
+    {
+      "epoch": 0.15064,
+      "grad_norm": 0.6679021120071411,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 15064
+    },
+    {
+      "epoch": 0.15065,
+      "grad_norm": 0.6571440100669861,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 15065
+    },
+    {
+      "epoch": 0.15066,
+      "grad_norm": 0.7009978890419006,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 15066
+    },
+    {
+      "epoch": 0.15067,
+      "grad_norm": 0.8476483821868896,
+      "learning_rate": 0.003,
+      "loss": 4.0449,
+      "step": 15067
+    },
+    {
+      "epoch": 0.15068,
+      "grad_norm": 0.9449343681335449,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 15068
+    },
+    {
+      "epoch": 0.15069,
+      "grad_norm": 1.0391207933425903,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 15069
+    },
+    {
+      "epoch": 0.1507,
+      "grad_norm": 0.9836397767066956,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 15070
+    },
+    {
+      "epoch": 0.15071,
+      "grad_norm": 0.9063265323638916,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 15071
+    },
+    {
+      "epoch": 0.15072,
+      "grad_norm": 0.8169854879379272,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 15072
+    },
+    {
+      "epoch": 0.15073,
+      "grad_norm": 0.6318926811218262,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 15073
+    },
+    {
+      "epoch": 0.15074,
+      "grad_norm": 0.6150188446044922,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15074
+    },
+    {
+      "epoch": 0.15075,
+      "grad_norm": 0.730800449848175,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 15075
+    },
+    {
+      "epoch": 0.15076,
+      "grad_norm": 0.7448334693908691,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 15076
+    },
+    {
+      "epoch": 0.15077,
+      "grad_norm": 0.6553254723548889,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 15077
+    },
+    {
+      "epoch": 0.15078,
+      "grad_norm": 0.7216061353683472,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 15078
+    },
+    {
+      "epoch": 0.15079,
+      "grad_norm": 0.9148663282394409,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 15079
+    },
+    {
+      "epoch": 0.1508,
+      "grad_norm": 1.18192458152771,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 15080
+    },
+    {
+      "epoch": 0.15081,
+      "grad_norm": 0.7886486053466797,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 15081
+    },
+    {
+      "epoch": 0.15082,
+      "grad_norm": 0.6921918988227844,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 15082
+    },
+    {
+      "epoch": 0.15083,
+      "grad_norm": 0.8153104186058044,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 15083
+    },
+    {
+      "epoch": 0.15084,
+      "grad_norm": 0.8307874798774719,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 15084
+    },
+    {
+      "epoch": 0.15085,
+      "grad_norm": 0.7925188541412354,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 15085
+    },
+    {
+      "epoch": 0.15086,
+      "grad_norm": 0.8499267101287842,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 15086
+    },
+    {
+      "epoch": 0.15087,
+      "grad_norm": 1.0056763887405396,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 15087
+    },
+    {
+      "epoch": 0.15088,
+      "grad_norm": 0.960319459438324,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 15088
+    },
+    {
+      "epoch": 0.15089,
+      "grad_norm": 0.9148436784744263,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 15089
+    },
+    {
+      "epoch": 0.1509,
+      "grad_norm": 0.9970728158950806,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 15090
+    },
+    {
+      "epoch": 0.15091,
+      "grad_norm": 0.9315593838691711,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 15091
+    },
+    {
+      "epoch": 0.15092,
+      "grad_norm": 0.8441429138183594,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 15092
+    },
+    {
+      "epoch": 0.15093,
+      "grad_norm": 0.646895706653595,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 15093
+    },
+    {
+      "epoch": 0.15094,
+      "grad_norm": 0.5070883631706238,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 15094
+    },
+    {
+      "epoch": 0.15095,
+      "grad_norm": 0.4848787486553192,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 15095
+    },
+    {
+      "epoch": 0.15096,
+      "grad_norm": 0.5808219909667969,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 15096
+    },
+    {
+      "epoch": 0.15097,
+      "grad_norm": 0.6804661750793457,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 15097
+    },
+    {
+      "epoch": 0.15098,
+      "grad_norm": 0.8220118880271912,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 15098
+    },
+    {
+      "epoch": 0.15099,
+      "grad_norm": 0.9969527721405029,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 15099
+    },
+    {
+      "epoch": 0.151,
+      "grad_norm": 1.0243529081344604,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 15100
+    },
+    {
+      "epoch": 0.15101,
+      "grad_norm": 0.8672932982444763,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 15101
+    },
+    {
+      "epoch": 0.15102,
+      "grad_norm": 1.051192283630371,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 15102
+    },
+    {
+      "epoch": 0.15103,
+      "grad_norm": 0.9607369899749756,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 15103
+    },
+    {
+      "epoch": 0.15104,
+      "grad_norm": 1.0513368844985962,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 15104
+    },
+    {
+      "epoch": 0.15105,
+      "grad_norm": 1.097847819328308,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 15105
+    },
+    {
+      "epoch": 0.15106,
+      "grad_norm": 0.9191159605979919,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 15106
+    },
+    {
+      "epoch": 0.15107,
+      "grad_norm": 0.8986817598342896,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 15107
+    },
+    {
+      "epoch": 0.15108,
+      "grad_norm": 0.8361642956733704,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 15108
+    },
+    {
+      "epoch": 0.15109,
+      "grad_norm": 0.8894073367118835,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 15109
+    },
+    {
+      "epoch": 0.1511,
+      "grad_norm": 0.9681217670440674,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 15110
+    },
+    {
+      "epoch": 0.15111,
+      "grad_norm": 1.0168449878692627,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 15111
+    },
+    {
+      "epoch": 0.15112,
+      "grad_norm": 0.839482843875885,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 15112
+    },
+    {
+      "epoch": 0.15113,
+      "grad_norm": 0.7267372012138367,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 15113
+    },
+    {
+      "epoch": 0.15114,
+      "grad_norm": 0.6676825881004333,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 15114
+    },
+    {
+      "epoch": 0.15115,
+      "grad_norm": 0.7883790135383606,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 15115
+    },
+    {
+      "epoch": 0.15116,
+      "grad_norm": 0.9439095258712769,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 15116
+    },
+    {
+      "epoch": 0.15117,
+      "grad_norm": 1.1422020196914673,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 15117
+    },
+    {
+      "epoch": 0.15118,
+      "grad_norm": 0.6147944331169128,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 15118
+    },
+    {
+      "epoch": 0.15119,
+      "grad_norm": 0.4783836603164673,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 15119
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.6159612536430359,
+      "learning_rate": 0.003,
+      "loss": 4.0366,
+      "step": 15120
+    },
+    {
+      "epoch": 0.15121,
+      "grad_norm": 0.6156262159347534,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 15121
+    },
+    {
+      "epoch": 0.15122,
+      "grad_norm": 0.596961259841919,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 15122
+    },
+    {
+      "epoch": 0.15123,
+      "grad_norm": 0.5993465185165405,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 15123
+    },
+    {
+      "epoch": 0.15124,
+      "grad_norm": 0.4942002296447754,
+      "learning_rate": 0.003,
+      "loss": 3.9755,
+      "step": 15124
+    },
+    {
+      "epoch": 0.15125,
+      "grad_norm": 0.5079962611198425,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 15125
+    },
+    {
+      "epoch": 0.15126,
+      "grad_norm": 0.5626096725463867,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 15126
+    },
+    {
+      "epoch": 0.15127,
+      "grad_norm": 0.6039233207702637,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 15127
+    },
+    {
+      "epoch": 0.15128,
+      "grad_norm": 0.5964253544807434,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 15128
+    },
+    {
+      "epoch": 0.15129,
+      "grad_norm": 0.7165008187294006,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 15129
+    },
+    {
+      "epoch": 0.1513,
+      "grad_norm": 0.9771832823753357,
+      "learning_rate": 0.003,
+      "loss": 3.9813,
+      "step": 15130
+    },
+    {
+      "epoch": 0.15131,
+      "grad_norm": 1.2352522611618042,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 15131
+    },
+    {
+      "epoch": 0.15132,
+      "grad_norm": 0.7619431614875793,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 15132
+    },
+    {
+      "epoch": 0.15133,
+      "grad_norm": 0.6206051707267761,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 15133
+    },
+    {
+      "epoch": 0.15134,
+      "grad_norm": 0.6780843734741211,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 15134
+    },
+    {
+      "epoch": 0.15135,
+      "grad_norm": 0.8164312243461609,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 15135
+    },
+    {
+      "epoch": 0.15136,
+      "grad_norm": 0.9336850643157959,
+      "learning_rate": 0.003,
+      "loss": 3.9716,
+      "step": 15136
+    },
+    {
+      "epoch": 0.15137,
+      "grad_norm": 1.0265147686004639,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 15137
+    },
+    {
+      "epoch": 0.15138,
+      "grad_norm": 1.0117080211639404,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 15138
+    },
+    {
+      "epoch": 0.15139,
+      "grad_norm": 0.806343138217926,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 15139
+    },
+    {
+      "epoch": 0.1514,
+      "grad_norm": 0.8026455044746399,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 15140
+    },
+    {
+      "epoch": 0.15141,
+      "grad_norm": 0.8498087525367737,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 15141
+    },
+    {
+      "epoch": 0.15142,
+      "grad_norm": 0.9997002482414246,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 15142
+    },
+    {
+      "epoch": 0.15143,
+      "grad_norm": 1.2353754043579102,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 15143
+    },
+    {
+      "epoch": 0.15144,
+      "grad_norm": 0.7741185426712036,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 15144
+    },
+    {
+      "epoch": 0.15145,
+      "grad_norm": 0.6971248388290405,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 15145
+    },
+    {
+      "epoch": 0.15146,
+      "grad_norm": 0.7391877770423889,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 15146
+    },
+    {
+      "epoch": 0.15147,
+      "grad_norm": 0.8225349187850952,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 15147
+    },
+    {
+      "epoch": 0.15148,
+      "grad_norm": 0.9799991250038147,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 15148
+    },
+    {
+      "epoch": 0.15149,
+      "grad_norm": 1.0774352550506592,
+      "learning_rate": 0.003,
+      "loss": 4.0437,
+      "step": 15149
+    },
+    {
+      "epoch": 0.1515,
+      "grad_norm": 0.8947973251342773,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 15150
+    },
+    {
+      "epoch": 0.15151,
+      "grad_norm": 0.7971606850624084,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 15151
+    },
+    {
+      "epoch": 0.15152,
+      "grad_norm": 0.7421729564666748,
+      "learning_rate": 0.003,
+      "loss": 4.0555,
+      "step": 15152
+    },
+    {
+      "epoch": 0.15153,
+      "grad_norm": 0.6884970664978027,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 15153
+    },
+    {
+      "epoch": 0.15154,
+      "grad_norm": 0.6970769166946411,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 15154
+    },
+    {
+      "epoch": 0.15155,
+      "grad_norm": 0.7541738748550415,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 15155
+    },
+    {
+      "epoch": 0.15156,
+      "grad_norm": 0.9252972602844238,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 15156
+    },
+    {
+      "epoch": 0.15157,
+      "grad_norm": 1.024196982383728,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 15157
+    },
+    {
+      "epoch": 0.15158,
+      "grad_norm": 0.998375415802002,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 15158
+    },
+    {
+      "epoch": 0.15159,
+      "grad_norm": 0.9763141870498657,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 15159
+    },
+    {
+      "epoch": 0.1516,
+      "grad_norm": 1.0023225545883179,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 15160
+    },
+    {
+      "epoch": 0.15161,
+      "grad_norm": 1.0663807392120361,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 15161
+    },
+    {
+      "epoch": 0.15162,
+      "grad_norm": 0.9437249302864075,
+      "learning_rate": 0.003,
+      "loss": 4.052,
+      "step": 15162
+    },
+    {
+      "epoch": 0.15163,
+      "grad_norm": 1.0163962841033936,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 15163
+    },
+    {
+      "epoch": 0.15164,
+      "grad_norm": 1.1673986911773682,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 15164
+    },
+    {
+      "epoch": 0.15165,
+      "grad_norm": 0.882068395614624,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 15165
+    },
+    {
+      "epoch": 0.15166,
+      "grad_norm": 0.7180541753768921,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 15166
+    },
+    {
+      "epoch": 0.15167,
+      "grad_norm": 0.6750436425209045,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 15167
+    },
+    {
+      "epoch": 0.15168,
+      "grad_norm": 0.6546518802642822,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 15168
+    },
+    {
+      "epoch": 0.15169,
+      "grad_norm": 0.6339498162269592,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 15169
+    },
+    {
+      "epoch": 0.1517,
+      "grad_norm": 0.5468326210975647,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 15170
+    },
+    {
+      "epoch": 0.15171,
+      "grad_norm": 0.5912162065505981,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 15171
+    },
+    {
+      "epoch": 0.15172,
+      "grad_norm": 0.6342115998268127,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 15172
+    },
+    {
+      "epoch": 0.15173,
+      "grad_norm": 0.6179934740066528,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 15173
+    },
+    {
+      "epoch": 0.15174,
+      "grad_norm": 0.630556583404541,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 15174
+    },
+    {
+      "epoch": 0.15175,
+      "grad_norm": 0.6733129024505615,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 15175
+    },
+    {
+      "epoch": 0.15176,
+      "grad_norm": 0.837215006351471,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 15176
+    },
+    {
+      "epoch": 0.15177,
+      "grad_norm": 1.103715181350708,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 15177
+    },
+    {
+      "epoch": 0.15178,
+      "grad_norm": 1.0142706632614136,
+      "learning_rate": 0.003,
+      "loss": 3.9814,
+      "step": 15178
+    },
+    {
+      "epoch": 0.15179,
+      "grad_norm": 1.1049751043319702,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 15179
+    },
+    {
+      "epoch": 0.1518,
+      "grad_norm": 0.9215617775917053,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 15180
+    },
+    {
+      "epoch": 0.15181,
+      "grad_norm": 0.9071481823921204,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 15181
+    },
+    {
+      "epoch": 0.15182,
+      "grad_norm": 0.9090131521224976,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 15182
+    },
+    {
+      "epoch": 0.15183,
+      "grad_norm": 0.823883593082428,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 15183
+    },
+    {
+      "epoch": 0.15184,
+      "grad_norm": 0.8454837203025818,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 15184
+    },
+    {
+      "epoch": 0.15185,
+      "grad_norm": 0.7196727395057678,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 15185
+    },
+    {
+      "epoch": 0.15186,
+      "grad_norm": 0.8515127897262573,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 15186
+    },
+    {
+      "epoch": 0.15187,
+      "grad_norm": 0.847205638885498,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 15187
+    },
+    {
+      "epoch": 0.15188,
+      "grad_norm": 0.8074892163276672,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 15188
+    },
+    {
+      "epoch": 0.15189,
+      "grad_norm": 0.797053873538971,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 15189
+    },
+    {
+      "epoch": 0.1519,
+      "grad_norm": 0.9285748600959778,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 15190
+    },
+    {
+      "epoch": 0.15191,
+      "grad_norm": 0.9438667297363281,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 15191
+    },
+    {
+      "epoch": 0.15192,
+      "grad_norm": 0.7593942880630493,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 15192
+    },
+    {
+      "epoch": 0.15193,
+      "grad_norm": 0.7198943495750427,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 15193
+    },
+    {
+      "epoch": 0.15194,
+      "grad_norm": 0.6867886781692505,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 15194
+    },
+    {
+      "epoch": 0.15195,
+      "grad_norm": 0.6413894295692444,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 15195
+    },
+    {
+      "epoch": 0.15196,
+      "grad_norm": 0.8065673112869263,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 15196
+    },
+    {
+      "epoch": 0.15197,
+      "grad_norm": 0.8199344277381897,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 15197
+    },
+    {
+      "epoch": 0.15198,
+      "grad_norm": 0.7948529720306396,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 15198
+    },
+    {
+      "epoch": 0.15199,
+      "grad_norm": 0.7588329911231995,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 15199
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.759814441204071,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 15200
+    },
+    {
+      "epoch": 0.15201,
+      "grad_norm": 0.8657602071762085,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 15201
+    },
+    {
+      "epoch": 0.15202,
+      "grad_norm": 0.809902548789978,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 15202
+    },
+    {
+      "epoch": 0.15203,
+      "grad_norm": 0.7706008553504944,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 15203
+    },
+    {
+      "epoch": 0.15204,
+      "grad_norm": 0.8675158023834229,
+      "learning_rate": 0.003,
+      "loss": 4.0442,
+      "step": 15204
+    },
+    {
+      "epoch": 0.15205,
+      "grad_norm": 0.9507919549942017,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 15205
+    },
+    {
+      "epoch": 0.15206,
+      "grad_norm": 0.9662113189697266,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 15206
+    },
+    {
+      "epoch": 0.15207,
+      "grad_norm": 0.886137843132019,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 15207
+    },
+    {
+      "epoch": 0.15208,
+      "grad_norm": 0.8959958553314209,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 15208
+    },
+    {
+      "epoch": 0.15209,
+      "grad_norm": 0.9112538695335388,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 15209
+    },
+    {
+      "epoch": 0.1521,
+      "grad_norm": 1.0017379522323608,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 15210
+    },
+    {
+      "epoch": 0.15211,
+      "grad_norm": 1.1133248805999756,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 15211
+    },
+    {
+      "epoch": 0.15212,
+      "grad_norm": 0.7938923239707947,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 15212
+    },
+    {
+      "epoch": 0.15213,
+      "grad_norm": 0.7378742694854736,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 15213
+    },
+    {
+      "epoch": 0.15214,
+      "grad_norm": 0.8953700661659241,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 15214
+    },
+    {
+      "epoch": 0.15215,
+      "grad_norm": 0.9183306694030762,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 15215
+    },
+    {
+      "epoch": 0.15216,
+      "grad_norm": 0.9317597150802612,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 15216
+    },
+    {
+      "epoch": 0.15217,
+      "grad_norm": 0.9850735664367676,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 15217
+    },
+    {
+      "epoch": 0.15218,
+      "grad_norm": 1.0636837482452393,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 15218
+    },
+    {
+      "epoch": 0.15219,
+      "grad_norm": 1.0994192361831665,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 15219
+    },
+    {
+      "epoch": 0.1522,
+      "grad_norm": 1.2374557256698608,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 15220
+    },
+    {
+      "epoch": 0.15221,
+      "grad_norm": 0.9231405258178711,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 15221
+    },
+    {
+      "epoch": 0.15222,
+      "grad_norm": 0.9628676772117615,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 15222
+    },
+    {
+      "epoch": 0.15223,
+      "grad_norm": 0.9543718695640564,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 15223
+    },
+    {
+      "epoch": 0.15224,
+      "grad_norm": 1.016852617263794,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 15224
+    },
+    {
+      "epoch": 0.15225,
+      "grad_norm": 0.9168770909309387,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 15225
+    },
+    {
+      "epoch": 0.15226,
+      "grad_norm": 0.6900375485420227,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 15226
+    },
+    {
+      "epoch": 0.15227,
+      "grad_norm": 0.5902383327484131,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 15227
+    },
+    {
+      "epoch": 0.15228,
+      "grad_norm": 0.5826788544654846,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 15228
+    },
+    {
+      "epoch": 0.15229,
+      "grad_norm": 0.5453304648399353,
+      "learning_rate": 0.003,
+      "loss": 3.9732,
+      "step": 15229
+    },
+    {
+      "epoch": 0.1523,
+      "grad_norm": 0.4619797170162201,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 15230
+    },
+    {
+      "epoch": 0.15231,
+      "grad_norm": 0.45512986183166504,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 15231
+    },
+    {
+      "epoch": 0.15232,
+      "grad_norm": 0.4618683457374573,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 15232
+    },
+    {
+      "epoch": 0.15233,
+      "grad_norm": 0.5123746991157532,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 15233
+    },
+    {
+      "epoch": 0.15234,
+      "grad_norm": 0.6983823180198669,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 15234
+    },
+    {
+      "epoch": 0.15235,
+      "grad_norm": 1.0241881608963013,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 15235
+    },
+    {
+      "epoch": 0.15236,
+      "grad_norm": 1.2229256629943848,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 15236
+    },
+    {
+      "epoch": 0.15237,
+      "grad_norm": 0.7741057276725769,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 15237
+    },
+    {
+      "epoch": 0.15238,
+      "grad_norm": 0.6281371116638184,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 15238
+    },
+    {
+      "epoch": 0.15239,
+      "grad_norm": 0.6690221428871155,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 15239
+    },
+    {
+      "epoch": 0.1524,
+      "grad_norm": 0.7069966197013855,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 15240
+    },
+    {
+      "epoch": 0.15241,
+      "grad_norm": 0.6972704529762268,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 15241
+    },
+    {
+      "epoch": 0.15242,
+      "grad_norm": 0.6901689767837524,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 15242
+    },
+    {
+      "epoch": 0.15243,
+      "grad_norm": 0.7443487048149109,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 15243
+    },
+    {
+      "epoch": 0.15244,
+      "grad_norm": 0.7118276357650757,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 15244
+    },
+    {
+      "epoch": 0.15245,
+      "grad_norm": 0.7309918403625488,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 15245
+    },
+    {
+      "epoch": 0.15246,
+      "grad_norm": 0.7601751089096069,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 15246
+    },
+    {
+      "epoch": 0.15247,
+      "grad_norm": 0.8328369855880737,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 15247
+    },
+    {
+      "epoch": 0.15248,
+      "grad_norm": 0.9573512673377991,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 15248
+    },
+    {
+      "epoch": 0.15249,
+      "grad_norm": 0.9210628867149353,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 15249
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.8686948418617249,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 15250
+    },
+    {
+      "epoch": 0.15251,
+      "grad_norm": 0.8697372078895569,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 15251
+    },
+    {
+      "epoch": 0.15252,
+      "grad_norm": 0.8911150693893433,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 15252
+    },
+    {
+      "epoch": 0.15253,
+      "grad_norm": 0.8357546329498291,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 15253
+    },
+    {
+      "epoch": 0.15254,
+      "grad_norm": 0.7480366230010986,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 15254
+    },
+    {
+      "epoch": 0.15255,
+      "grad_norm": 0.6970673203468323,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 15255
+    },
+    {
+      "epoch": 0.15256,
+      "grad_norm": 0.6920578479766846,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 15256
+    },
+    {
+      "epoch": 0.15257,
+      "grad_norm": 0.7250205278396606,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 15257
+    },
+    {
+      "epoch": 0.15258,
+      "grad_norm": 0.7733274102210999,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 15258
+    },
+    {
+      "epoch": 0.15259,
+      "grad_norm": 0.8828164935112,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 15259
+    },
+    {
+      "epoch": 0.1526,
+      "grad_norm": 0.8882545828819275,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 15260
+    },
+    {
+      "epoch": 0.15261,
+      "grad_norm": 0.9990447163581848,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 15261
+    },
+    {
+      "epoch": 0.15262,
+      "grad_norm": 1.3572981357574463,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 15262
+    },
+    {
+      "epoch": 0.15263,
+      "grad_norm": 0.7818669080734253,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 15263
+    },
+    {
+      "epoch": 0.15264,
+      "grad_norm": 0.8175413608551025,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 15264
+    },
+    {
+      "epoch": 0.15265,
+      "grad_norm": 0.9246871471405029,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 15265
+    },
+    {
+      "epoch": 0.15266,
+      "grad_norm": 1.0359551906585693,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 15266
+    },
+    {
+      "epoch": 0.15267,
+      "grad_norm": 1.083351731300354,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 15267
+    },
+    {
+      "epoch": 0.15268,
+      "grad_norm": 0.9957295060157776,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 15268
+    },
+    {
+      "epoch": 0.15269,
+      "grad_norm": 0.999646008014679,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 15269
+    },
+    {
+      "epoch": 0.1527,
+      "grad_norm": 0.8644801378250122,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 15270
+    },
+    {
+      "epoch": 0.15271,
+      "grad_norm": 0.6981543302536011,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 15271
+    },
+    {
+      "epoch": 0.15272,
+      "grad_norm": 0.6990094184875488,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 15272
+    },
+    {
+      "epoch": 0.15273,
+      "grad_norm": 0.9216434955596924,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 15273
+    },
+    {
+      "epoch": 0.15274,
+      "grad_norm": 0.9980815052986145,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 15274
+    },
+    {
+      "epoch": 0.15275,
+      "grad_norm": 1.024815320968628,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 15275
+    },
+    {
+      "epoch": 0.15276,
+      "grad_norm": 0.9341852068901062,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 15276
+    },
+    {
+      "epoch": 0.15277,
+      "grad_norm": 0.952576756477356,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 15277
+    },
+    {
+      "epoch": 0.15278,
+      "grad_norm": 0.8232364654541016,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 15278
+    },
+    {
+      "epoch": 0.15279,
+      "grad_norm": 0.6267227530479431,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 15279
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.706098198890686,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 15280
+    },
+    {
+      "epoch": 0.15281,
+      "grad_norm": 0.7639279365539551,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 15281
+    },
+    {
+      "epoch": 0.15282,
+      "grad_norm": 0.8171020746231079,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 15282
+    },
+    {
+      "epoch": 0.15283,
+      "grad_norm": 0.8467424511909485,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 15283
+    },
+    {
+      "epoch": 0.15284,
+      "grad_norm": 0.8294116258621216,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 15284
+    },
+    {
+      "epoch": 0.15285,
+      "grad_norm": 0.8115954399108887,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 15285
+    },
+    {
+      "epoch": 0.15286,
+      "grad_norm": 0.789169192314148,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 15286
+    },
+    {
+      "epoch": 0.15287,
+      "grad_norm": 0.6780423521995544,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 15287
+    },
+    {
+      "epoch": 0.15288,
+      "grad_norm": 0.5915562510490417,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 15288
+    },
+    {
+      "epoch": 0.15289,
+      "grad_norm": 0.5556778907775879,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 15289
+    },
+    {
+      "epoch": 0.1529,
+      "grad_norm": 0.7075419425964355,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 15290
+    },
+    {
+      "epoch": 0.15291,
+      "grad_norm": 0.82306969165802,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 15291
+    },
+    {
+      "epoch": 0.15292,
+      "grad_norm": 0.8850873112678528,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 15292
+    },
+    {
+      "epoch": 0.15293,
+      "grad_norm": 0.8600744009017944,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 15293
+    },
+    {
+      "epoch": 0.15294,
+      "grad_norm": 0.8272354602813721,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 15294
+    },
+    {
+      "epoch": 0.15295,
+      "grad_norm": 0.9683411717414856,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 15295
+    },
+    {
+      "epoch": 0.15296,
+      "grad_norm": 1.0481081008911133,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 15296
+    },
+    {
+      "epoch": 0.15297,
+      "grad_norm": 0.9039106369018555,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 15297
+    },
+    {
+      "epoch": 0.15298,
+      "grad_norm": 1.220619797706604,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 15298
+    },
+    {
+      "epoch": 0.15299,
+      "grad_norm": 0.8804208636283875,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 15299
+    },
+    {
+      "epoch": 0.153,
+      "grad_norm": 0.6494994759559631,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 15300
+    },
+    {
+      "epoch": 0.15301,
+      "grad_norm": 0.6240890026092529,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 15301
+    },
+    {
+      "epoch": 0.15302,
+      "grad_norm": 0.5636361837387085,
+      "learning_rate": 0.003,
+      "loss": 3.9753,
+      "step": 15302
+    },
+    {
+      "epoch": 0.15303,
+      "grad_norm": 0.6072368025779724,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 15303
+    },
+    {
+      "epoch": 0.15304,
+      "grad_norm": 0.6338242292404175,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 15304
+    },
+    {
+      "epoch": 0.15305,
+      "grad_norm": 0.6941981315612793,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 15305
+    },
+    {
+      "epoch": 0.15306,
+      "grad_norm": 0.6970587968826294,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 15306
+    },
+    {
+      "epoch": 0.15307,
+      "grad_norm": 0.7661889791488647,
+      "learning_rate": 0.003,
+      "loss": 3.9769,
+      "step": 15307
+    },
+    {
+      "epoch": 0.15308,
+      "grad_norm": 0.9541347622871399,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 15308
+    },
+    {
+      "epoch": 0.15309,
+      "grad_norm": 1.085623860359192,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 15309
+    },
+    {
+      "epoch": 0.1531,
+      "grad_norm": 0.8310011029243469,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 15310
+    },
+    {
+      "epoch": 0.15311,
+      "grad_norm": 0.7221589088439941,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 15311
+    },
+    {
+      "epoch": 0.15312,
+      "grad_norm": 0.747313380241394,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 15312
+    },
+    {
+      "epoch": 0.15313,
+      "grad_norm": 0.8484153151512146,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 15313
+    },
+    {
+      "epoch": 0.15314,
+      "grad_norm": 0.8482285141944885,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 15314
+    },
+    {
+      "epoch": 0.15315,
+      "grad_norm": 0.7901672720909119,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 15315
+    },
+    {
+      "epoch": 0.15316,
+      "grad_norm": 0.8714357018470764,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 15316
+    },
+    {
+      "epoch": 0.15317,
+      "grad_norm": 0.8399065136909485,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 15317
+    },
+    {
+      "epoch": 0.15318,
+      "grad_norm": 0.9220929145812988,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 15318
+    },
+    {
+      "epoch": 0.15319,
+      "grad_norm": 0.9695712924003601,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 15319
+    },
+    {
+      "epoch": 0.1532,
+      "grad_norm": 1.101618766784668,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 15320
+    },
+    {
+      "epoch": 0.15321,
+      "grad_norm": 0.9492210149765015,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 15321
+    },
+    {
+      "epoch": 0.15322,
+      "grad_norm": 0.9337678551673889,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 15322
+    },
+    {
+      "epoch": 0.15323,
+      "grad_norm": 1.0097078084945679,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 15323
+    },
+    {
+      "epoch": 0.15324,
+      "grad_norm": 0.9319543242454529,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 15324
+    },
+    {
+      "epoch": 0.15325,
+      "grad_norm": 0.8490461707115173,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 15325
+    },
+    {
+      "epoch": 0.15326,
+      "grad_norm": 0.806574285030365,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 15326
+    },
+    {
+      "epoch": 0.15327,
+      "grad_norm": 0.7265695333480835,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 15327
+    },
+    {
+      "epoch": 0.15328,
+      "grad_norm": 0.7705208659172058,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 15328
+    },
+    {
+      "epoch": 0.15329,
+      "grad_norm": 1.0969406366348267,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 15329
+    },
+    {
+      "epoch": 0.1533,
+      "grad_norm": 1.1445393562316895,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 15330
+    },
+    {
+      "epoch": 0.15331,
+      "grad_norm": 0.8153518438339233,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 15331
+    },
+    {
+      "epoch": 0.15332,
+      "grad_norm": 0.6289441585540771,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 15332
+    },
+    {
+      "epoch": 0.15333,
+      "grad_norm": 0.48444995284080505,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 15333
+    },
+    {
+      "epoch": 0.15334,
+      "grad_norm": 0.5958626866340637,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 15334
+    },
+    {
+      "epoch": 0.15335,
+      "grad_norm": 0.7594508528709412,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 15335
+    },
+    {
+      "epoch": 0.15336,
+      "grad_norm": 0.9328643083572388,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 15336
+    },
+    {
+      "epoch": 0.15337,
+      "grad_norm": 1.0968477725982666,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 15337
+    },
+    {
+      "epoch": 0.15338,
+      "grad_norm": 0.8232316970825195,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 15338
+    },
+    {
+      "epoch": 0.15339,
+      "grad_norm": 0.6940597891807556,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 15339
+    },
+    {
+      "epoch": 0.1534,
+      "grad_norm": 0.6054121255874634,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 15340
+    },
+    {
+      "epoch": 0.15341,
+      "grad_norm": 0.6718443632125854,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 15341
+    },
+    {
+      "epoch": 0.15342,
+      "grad_norm": 0.8152159452438354,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 15342
+    },
+    {
+      "epoch": 0.15343,
+      "grad_norm": 1.1432894468307495,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 15343
+    },
+    {
+      "epoch": 0.15344,
+      "grad_norm": 0.9310418367385864,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 15344
+    },
+    {
+      "epoch": 0.15345,
+      "grad_norm": 0.7084609866142273,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 15345
+    },
+    {
+      "epoch": 0.15346,
+      "grad_norm": 0.6835572123527527,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 15346
+    },
+    {
+      "epoch": 0.15347,
+      "grad_norm": 0.6644852161407471,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 15347
+    },
+    {
+      "epoch": 0.15348,
+      "grad_norm": 0.6577886939048767,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15348
+    },
+    {
+      "epoch": 0.15349,
+      "grad_norm": 0.7408700585365295,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 15349
+    },
+    {
+      "epoch": 0.1535,
+      "grad_norm": 0.8827539682388306,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15350
+    },
+    {
+      "epoch": 0.15351,
+      "grad_norm": 0.8726819157600403,
+      "learning_rate": 0.003,
+      "loss": 3.9713,
+      "step": 15351
+    },
+    {
+      "epoch": 0.15352,
+      "grad_norm": 0.8914332389831543,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 15352
+    },
+    {
+      "epoch": 0.15353,
+      "grad_norm": 0.9129746556282043,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 15353
+    },
+    {
+      "epoch": 0.15354,
+      "grad_norm": 0.9270063638687134,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 15354
+    },
+    {
+      "epoch": 0.15355,
+      "grad_norm": 0.9356761574745178,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 15355
+    },
+    {
+      "epoch": 0.15356,
+      "grad_norm": 0.947665810585022,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 15356
+    },
+    {
+      "epoch": 0.15357,
+      "grad_norm": 0.8571285009384155,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 15357
+    },
+    {
+      "epoch": 0.15358,
+      "grad_norm": 0.8978291749954224,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 15358
+    },
+    {
+      "epoch": 0.15359,
+      "grad_norm": 0.966492772102356,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 15359
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.8793907165527344,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 15360
+    },
+    {
+      "epoch": 0.15361,
+      "grad_norm": 0.8608678579330444,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 15361
+    },
+    {
+      "epoch": 0.15362,
+      "grad_norm": 0.7903480529785156,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 15362
+    },
+    {
+      "epoch": 0.15363,
+      "grad_norm": 0.8036578297615051,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 15363
+    },
+    {
+      "epoch": 0.15364,
+      "grad_norm": 0.8505967855453491,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 15364
+    },
+    {
+      "epoch": 0.15365,
+      "grad_norm": 0.9503588080406189,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 15365
+    },
+    {
+      "epoch": 0.15366,
+      "grad_norm": 0.759649395942688,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 15366
+    },
+    {
+      "epoch": 0.15367,
+      "grad_norm": 0.6503275036811829,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 15367
+    },
+    {
+      "epoch": 0.15368,
+      "grad_norm": 0.6055623888969421,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 15368
+    },
+    {
+      "epoch": 0.15369,
+      "grad_norm": 0.5819411277770996,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 15369
+    },
+    {
+      "epoch": 0.1537,
+      "grad_norm": 0.6257475018501282,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 15370
+    },
+    {
+      "epoch": 0.15371,
+      "grad_norm": 0.6066151261329651,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 15371
+    },
+    {
+      "epoch": 0.15372,
+      "grad_norm": 0.623354434967041,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 15372
+    },
+    {
+      "epoch": 0.15373,
+      "grad_norm": 0.6460627913475037,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 15373
+    },
+    {
+      "epoch": 0.15374,
+      "grad_norm": 0.741217851638794,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 15374
+    },
+    {
+      "epoch": 0.15375,
+      "grad_norm": 0.9240460991859436,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 15375
+    },
+    {
+      "epoch": 0.15376,
+      "grad_norm": 1.118343472480774,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 15376
+    },
+    {
+      "epoch": 0.15377,
+      "grad_norm": 0.9606201648712158,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 15377
+    },
+    {
+      "epoch": 0.15378,
+      "grad_norm": 0.9432666301727295,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 15378
+    },
+    {
+      "epoch": 0.15379,
+      "grad_norm": 0.9836307764053345,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 15379
+    },
+    {
+      "epoch": 0.1538,
+      "grad_norm": 0.9370342493057251,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 15380
+    },
+    {
+      "epoch": 0.15381,
+      "grad_norm": 1.0143237113952637,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 15381
+    },
+    {
+      "epoch": 0.15382,
+      "grad_norm": 1.0634740591049194,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 15382
+    },
+    {
+      "epoch": 0.15383,
+      "grad_norm": 1.0176247358322144,
+      "learning_rate": 0.003,
+      "loss": 4.0499,
+      "step": 15383
+    },
+    {
+      "epoch": 0.15384,
+      "grad_norm": 0.8548615574836731,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 15384
+    },
+    {
+      "epoch": 0.15385,
+      "grad_norm": 0.8597134947776794,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 15385
+    },
+    {
+      "epoch": 0.15386,
+      "grad_norm": 0.9391376376152039,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 15386
+    },
+    {
+      "epoch": 0.15387,
+      "grad_norm": 0.9654890894889832,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 15387
+    },
+    {
+      "epoch": 0.15388,
+      "grad_norm": 0.9590594172477722,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 15388
+    },
+    {
+      "epoch": 0.15389,
+      "grad_norm": 0.8145045638084412,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 15389
+    },
+    {
+      "epoch": 0.1539,
+      "grad_norm": 0.8208436369895935,
+      "learning_rate": 0.003,
+      "loss": 4.0483,
+      "step": 15390
+    },
+    {
+      "epoch": 0.15391,
+      "grad_norm": 0.8954910635948181,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 15391
+    },
+    {
+      "epoch": 0.15392,
+      "grad_norm": 0.9844794869422913,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 15392
+    },
+    {
+      "epoch": 0.15393,
+      "grad_norm": 1.1418054103851318,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 15393
+    },
+    {
+      "epoch": 0.15394,
+      "grad_norm": 0.8355457186698914,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 15394
+    },
+    {
+      "epoch": 0.15395,
+      "grad_norm": 0.7603216171264648,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 15395
+    },
+    {
+      "epoch": 0.15396,
+      "grad_norm": 0.6772087216377258,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 15396
+    },
+    {
+      "epoch": 0.15397,
+      "grad_norm": 0.8091949820518494,
+      "learning_rate": 0.003,
+      "loss": 4.0517,
+      "step": 15397
+    },
+    {
+      "epoch": 0.15398,
+      "grad_norm": 0.8611405491828918,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 15398
+    },
+    {
+      "epoch": 0.15399,
+      "grad_norm": 0.8038859963417053,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 15399
+    },
+    {
+      "epoch": 0.154,
+      "grad_norm": 0.7616530060768127,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 15400
+    },
+    {
+      "epoch": 0.15401,
+      "grad_norm": 0.846915602684021,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 15401
+    },
+    {
+      "epoch": 0.15402,
+      "grad_norm": 0.899276852607727,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 15402
+    },
+    {
+      "epoch": 0.15403,
+      "grad_norm": 1.0511224269866943,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 15403
+    },
+    {
+      "epoch": 0.15404,
+      "grad_norm": 1.0797245502471924,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 15404
+    },
+    {
+      "epoch": 0.15405,
+      "grad_norm": 0.9076430201530457,
+      "learning_rate": 0.003,
+      "loss": 4.0543,
+      "step": 15405
+    },
+    {
+      "epoch": 0.15406,
+      "grad_norm": 0.7749274969100952,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 15406
+    },
+    {
+      "epoch": 0.15407,
+      "grad_norm": 0.6954168677330017,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 15407
+    },
+    {
+      "epoch": 0.15408,
+      "grad_norm": 0.6638758182525635,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 15408
+    },
+    {
+      "epoch": 0.15409,
+      "grad_norm": 0.6848050355911255,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 15409
+    },
+    {
+      "epoch": 0.1541,
+      "grad_norm": 0.7466670870780945,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 15410
+    },
+    {
+      "epoch": 0.15411,
+      "grad_norm": 0.7404317855834961,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 15411
+    },
+    {
+      "epoch": 0.15412,
+      "grad_norm": 0.7153498530387878,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 15412
+    },
+    {
+      "epoch": 0.15413,
+      "grad_norm": 0.7007918357849121,
+      "learning_rate": 0.003,
+      "loss": 4.0408,
+      "step": 15413
+    },
+    {
+      "epoch": 0.15414,
+      "grad_norm": 0.5969216227531433,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 15414
+    },
+    {
+      "epoch": 0.15415,
+      "grad_norm": 0.5652239918708801,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 15415
+    },
+    {
+      "epoch": 0.15416,
+      "grad_norm": 0.5697609782218933,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 15416
+    },
+    {
+      "epoch": 0.15417,
+      "grad_norm": 0.6852931380271912,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 15417
+    },
+    {
+      "epoch": 0.15418,
+      "grad_norm": 0.8129041790962219,
+      "learning_rate": 0.003,
+      "loss": 3.9764,
+      "step": 15418
+    },
+    {
+      "epoch": 0.15419,
+      "grad_norm": 0.8532951474189758,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 15419
+    },
+    {
+      "epoch": 0.1542,
+      "grad_norm": 0.8256349563598633,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 15420
+    },
+    {
+      "epoch": 0.15421,
+      "grad_norm": 0.7314229011535645,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 15421
+    },
+    {
+      "epoch": 0.15422,
+      "grad_norm": 0.7831211090087891,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 15422
+    },
+    {
+      "epoch": 0.15423,
+      "grad_norm": 0.7848354578018188,
+      "learning_rate": 0.003,
+      "loss": 3.9626,
+      "step": 15423
+    },
+    {
+      "epoch": 0.15424,
+      "grad_norm": 0.8499462008476257,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 15424
+    },
+    {
+      "epoch": 0.15425,
+      "grad_norm": 0.8613630533218384,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 15425
+    },
+    {
+      "epoch": 0.15426,
+      "grad_norm": 0.9698076248168945,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 15426
+    },
+    {
+      "epoch": 0.15427,
+      "grad_norm": 1.0350285768508911,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 15427
+    },
+    {
+      "epoch": 0.15428,
+      "grad_norm": 1.0963178873062134,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 15428
+    },
+    {
+      "epoch": 0.15429,
+      "grad_norm": 0.8511303067207336,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 15429
+    },
+    {
+      "epoch": 0.1543,
+      "grad_norm": 0.6375939249992371,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 15430
+    },
+    {
+      "epoch": 0.15431,
+      "grad_norm": 0.6718809604644775,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 15431
+    },
+    {
+      "epoch": 0.15432,
+      "grad_norm": 0.7174335718154907,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15432
+    },
+    {
+      "epoch": 0.15433,
+      "grad_norm": 0.7369046211242676,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 15433
+    },
+    {
+      "epoch": 0.15434,
+      "grad_norm": 0.7658149600028992,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 15434
+    },
+    {
+      "epoch": 0.15435,
+      "grad_norm": 0.9825096130371094,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 15435
+    },
+    {
+      "epoch": 0.15436,
+      "grad_norm": 1.0927093029022217,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 15436
+    },
+    {
+      "epoch": 0.15437,
+      "grad_norm": 0.9758725762367249,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 15437
+    },
+    {
+      "epoch": 0.15438,
+      "grad_norm": 1.1830312013626099,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 15438
+    },
+    {
+      "epoch": 0.15439,
+      "grad_norm": 1.0404443740844727,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 15439
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.884275496006012,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 15440
+    },
+    {
+      "epoch": 0.15441,
+      "grad_norm": 0.7949202060699463,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 15441
+    },
+    {
+      "epoch": 0.15442,
+      "grad_norm": 0.6556805372238159,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 15442
+    },
+    {
+      "epoch": 0.15443,
+      "grad_norm": 0.6673168540000916,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 15443
+    },
+    {
+      "epoch": 0.15444,
+      "grad_norm": 0.6797085404396057,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 15444
+    },
+    {
+      "epoch": 0.15445,
+      "grad_norm": 0.5978509187698364,
+      "learning_rate": 0.003,
+      "loss": 3.9679,
+      "step": 15445
+    },
+    {
+      "epoch": 0.15446,
+      "grad_norm": 0.6056724190711975,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 15446
+    },
+    {
+      "epoch": 0.15447,
+      "grad_norm": 0.6114022731781006,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 15447
+    },
+    {
+      "epoch": 0.15448,
+      "grad_norm": 0.5594829320907593,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 15448
+    },
+    {
+      "epoch": 0.15449,
+      "grad_norm": 0.7436548471450806,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 15449
+    },
+    {
+      "epoch": 0.1545,
+      "grad_norm": 0.8702318072319031,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 15450
+    },
+    {
+      "epoch": 0.15451,
+      "grad_norm": 1.0084431171417236,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 15451
+    },
+    {
+      "epoch": 0.15452,
+      "grad_norm": 1.1599823236465454,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 15452
+    },
+    {
+      "epoch": 0.15453,
+      "grad_norm": 0.8092561364173889,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 15453
+    },
+    {
+      "epoch": 0.15454,
+      "grad_norm": 0.7046549320220947,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 15454
+    },
+    {
+      "epoch": 0.15455,
+      "grad_norm": 0.5919784903526306,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 15455
+    },
+    {
+      "epoch": 0.15456,
+      "grad_norm": 0.617529034614563,
+      "learning_rate": 0.003,
+      "loss": 3.9744,
+      "step": 15456
+    },
+    {
+      "epoch": 0.15457,
+      "grad_norm": 0.6455297470092773,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 15457
+    },
+    {
+      "epoch": 0.15458,
+      "grad_norm": 0.631838858127594,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 15458
+    },
+    {
+      "epoch": 0.15459,
+      "grad_norm": 0.6798779368400574,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 15459
+    },
+    {
+      "epoch": 0.1546,
+      "grad_norm": 0.7333508133888245,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 15460
+    },
+    {
+      "epoch": 0.15461,
+      "grad_norm": 0.8497591614723206,
+      "learning_rate": 0.003,
+      "loss": 3.9778,
+      "step": 15461
+    },
+    {
+      "epoch": 0.15462,
+      "grad_norm": 1.0627999305725098,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 15462
+    },
+    {
+      "epoch": 0.15463,
+      "grad_norm": 1.0860134363174438,
+      "learning_rate": 0.003,
+      "loss": 3.9754,
+      "step": 15463
+    },
+    {
+      "epoch": 0.15464,
+      "grad_norm": 0.8445022106170654,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 15464
+    },
+    {
+      "epoch": 0.15465,
+      "grad_norm": 0.7499777674674988,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 15465
+    },
+    {
+      "epoch": 0.15466,
+      "grad_norm": 0.6469881534576416,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 15466
+    },
+    {
+      "epoch": 0.15467,
+      "grad_norm": 0.6957772374153137,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 15467
+    },
+    {
+      "epoch": 0.15468,
+      "grad_norm": 0.7566899657249451,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 15468
+    },
+    {
+      "epoch": 0.15469,
+      "grad_norm": 0.7406789064407349,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 15469
+    },
+    {
+      "epoch": 0.1547,
+      "grad_norm": 0.6400657296180725,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 15470
+    },
+    {
+      "epoch": 0.15471,
+      "grad_norm": 0.7208961844444275,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 15471
+    },
+    {
+      "epoch": 0.15472,
+      "grad_norm": 0.7523152828216553,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 15472
+    },
+    {
+      "epoch": 0.15473,
+      "grad_norm": 0.7218494415283203,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 15473
+    },
+    {
+      "epoch": 0.15474,
+      "grad_norm": 0.7980036735534668,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 15474
+    },
+    {
+      "epoch": 0.15475,
+      "grad_norm": 0.939830482006073,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 15475
+    },
+    {
+      "epoch": 0.15476,
+      "grad_norm": 1.0078206062316895,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 15476
+    },
+    {
+      "epoch": 0.15477,
+      "grad_norm": 0.9280356168746948,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 15477
+    },
+    {
+      "epoch": 0.15478,
+      "grad_norm": 0.9465794563293457,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 15478
+    },
+    {
+      "epoch": 0.15479,
+      "grad_norm": 0.8881992697715759,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 15479
+    },
+    {
+      "epoch": 0.1548,
+      "grad_norm": 1.0140552520751953,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 15480
+    },
+    {
+      "epoch": 0.15481,
+      "grad_norm": 1.0695966482162476,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 15481
+    },
+    {
+      "epoch": 0.15482,
+      "grad_norm": 0.9526455402374268,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 15482
+    },
+    {
+      "epoch": 0.15483,
+      "grad_norm": 1.1447579860687256,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 15483
+    },
+    {
+      "epoch": 0.15484,
+      "grad_norm": 0.8123161792755127,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 15484
+    },
+    {
+      "epoch": 0.15485,
+      "grad_norm": 0.8708415031433105,
+      "learning_rate": 0.003,
+      "loss": 4.039,
+      "step": 15485
+    },
+    {
+      "epoch": 0.15486,
+      "grad_norm": 0.9243865013122559,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 15486
+    },
+    {
+      "epoch": 0.15487,
+      "grad_norm": 1.0781525373458862,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 15487
+    },
+    {
+      "epoch": 0.15488,
+      "grad_norm": 0.9679509997367859,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 15488
+    },
+    {
+      "epoch": 0.15489,
+      "grad_norm": 0.8731271028518677,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 15489
+    },
+    {
+      "epoch": 0.1549,
+      "grad_norm": 0.7739214897155762,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 15490
+    },
+    {
+      "epoch": 0.15491,
+      "grad_norm": 0.9619302153587341,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 15491
+    },
+    {
+      "epoch": 0.15492,
+      "grad_norm": 0.9622506499290466,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 15492
+    },
+    {
+      "epoch": 0.15493,
+      "grad_norm": 0.9416058659553528,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 15493
+    },
+    {
+      "epoch": 0.15494,
+      "grad_norm": 0.911348283290863,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 15494
+    },
+    {
+      "epoch": 0.15495,
+      "grad_norm": 0.9651471972465515,
+      "learning_rate": 0.003,
+      "loss": 4.0528,
+      "step": 15495
+    },
+    {
+      "epoch": 0.15496,
+      "grad_norm": 0.8875364661216736,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 15496
+    },
+    {
+      "epoch": 0.15497,
+      "grad_norm": 1.0809496641159058,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 15497
+    },
+    {
+      "epoch": 0.15498,
+      "grad_norm": 0.9345546960830688,
+      "learning_rate": 0.003,
+      "loss": 3.9736,
+      "step": 15498
+    },
+    {
+      "epoch": 0.15499,
+      "grad_norm": 0.8077123165130615,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 15499
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.8043227195739746,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 15500
+    },
+    {
+      "epoch": 0.15501,
+      "grad_norm": 0.916532576084137,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 15501
+    },
+    {
+      "epoch": 0.15502,
+      "grad_norm": 1.014280915260315,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 15502
+    },
+    {
+      "epoch": 0.15503,
+      "grad_norm": 1.0342527627944946,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 15503
+    },
+    {
+      "epoch": 0.15504,
+      "grad_norm": 0.8955778479576111,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 15504
+    },
+    {
+      "epoch": 0.15505,
+      "grad_norm": 0.8519617915153503,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 15505
+    },
+    {
+      "epoch": 0.15506,
+      "grad_norm": 0.7919250130653381,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 15506
+    },
+    {
+      "epoch": 0.15507,
+      "grad_norm": 0.7271054983139038,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 15507
+    },
+    {
+      "epoch": 0.15508,
+      "grad_norm": 0.7064192295074463,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 15508
+    },
+    {
+      "epoch": 0.15509,
+      "grad_norm": 0.7795794606208801,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 15509
+    },
+    {
+      "epoch": 0.1551,
+      "grad_norm": 0.8523615002632141,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 15510
+    },
+    {
+      "epoch": 0.15511,
+      "grad_norm": 0.8650283813476562,
+      "learning_rate": 0.003,
+      "loss": 4.0438,
+      "step": 15511
+    },
+    {
+      "epoch": 0.15512,
+      "grad_norm": 0.9879773855209351,
+      "learning_rate": 0.003,
+      "loss": 4.0419,
+      "step": 15512
+    },
+    {
+      "epoch": 0.15513,
+      "grad_norm": 1.2060025930404663,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 15513
+    },
+    {
+      "epoch": 0.15514,
+      "grad_norm": 0.6860678195953369,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 15514
+    },
+    {
+      "epoch": 0.15515,
+      "grad_norm": 0.6115677952766418,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 15515
+    },
+    {
+      "epoch": 0.15516,
+      "grad_norm": 0.5774584412574768,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 15516
+    },
+    {
+      "epoch": 0.15517,
+      "grad_norm": 0.6202374696731567,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 15517
+    },
+    {
+      "epoch": 0.15518,
+      "grad_norm": 0.5926021933555603,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 15518
+    },
+    {
+      "epoch": 0.15519,
+      "grad_norm": 0.6063708662986755,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 15519
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.5697833299636841,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 15520
+    },
+    {
+      "epoch": 0.15521,
+      "grad_norm": 0.6202701330184937,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 15521
+    },
+    {
+      "epoch": 0.15522,
+      "grad_norm": 0.7257983684539795,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 15522
+    },
+    {
+      "epoch": 0.15523,
+      "grad_norm": 0.8819434642791748,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 15523
+    },
+    {
+      "epoch": 0.15524,
+      "grad_norm": 0.9893710017204285,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 15524
+    },
+    {
+      "epoch": 0.15525,
+      "grad_norm": 0.8953197002410889,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 15525
+    },
+    {
+      "epoch": 0.15526,
+      "grad_norm": 0.7468194961547852,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 15526
+    },
+    {
+      "epoch": 0.15527,
+      "grad_norm": 0.7914648652076721,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 15527
+    },
+    {
+      "epoch": 0.15528,
+      "grad_norm": 0.6238877177238464,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 15528
+    },
+    {
+      "epoch": 0.15529,
+      "grad_norm": 0.6196391582489014,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 15529
+    },
+    {
+      "epoch": 0.1553,
+      "grad_norm": 0.726754903793335,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 15530
+    },
+    {
+      "epoch": 0.15531,
+      "grad_norm": 0.7787684202194214,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 15531
+    },
+    {
+      "epoch": 0.15532,
+      "grad_norm": 0.8006848096847534,
+      "learning_rate": 0.003,
+      "loss": 4.0455,
+      "step": 15532
+    },
+    {
+      "epoch": 0.15533,
+      "grad_norm": 0.7540202736854553,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 15533
+    },
+    {
+      "epoch": 0.15534,
+      "grad_norm": 0.836521565914154,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 15534
+    },
+    {
+      "epoch": 0.15535,
+      "grad_norm": 0.9272993803024292,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 15535
+    },
+    {
+      "epoch": 0.15536,
+      "grad_norm": 1.074102759361267,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 15536
+    },
+    {
+      "epoch": 0.15537,
+      "grad_norm": 1.1279255151748657,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 15537
+    },
+    {
+      "epoch": 0.15538,
+      "grad_norm": 1.0611523389816284,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 15538
+    },
+    {
+      "epoch": 0.15539,
+      "grad_norm": 0.7072646617889404,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 15539
+    },
+    {
+      "epoch": 0.1554,
+      "grad_norm": 0.7035132050514221,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 15540
+    },
+    {
+      "epoch": 0.15541,
+      "grad_norm": 0.8227235078811646,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 15541
+    },
+    {
+      "epoch": 0.15542,
+      "grad_norm": 0.9182822108268738,
+      "learning_rate": 0.003,
+      "loss": 3.9904,
+      "step": 15542
+    },
+    {
+      "epoch": 0.15543,
+      "grad_norm": 1.0358269214630127,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 15543
+    },
+    {
+      "epoch": 0.15544,
+      "grad_norm": 0.8956734538078308,
+      "learning_rate": 0.003,
+      "loss": 4.0578,
+      "step": 15544
+    },
+    {
+      "epoch": 0.15545,
+      "grad_norm": 0.7539672255516052,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 15545
+    },
+    {
+      "epoch": 0.15546,
+      "grad_norm": 0.7827208638191223,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 15546
+    },
+    {
+      "epoch": 0.15547,
+      "grad_norm": 0.7323380708694458,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 15547
+    },
+    {
+      "epoch": 0.15548,
+      "grad_norm": 0.6001154184341431,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 15548
+    },
+    {
+      "epoch": 0.15549,
+      "grad_norm": 0.6410365700721741,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 15549
+    },
+    {
+      "epoch": 0.1555,
+      "grad_norm": 0.633104145526886,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 15550
+    },
+    {
+      "epoch": 0.15551,
+      "grad_norm": 0.6054565906524658,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 15551
+    },
+    {
+      "epoch": 0.15552,
+      "grad_norm": 0.6131342649459839,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 15552
+    },
+    {
+      "epoch": 0.15553,
+      "grad_norm": 0.6478719711303711,
+      "learning_rate": 0.003,
+      "loss": 3.9674,
+      "step": 15553
+    },
+    {
+      "epoch": 0.15554,
+      "grad_norm": 0.6694060564041138,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 15554
+    },
+    {
+      "epoch": 0.15555,
+      "grad_norm": 0.6657654047012329,
+      "learning_rate": 0.003,
+      "loss": 3.9674,
+      "step": 15555
+    },
+    {
+      "epoch": 0.15556,
+      "grad_norm": 0.7979192137718201,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 15556
+    },
+    {
+      "epoch": 0.15557,
+      "grad_norm": 1.0289673805236816,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 15557
+    },
+    {
+      "epoch": 0.15558,
+      "grad_norm": 1.0689125061035156,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 15558
+    },
+    {
+      "epoch": 0.15559,
+      "grad_norm": 0.9275684356689453,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 15559
+    },
+    {
+      "epoch": 0.1556,
+      "grad_norm": 0.934004008769989,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 15560
+    },
+    {
+      "epoch": 0.15561,
+      "grad_norm": 0.8707156777381897,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 15561
+    },
+    {
+      "epoch": 0.15562,
+      "grad_norm": 0.8118012547492981,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 15562
+    },
+    {
+      "epoch": 0.15563,
+      "grad_norm": 1.2029263973236084,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 15563
+    },
+    {
+      "epoch": 0.15564,
+      "grad_norm": 1.1070818901062012,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 15564
+    },
+    {
+      "epoch": 0.15565,
+      "grad_norm": 0.6871122717857361,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 15565
+    },
+    {
+      "epoch": 0.15566,
+      "grad_norm": 0.6614555716514587,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 15566
+    },
+    {
+      "epoch": 0.15567,
+      "grad_norm": 0.7378991842269897,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 15567
+    },
+    {
+      "epoch": 0.15568,
+      "grad_norm": 0.7864341139793396,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 15568
+    },
+    {
+      "epoch": 0.15569,
+      "grad_norm": 0.8094674348831177,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 15569
+    },
+    {
+      "epoch": 0.1557,
+      "grad_norm": 0.7100827097892761,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 15570
+    },
+    {
+      "epoch": 0.15571,
+      "grad_norm": 0.7554362416267395,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 15571
+    },
+    {
+      "epoch": 0.15572,
+      "grad_norm": 0.9113379120826721,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 15572
+    },
+    {
+      "epoch": 0.15573,
+      "grad_norm": 1.0028765201568604,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 15573
+    },
+    {
+      "epoch": 0.15574,
+      "grad_norm": 1.0085824728012085,
+      "learning_rate": 0.003,
+      "loss": 4.0416,
+      "step": 15574
+    },
+    {
+      "epoch": 0.15575,
+      "grad_norm": 1.139907956123352,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 15575
+    },
+    {
+      "epoch": 0.15576,
+      "grad_norm": 0.9667785167694092,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 15576
+    },
+    {
+      "epoch": 0.15577,
+      "grad_norm": 1.0565452575683594,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 15577
+    },
+    {
+      "epoch": 0.15578,
+      "grad_norm": 0.8792425394058228,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 15578
+    },
+    {
+      "epoch": 0.15579,
+      "grad_norm": 0.895402729511261,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 15579
+    },
+    {
+      "epoch": 0.1558,
+      "grad_norm": 0.8785274028778076,
+      "learning_rate": 0.003,
+      "loss": 4.0488,
+      "step": 15580
+    },
+    {
+      "epoch": 0.15581,
+      "grad_norm": 0.9280442595481873,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15581
+    },
+    {
+      "epoch": 0.15582,
+      "grad_norm": 1.178812026977539,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 15582
+    },
+    {
+      "epoch": 0.15583,
+      "grad_norm": 0.9174395799636841,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 15583
+    },
+    {
+      "epoch": 0.15584,
+      "grad_norm": 0.9447359442710876,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 15584
+    },
+    {
+      "epoch": 0.15585,
+      "grad_norm": 1.1034443378448486,
+      "learning_rate": 0.003,
+      "loss": 4.0541,
+      "step": 15585
+    },
+    {
+      "epoch": 0.15586,
+      "grad_norm": 1.0161769390106201,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 15586
+    },
+    {
+      "epoch": 0.15587,
+      "grad_norm": 0.8678581714630127,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 15587
+    },
+    {
+      "epoch": 0.15588,
+      "grad_norm": 0.6879012584686279,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 15588
+    },
+    {
+      "epoch": 0.15589,
+      "grad_norm": 0.6353170871734619,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 15589
+    },
+    {
+      "epoch": 0.1559,
+      "grad_norm": 0.5770264863967896,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 15590
+    },
+    {
+      "epoch": 0.15591,
+      "grad_norm": 0.5902904272079468,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 15591
+    },
+    {
+      "epoch": 0.15592,
+      "grad_norm": 0.5827825665473938,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 15592
+    },
+    {
+      "epoch": 0.15593,
+      "grad_norm": 0.5940733551979065,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 15593
+    },
+    {
+      "epoch": 0.15594,
+      "grad_norm": 0.6823462247848511,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 15594
+    },
+    {
+      "epoch": 0.15595,
+      "grad_norm": 0.6451441645622253,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 15595
+    },
+    {
+      "epoch": 0.15596,
+      "grad_norm": 0.5992090106010437,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 15596
+    },
+    {
+      "epoch": 0.15597,
+      "grad_norm": 0.6235384345054626,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 15597
+    },
+    {
+      "epoch": 0.15598,
+      "grad_norm": 0.6631515622138977,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 15598
+    },
+    {
+      "epoch": 0.15599,
+      "grad_norm": 0.9059483408927917,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 15599
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 1.39214026927948,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 15600
+    },
+    {
+      "epoch": 0.15601,
+      "grad_norm": 0.6501911282539368,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 15601
+    },
+    {
+      "epoch": 0.15602,
+      "grad_norm": 0.6093357801437378,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 15602
+    },
+    {
+      "epoch": 0.15603,
+      "grad_norm": 0.6285752058029175,
+      "learning_rate": 0.003,
+      "loss": 3.9789,
+      "step": 15603
+    },
+    {
+      "epoch": 0.15604,
+      "grad_norm": 0.7170791029930115,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 15604
+    },
+    {
+      "epoch": 0.15605,
+      "grad_norm": 0.9150353670120239,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 15605
+    },
+    {
+      "epoch": 0.15606,
+      "grad_norm": 1.0283564329147339,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 15606
+    },
+    {
+      "epoch": 0.15607,
+      "grad_norm": 0.8459843993186951,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 15607
+    },
+    {
+      "epoch": 0.15608,
+      "grad_norm": 0.6638538241386414,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 15608
+    },
+    {
+      "epoch": 0.15609,
+      "grad_norm": 0.6532926559448242,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 15609
+    },
+    {
+      "epoch": 0.1561,
+      "grad_norm": 0.6578401327133179,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 15610
+    },
+    {
+      "epoch": 0.15611,
+      "grad_norm": 0.6223822236061096,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 15611
+    },
+    {
+      "epoch": 0.15612,
+      "grad_norm": 0.6221069693565369,
+      "learning_rate": 0.003,
+      "loss": 3.9798,
+      "step": 15612
+    },
+    {
+      "epoch": 0.15613,
+      "grad_norm": 0.7909890413284302,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 15613
+    },
+    {
+      "epoch": 0.15614,
+      "grad_norm": 0.8776581287384033,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 15614
+    },
+    {
+      "epoch": 0.15615,
+      "grad_norm": 1.013645052909851,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 15615
+    },
+    {
+      "epoch": 0.15616,
+      "grad_norm": 1.1218299865722656,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 15616
+    },
+    {
+      "epoch": 0.15617,
+      "grad_norm": 0.8555946350097656,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 15617
+    },
+    {
+      "epoch": 0.15618,
+      "grad_norm": 0.6954250931739807,
+      "learning_rate": 0.003,
+      "loss": 3.9787,
+      "step": 15618
+    },
+    {
+      "epoch": 0.15619,
+      "grad_norm": 0.6700589060783386,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 15619
+    },
+    {
+      "epoch": 0.1562,
+      "grad_norm": 0.6501047015190125,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 15620
+    },
+    {
+      "epoch": 0.15621,
+      "grad_norm": 0.6431427001953125,
+      "learning_rate": 0.003,
+      "loss": 3.9813,
+      "step": 15621
+    },
+    {
+      "epoch": 0.15622,
+      "grad_norm": 0.7406570315361023,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 15622
+    },
+    {
+      "epoch": 0.15623,
+      "grad_norm": 0.8946971893310547,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 15623
+    },
+    {
+      "epoch": 0.15624,
+      "grad_norm": 0.8742425441741943,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 15624
+    },
+    {
+      "epoch": 0.15625,
+      "grad_norm": 0.815301239490509,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 15625
+    },
+    {
+      "epoch": 0.15626,
+      "grad_norm": 0.7755097150802612,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 15626
+    },
+    {
+      "epoch": 0.15627,
+      "grad_norm": 0.711911141872406,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 15627
+    },
+    {
+      "epoch": 0.15628,
+      "grad_norm": 0.7888513207435608,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 15628
+    },
+    {
+      "epoch": 0.15629,
+      "grad_norm": 0.8034841418266296,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 15629
+    },
+    {
+      "epoch": 0.1563,
+      "grad_norm": 0.8269197344779968,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 15630
+    },
+    {
+      "epoch": 0.15631,
+      "grad_norm": 0.8813633918762207,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 15631
+    },
+    {
+      "epoch": 0.15632,
+      "grad_norm": 0.9256646633148193,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 15632
+    },
+    {
+      "epoch": 0.15633,
+      "grad_norm": 1.0113441944122314,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 15633
+    },
+    {
+      "epoch": 0.15634,
+      "grad_norm": 1.004540205001831,
+      "learning_rate": 0.003,
+      "loss": 4.0388,
+      "step": 15634
+    },
+    {
+      "epoch": 0.15635,
+      "grad_norm": 1.0412640571594238,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 15635
+    },
+    {
+      "epoch": 0.15636,
+      "grad_norm": 0.9312543272972107,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 15636
+    },
+    {
+      "epoch": 0.15637,
+      "grad_norm": 0.9052963256835938,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 15637
+    },
+    {
+      "epoch": 0.15638,
+      "grad_norm": 0.8330254554748535,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 15638
+    },
+    {
+      "epoch": 0.15639,
+      "grad_norm": 1.111505389213562,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 15639
+    },
+    {
+      "epoch": 0.1564,
+      "grad_norm": 0.963144063949585,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 15640
+    },
+    {
+      "epoch": 0.15641,
+      "grad_norm": 1.0859062671661377,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 15641
+    },
+    {
+      "epoch": 0.15642,
+      "grad_norm": 1.0575289726257324,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 15642
+    },
+    {
+      "epoch": 0.15643,
+      "grad_norm": 0.8799792528152466,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 15643
+    },
+    {
+      "epoch": 0.15644,
+      "grad_norm": 0.7085080146789551,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 15644
+    },
+    {
+      "epoch": 0.15645,
+      "grad_norm": 0.6737964749336243,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 15645
+    },
+    {
+      "epoch": 0.15646,
+      "grad_norm": 0.707210898399353,
+      "learning_rate": 0.003,
+      "loss": 3.9785,
+      "step": 15646
+    },
+    {
+      "epoch": 0.15647,
+      "grad_norm": 0.8392312526702881,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 15647
+    },
+    {
+      "epoch": 0.15648,
+      "grad_norm": 0.8704290986061096,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 15648
+    },
+    {
+      "epoch": 0.15649,
+      "grad_norm": 0.8335064053535461,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15649
+    },
+    {
+      "epoch": 0.1565,
+      "grad_norm": 0.8319531083106995,
+      "learning_rate": 0.003,
+      "loss": 3.966,
+      "step": 15650
+    },
+    {
+      "epoch": 0.15651,
+      "grad_norm": 0.8499333262443542,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 15651
+    },
+    {
+      "epoch": 0.15652,
+      "grad_norm": 0.9454139471054077,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 15652
+    },
+    {
+      "epoch": 0.15653,
+      "grad_norm": 1.1151617765426636,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 15653
+    },
+    {
+      "epoch": 0.15654,
+      "grad_norm": 0.956888735294342,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 15654
+    },
+    {
+      "epoch": 0.15655,
+      "grad_norm": 0.9633231163024902,
+      "learning_rate": 0.003,
+      "loss": 4.0589,
+      "step": 15655
+    },
+    {
+      "epoch": 0.15656,
+      "grad_norm": 0.875674843788147,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 15656
+    },
+    {
+      "epoch": 0.15657,
+      "grad_norm": 0.83741694688797,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 15657
+    },
+    {
+      "epoch": 0.15658,
+      "grad_norm": 0.7873344421386719,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 15658
+    },
+    {
+      "epoch": 0.15659,
+      "grad_norm": 0.8622631430625916,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 15659
+    },
+    {
+      "epoch": 0.1566,
+      "grad_norm": 0.9279429316520691,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 15660
+    },
+    {
+      "epoch": 0.15661,
+      "grad_norm": 1.0509636402130127,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 15661
+    },
+    {
+      "epoch": 0.15662,
+      "grad_norm": 1.0412518978118896,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 15662
+    },
+    {
+      "epoch": 0.15663,
+      "grad_norm": 0.8322525024414062,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 15663
+    },
+    {
+      "epoch": 0.15664,
+      "grad_norm": 0.6247098445892334,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 15664
+    },
+    {
+      "epoch": 0.15665,
+      "grad_norm": 0.6534708738327026,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 15665
+    },
+    {
+      "epoch": 0.15666,
+      "grad_norm": 0.75336754322052,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 15666
+    },
+    {
+      "epoch": 0.15667,
+      "grad_norm": 0.8604514598846436,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 15667
+    },
+    {
+      "epoch": 0.15668,
+      "grad_norm": 0.8654664158821106,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 15668
+    },
+    {
+      "epoch": 0.15669,
+      "grad_norm": 0.9390060305595398,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 15669
+    },
+    {
+      "epoch": 0.1567,
+      "grad_norm": 1.0578045845031738,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 15670
+    },
+    {
+      "epoch": 0.15671,
+      "grad_norm": 0.9121909737586975,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 15671
+    },
+    {
+      "epoch": 0.15672,
+      "grad_norm": 0.7117065191268921,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 15672
+    },
+    {
+      "epoch": 0.15673,
+      "grad_norm": 0.70128333568573,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 15673
+    },
+    {
+      "epoch": 0.15674,
+      "grad_norm": 0.6423056721687317,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 15674
+    },
+    {
+      "epoch": 0.15675,
+      "grad_norm": 0.6011069416999817,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 15675
+    },
+    {
+      "epoch": 0.15676,
+      "grad_norm": 0.5848559141159058,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 15676
+    },
+    {
+      "epoch": 0.15677,
+      "grad_norm": 0.5934710502624512,
+      "learning_rate": 0.003,
+      "loss": 3.9768,
+      "step": 15677
+    },
+    {
+      "epoch": 0.15678,
+      "grad_norm": 0.5638435482978821,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 15678
+    },
+    {
+      "epoch": 0.15679,
+      "grad_norm": 0.6302936673164368,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 15679
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.6576014161109924,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 15680
+    },
+    {
+      "epoch": 0.15681,
+      "grad_norm": 0.732036292552948,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 15681
+    },
+    {
+      "epoch": 0.15682,
+      "grad_norm": 0.6981785297393799,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 15682
+    },
+    {
+      "epoch": 0.15683,
+      "grad_norm": 0.7637856006622314,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 15683
+    },
+    {
+      "epoch": 0.15684,
+      "grad_norm": 0.8975669145584106,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 15684
+    },
+    {
+      "epoch": 0.15685,
+      "grad_norm": 0.9504064917564392,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 15685
+    },
+    {
+      "epoch": 0.15686,
+      "grad_norm": 0.9674578309059143,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 15686
+    },
+    {
+      "epoch": 0.15687,
+      "grad_norm": 1.233467936515808,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 15687
+    },
+    {
+      "epoch": 0.15688,
+      "grad_norm": 1.1004282236099243,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 15688
+    },
+    {
+      "epoch": 0.15689,
+      "grad_norm": 0.9505053162574768,
+      "learning_rate": 0.003,
+      "loss": 4.0417,
+      "step": 15689
+    },
+    {
+      "epoch": 0.1569,
+      "grad_norm": 1.0994209051132202,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 15690
+    },
+    {
+      "epoch": 0.15691,
+      "grad_norm": 1.0676054954528809,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 15691
+    },
+    {
+      "epoch": 0.15692,
+      "grad_norm": 0.883074939250946,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 15692
+    },
+    {
+      "epoch": 0.15693,
+      "grad_norm": 1.0565145015716553,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 15693
+    },
+    {
+      "epoch": 0.15694,
+      "grad_norm": 1.1039477586746216,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 15694
+    },
+    {
+      "epoch": 0.15695,
+      "grad_norm": 0.8456154465675354,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 15695
+    },
+    {
+      "epoch": 0.15696,
+      "grad_norm": 0.7581636905670166,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 15696
+    },
+    {
+      "epoch": 0.15697,
+      "grad_norm": 0.6965075135231018,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 15697
+    },
+    {
+      "epoch": 0.15698,
+      "grad_norm": 0.7575919032096863,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 15698
+    },
+    {
+      "epoch": 0.15699,
+      "grad_norm": 0.8254173398017883,
+      "learning_rate": 0.003,
+      "loss": 4.0515,
+      "step": 15699
+    },
+    {
+      "epoch": 0.157,
+      "grad_norm": 0.8927112221717834,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 15700
+    },
+    {
+      "epoch": 0.15701,
+      "grad_norm": 0.8910102248191833,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 15701
+    },
+    {
+      "epoch": 0.15702,
+      "grad_norm": 0.8605186343193054,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 15702
+    },
+    {
+      "epoch": 0.15703,
+      "grad_norm": 0.8565143346786499,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 15703
+    },
+    {
+      "epoch": 0.15704,
+      "grad_norm": 0.8403797149658203,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 15704
+    },
+    {
+      "epoch": 0.15705,
+      "grad_norm": 0.8270075917243958,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 15705
+    },
+    {
+      "epoch": 0.15706,
+      "grad_norm": 0.7761716246604919,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 15706
+    },
+    {
+      "epoch": 0.15707,
+      "grad_norm": 0.7189915776252747,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 15707
+    },
+    {
+      "epoch": 0.15708,
+      "grad_norm": 0.7878168225288391,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 15708
+    },
+    {
+      "epoch": 0.15709,
+      "grad_norm": 0.8748422861099243,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 15709
+    },
+    {
+      "epoch": 0.1571,
+      "grad_norm": 1.0179502964019775,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 15710
+    },
+    {
+      "epoch": 0.15711,
+      "grad_norm": 1.0124653577804565,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 15711
+    },
+    {
+      "epoch": 0.15712,
+      "grad_norm": 0.7594412565231323,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 15712
+    },
+    {
+      "epoch": 0.15713,
+      "grad_norm": 0.5174108743667603,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 15713
+    },
+    {
+      "epoch": 0.15714,
+      "grad_norm": 0.6995696425437927,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 15714
+    },
+    {
+      "epoch": 0.15715,
+      "grad_norm": 0.7402389645576477,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 15715
+    },
+    {
+      "epoch": 0.15716,
+      "grad_norm": 0.6621226072311401,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 15716
+    },
+    {
+      "epoch": 0.15717,
+      "grad_norm": 0.6631869673728943,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 15717
+    },
+    {
+      "epoch": 0.15718,
+      "grad_norm": 0.8022869825363159,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 15718
+    },
+    {
+      "epoch": 0.15719,
+      "grad_norm": 0.8688627481460571,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 15719
+    },
+    {
+      "epoch": 0.1572,
+      "grad_norm": 0.8488649725914001,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 15720
+    },
+    {
+      "epoch": 0.15721,
+      "grad_norm": 0.7200070023536682,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 15721
+    },
+    {
+      "epoch": 0.15722,
+      "grad_norm": 0.6765528321266174,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 15722
+    },
+    {
+      "epoch": 0.15723,
+      "grad_norm": 0.7749486565589905,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 15723
+    },
+    {
+      "epoch": 0.15724,
+      "grad_norm": 0.9206483364105225,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 15724
+    },
+    {
+      "epoch": 0.15725,
+      "grad_norm": 0.7665677070617676,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 15725
+    },
+    {
+      "epoch": 0.15726,
+      "grad_norm": 0.6814460754394531,
+      "learning_rate": 0.003,
+      "loss": 3.9814,
+      "step": 15726
+    },
+    {
+      "epoch": 0.15727,
+      "grad_norm": 0.6091095805168152,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 15727
+    },
+    {
+      "epoch": 0.15728,
+      "grad_norm": 0.6103879809379578,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 15728
+    },
+    {
+      "epoch": 0.15729,
+      "grad_norm": 0.6739366054534912,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 15729
+    },
+    {
+      "epoch": 0.1573,
+      "grad_norm": 0.738776445388794,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 15730
+    },
+    {
+      "epoch": 0.15731,
+      "grad_norm": 0.7757909893989563,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 15731
+    },
+    {
+      "epoch": 0.15732,
+      "grad_norm": 0.998759388923645,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 15732
+    },
+    {
+      "epoch": 0.15733,
+      "grad_norm": 1.298292875289917,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 15733
+    },
+    {
+      "epoch": 0.15734,
+      "grad_norm": 0.7509281039237976,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 15734
+    },
+    {
+      "epoch": 0.15735,
+      "grad_norm": 0.7110625505447388,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 15735
+    },
+    {
+      "epoch": 0.15736,
+      "grad_norm": 0.7740097641944885,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 15736
+    },
+    {
+      "epoch": 0.15737,
+      "grad_norm": 0.7516487836837769,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 15737
+    },
+    {
+      "epoch": 0.15738,
+      "grad_norm": 0.6357292532920837,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 15738
+    },
+    {
+      "epoch": 0.15739,
+      "grad_norm": 0.5393524765968323,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 15739
+    },
+    {
+      "epoch": 0.1574,
+      "grad_norm": 0.6245474219322205,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 15740
+    },
+    {
+      "epoch": 0.15741,
+      "grad_norm": 0.9935439825057983,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 15741
+    },
+    {
+      "epoch": 0.15742,
+      "grad_norm": 1.4136745929718018,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 15742
+    },
+    {
+      "epoch": 0.15743,
+      "grad_norm": 0.5433087944984436,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 15743
+    },
+    {
+      "epoch": 0.15744,
+      "grad_norm": 0.6950448751449585,
+      "learning_rate": 0.003,
+      "loss": 3.9823,
+      "step": 15744
+    },
+    {
+      "epoch": 0.15745,
+      "grad_norm": 0.832676112651825,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 15745
+    },
+    {
+      "epoch": 0.15746,
+      "grad_norm": 0.8238332271575928,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 15746
+    },
+    {
+      "epoch": 0.15747,
+      "grad_norm": 0.8855404257774353,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 15747
+    },
+    {
+      "epoch": 0.15748,
+      "grad_norm": 0.8645646572113037,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 15748
+    },
+    {
+      "epoch": 0.15749,
+      "grad_norm": 0.9085519909858704,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 15749
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 1.0006437301635742,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 15750
+    },
+    {
+      "epoch": 0.15751,
+      "grad_norm": 1.1213877201080322,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 15751
+    },
+    {
+      "epoch": 0.15752,
+      "grad_norm": 1.1313105821609497,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 15752
+    },
+    {
+      "epoch": 0.15753,
+      "grad_norm": 0.9690887928009033,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 15753
+    },
+    {
+      "epoch": 0.15754,
+      "grad_norm": 0.9049549102783203,
+      "learning_rate": 0.003,
+      "loss": 4.0323,
+      "step": 15754
+    },
+    {
+      "epoch": 0.15755,
+      "grad_norm": 0.8756823539733887,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 15755
+    },
+    {
+      "epoch": 0.15756,
+      "grad_norm": 0.7845616936683655,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 15756
+    },
+    {
+      "epoch": 0.15757,
+      "grad_norm": 0.7886598110198975,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 15757
+    },
+    {
+      "epoch": 0.15758,
+      "grad_norm": 0.8714204430580139,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 15758
+    },
+    {
+      "epoch": 0.15759,
+      "grad_norm": 1.0030721426010132,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 15759
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 1.263763189315796,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 15760
+    },
+    {
+      "epoch": 0.15761,
+      "grad_norm": 0.9564282894134521,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 15761
+    },
+    {
+      "epoch": 0.15762,
+      "grad_norm": 0.9213220477104187,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 15762
+    },
+    {
+      "epoch": 0.15763,
+      "grad_norm": 0.9312294721603394,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 15763
+    },
+    {
+      "epoch": 0.15764,
+      "grad_norm": 0.7343323826789856,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 15764
+    },
+    {
+      "epoch": 0.15765,
+      "grad_norm": 0.7667588591575623,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 15765
+    },
+    {
+      "epoch": 0.15766,
+      "grad_norm": 0.7190549373626709,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 15766
+    },
+    {
+      "epoch": 0.15767,
+      "grad_norm": 0.7208852767944336,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 15767
+    },
+    {
+      "epoch": 0.15768,
+      "grad_norm": 0.7332645058631897,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 15768
+    },
+    {
+      "epoch": 0.15769,
+      "grad_norm": 0.825160801410675,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 15769
+    },
+    {
+      "epoch": 0.1577,
+      "grad_norm": 0.7511894702911377,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 15770
+    },
+    {
+      "epoch": 0.15771,
+      "grad_norm": 0.8381373882293701,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 15771
+    },
+    {
+      "epoch": 0.15772,
+      "grad_norm": 1.0264532566070557,
+      "learning_rate": 0.003,
+      "loss": 4.0485,
+      "step": 15772
+    },
+    {
+      "epoch": 0.15773,
+      "grad_norm": 0.9846251010894775,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 15773
+    },
+    {
+      "epoch": 0.15774,
+      "grad_norm": 0.8334699869155884,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 15774
+    },
+    {
+      "epoch": 0.15775,
+      "grad_norm": 0.6779347062110901,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 15775
+    },
+    {
+      "epoch": 0.15776,
+      "grad_norm": 0.6254589557647705,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 15776
+    },
+    {
+      "epoch": 0.15777,
+      "grad_norm": 0.679844856262207,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 15777
+    },
+    {
+      "epoch": 0.15778,
+      "grad_norm": 0.7129006385803223,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 15778
+    },
+    {
+      "epoch": 0.15779,
+      "grad_norm": 0.7573176026344299,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 15779
+    },
+    {
+      "epoch": 0.1578,
+      "grad_norm": 0.7286023497581482,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 15780
+    },
+    {
+      "epoch": 0.15781,
+      "grad_norm": 0.7774902582168579,
+      "learning_rate": 0.003,
+      "loss": 3.9493,
+      "step": 15781
+    },
+    {
+      "epoch": 0.15782,
+      "grad_norm": 0.8229116201400757,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 15782
+    },
+    {
+      "epoch": 0.15783,
+      "grad_norm": 1.1103692054748535,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 15783
+    },
+    {
+      "epoch": 0.15784,
+      "grad_norm": 0.8943964242935181,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 15784
+    },
+    {
+      "epoch": 0.15785,
+      "grad_norm": 0.8067341446876526,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 15785
+    },
+    {
+      "epoch": 0.15786,
+      "grad_norm": 0.7550935745239258,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 15786
+    },
+    {
+      "epoch": 0.15787,
+      "grad_norm": 0.6463281512260437,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 15787
+    },
+    {
+      "epoch": 0.15788,
+      "grad_norm": 0.6048409938812256,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 15788
+    },
+    {
+      "epoch": 0.15789,
+      "grad_norm": 0.586916983127594,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 15789
+    },
+    {
+      "epoch": 0.1579,
+      "grad_norm": 0.606907308101654,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 15790
+    },
+    {
+      "epoch": 0.15791,
+      "grad_norm": 0.7055268883705139,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 15791
+    },
+    {
+      "epoch": 0.15792,
+      "grad_norm": 0.8679223656654358,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 15792
+    },
+    {
+      "epoch": 0.15793,
+      "grad_norm": 1.0493624210357666,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 15793
+    },
+    {
+      "epoch": 0.15794,
+      "grad_norm": 1.0888004302978516,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 15794
+    },
+    {
+      "epoch": 0.15795,
+      "grad_norm": 0.9175351858139038,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 15795
+    },
+    {
+      "epoch": 0.15796,
+      "grad_norm": 0.8576933145523071,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 15796
+    },
+    {
+      "epoch": 0.15797,
+      "grad_norm": 0.7691969871520996,
+      "learning_rate": 0.003,
+      "loss": 3.9746,
+      "step": 15797
+    },
+    {
+      "epoch": 0.15798,
+      "grad_norm": 0.7251110672950745,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 15798
+    },
+    {
+      "epoch": 0.15799,
+      "grad_norm": 0.7915005683898926,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 15799
+    },
+    {
+      "epoch": 0.158,
+      "grad_norm": 0.9329673051834106,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 15800
+    },
+    {
+      "epoch": 0.15801,
+      "grad_norm": 0.9955234527587891,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 15801
+    },
+    {
+      "epoch": 0.15802,
+      "grad_norm": 1.0439205169677734,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 15802
+    },
+    {
+      "epoch": 0.15803,
+      "grad_norm": 0.9350157380104065,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 15803
+    },
+    {
+      "epoch": 0.15804,
+      "grad_norm": 0.9399392604827881,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 15804
+    },
+    {
+      "epoch": 0.15805,
+      "grad_norm": 1.160086750984192,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 15805
+    },
+    {
+      "epoch": 0.15806,
+      "grad_norm": 1.1166666746139526,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 15806
+    },
+    {
+      "epoch": 0.15807,
+      "grad_norm": 0.7115695476531982,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 15807
+    },
+    {
+      "epoch": 0.15808,
+      "grad_norm": 0.5912373065948486,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 15808
+    },
+    {
+      "epoch": 0.15809,
+      "grad_norm": 0.6347526907920837,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 15809
+    },
+    {
+      "epoch": 0.1581,
+      "grad_norm": 0.7153865694999695,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 15810
+    },
+    {
+      "epoch": 0.15811,
+      "grad_norm": 0.8052754998207092,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 15811
+    },
+    {
+      "epoch": 0.15812,
+      "grad_norm": 0.8784002661705017,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 15812
+    },
+    {
+      "epoch": 0.15813,
+      "grad_norm": 0.9777888059616089,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 15813
+    },
+    {
+      "epoch": 0.15814,
+      "grad_norm": 1.0682368278503418,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 15814
+    },
+    {
+      "epoch": 0.15815,
+      "grad_norm": 1.0067962408065796,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 15815
+    },
+    {
+      "epoch": 0.15816,
+      "grad_norm": 0.9310998916625977,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 15816
+    },
+    {
+      "epoch": 0.15817,
+      "grad_norm": 0.8633456826210022,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 15817
+    },
+    {
+      "epoch": 0.15818,
+      "grad_norm": 0.8136230111122131,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 15818
+    },
+    {
+      "epoch": 0.15819,
+      "grad_norm": 0.6792727708816528,
+      "learning_rate": 0.003,
+      "loss": 3.97,
+      "step": 15819
+    },
+    {
+      "epoch": 0.1582,
+      "grad_norm": 0.6035311818122864,
+      "learning_rate": 0.003,
+      "loss": 4.0444,
+      "step": 15820
+    },
+    {
+      "epoch": 0.15821,
+      "grad_norm": 0.6503746509552002,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 15821
+    },
+    {
+      "epoch": 0.15822,
+      "grad_norm": 0.6729138493537903,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 15822
+    },
+    {
+      "epoch": 0.15823,
+      "grad_norm": 0.7227945923805237,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 15823
+    },
+    {
+      "epoch": 0.15824,
+      "grad_norm": 0.846311628818512,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 15824
+    },
+    {
+      "epoch": 0.15825,
+      "grad_norm": 1.0390722751617432,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 15825
+    },
+    {
+      "epoch": 0.15826,
+      "grad_norm": 1.2302485704421997,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 15826
+    },
+    {
+      "epoch": 0.15827,
+      "grad_norm": 0.8671656847000122,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 15827
+    },
+    {
+      "epoch": 0.15828,
+      "grad_norm": 0.9332629442214966,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 15828
+    },
+    {
+      "epoch": 0.15829,
+      "grad_norm": 0.970983624458313,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 15829
+    },
+    {
+      "epoch": 0.1583,
+      "grad_norm": 1.1105905771255493,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 15830
+    },
+    {
+      "epoch": 0.15831,
+      "grad_norm": 1.189861536026001,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 15831
+    },
+    {
+      "epoch": 0.15832,
+      "grad_norm": 0.7376668453216553,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 15832
+    },
+    {
+      "epoch": 0.15833,
+      "grad_norm": 0.6104328036308289,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 15833
+    },
+    {
+      "epoch": 0.15834,
+      "grad_norm": 0.6534768342971802,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 15834
+    },
+    {
+      "epoch": 0.15835,
+      "grad_norm": 0.6755269169807434,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 15835
+    },
+    {
+      "epoch": 0.15836,
+      "grad_norm": 0.6206930875778198,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 15836
+    },
+    {
+      "epoch": 0.15837,
+      "grad_norm": 0.5505279898643494,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 15837
+    },
+    {
+      "epoch": 0.15838,
+      "grad_norm": 0.5139374136924744,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 15838
+    },
+    {
+      "epoch": 0.15839,
+      "grad_norm": 0.5601384043693542,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 15839
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.6331465244293213,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 15840
+    },
+    {
+      "epoch": 0.15841,
+      "grad_norm": 0.7213279604911804,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 15841
+    },
+    {
+      "epoch": 0.15842,
+      "grad_norm": 0.7569580674171448,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 15842
+    },
+    {
+      "epoch": 0.15843,
+      "grad_norm": 0.9473897814750671,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 15843
+    },
+    {
+      "epoch": 0.15844,
+      "grad_norm": 1.1790201663970947,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 15844
+    },
+    {
+      "epoch": 0.15845,
+      "grad_norm": 0.8182543516159058,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 15845
+    },
+    {
+      "epoch": 0.15846,
+      "grad_norm": 0.7447865009307861,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 15846
+    },
+    {
+      "epoch": 0.15847,
+      "grad_norm": 0.8887273669242859,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 15847
+    },
+    {
+      "epoch": 0.15848,
+      "grad_norm": 1.0807547569274902,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 15848
+    },
+    {
+      "epoch": 0.15849,
+      "grad_norm": 0.9198710918426514,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 15849
+    },
+    {
+      "epoch": 0.1585,
+      "grad_norm": 0.796409010887146,
+      "learning_rate": 0.003,
+      "loss": 4.034,
+      "step": 15850
+    },
+    {
+      "epoch": 0.15851,
+      "grad_norm": 0.7506511211395264,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 15851
+    },
+    {
+      "epoch": 0.15852,
+      "grad_norm": 0.6153442859649658,
+      "learning_rate": 0.003,
+      "loss": 3.9782,
+      "step": 15852
+    },
+    {
+      "epoch": 0.15853,
+      "grad_norm": 0.637804388999939,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 15853
+    },
+    {
+      "epoch": 0.15854,
+      "grad_norm": 0.6735568642616272,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 15854
+    },
+    {
+      "epoch": 0.15855,
+      "grad_norm": 0.7883623838424683,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 15855
+    },
+    {
+      "epoch": 0.15856,
+      "grad_norm": 0.9529778957366943,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 15856
+    },
+    {
+      "epoch": 0.15857,
+      "grad_norm": 0.9971096515655518,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 15857
+    },
+    {
+      "epoch": 0.15858,
+      "grad_norm": 1.0070446729660034,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 15858
+    },
+    {
+      "epoch": 0.15859,
+      "grad_norm": 0.981824517250061,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 15859
+    },
+    {
+      "epoch": 0.1586,
+      "grad_norm": 0.8740189671516418,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 15860
+    },
+    {
+      "epoch": 0.15861,
+      "grad_norm": 0.9106821417808533,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 15861
+    },
+    {
+      "epoch": 0.15862,
+      "grad_norm": 0.8984867930412292,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 15862
+    },
+    {
+      "epoch": 0.15863,
+      "grad_norm": 0.8848163485527039,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 15863
+    },
+    {
+      "epoch": 0.15864,
+      "grad_norm": 0.9364362359046936,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 15864
+    },
+    {
+      "epoch": 0.15865,
+      "grad_norm": 1.0542372465133667,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 15865
+    },
+    {
+      "epoch": 0.15866,
+      "grad_norm": 1.015795350074768,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 15866
+    },
+    {
+      "epoch": 0.15867,
+      "grad_norm": 1.0119582414627075,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 15867
+    },
+    {
+      "epoch": 0.15868,
+      "grad_norm": 1.1160649061203003,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 15868
+    },
+    {
+      "epoch": 0.15869,
+      "grad_norm": 0.9358794093132019,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 15869
+    },
+    {
+      "epoch": 0.1587,
+      "grad_norm": 0.9446738958358765,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 15870
+    },
+    {
+      "epoch": 0.15871,
+      "grad_norm": 1.2070125341415405,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 15871
+    },
+    {
+      "epoch": 0.15872,
+      "grad_norm": 0.9497233033180237,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 15872
+    },
+    {
+      "epoch": 0.15873,
+      "grad_norm": 0.9464455842971802,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 15873
+    },
+    {
+      "epoch": 0.15874,
+      "grad_norm": 0.9484769701957703,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 15874
+    },
+    {
+      "epoch": 0.15875,
+      "grad_norm": 1.0746848583221436,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 15875
+    },
+    {
+      "epoch": 0.15876,
+      "grad_norm": 1.0386066436767578,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 15876
+    },
+    {
+      "epoch": 0.15877,
+      "grad_norm": 0.9693458080291748,
+      "learning_rate": 0.003,
+      "loss": 4.0467,
+      "step": 15877
+    },
+    {
+      "epoch": 0.15878,
+      "grad_norm": 0.8267617225646973,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 15878
+    },
+    {
+      "epoch": 0.15879,
+      "grad_norm": 0.811514139175415,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 15879
+    },
+    {
+      "epoch": 0.1588,
+      "grad_norm": 0.7325130701065063,
+      "learning_rate": 0.003,
+      "loss": 4.0406,
+      "step": 15880
+    },
+    {
+      "epoch": 0.15881,
+      "grad_norm": 0.8104916214942932,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 15881
+    },
+    {
+      "epoch": 0.15882,
+      "grad_norm": 0.9261886477470398,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 15882
+    },
+    {
+      "epoch": 0.15883,
+      "grad_norm": 1.1093589067459106,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 15883
+    },
+    {
+      "epoch": 0.15884,
+      "grad_norm": 0.7875494956970215,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 15884
+    },
+    {
+      "epoch": 0.15885,
+      "grad_norm": 0.6347299814224243,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 15885
+    },
+    {
+      "epoch": 0.15886,
+      "grad_norm": 0.619236409664154,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 15886
+    },
+    {
+      "epoch": 0.15887,
+      "grad_norm": 0.6201211810112,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 15887
+    },
+    {
+      "epoch": 0.15888,
+      "grad_norm": 0.6558188199996948,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 15888
+    },
+    {
+      "epoch": 0.15889,
+      "grad_norm": 0.6662975549697876,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 15889
+    },
+    {
+      "epoch": 0.1589,
+      "grad_norm": 0.6249191761016846,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 15890
+    },
+    {
+      "epoch": 0.15891,
+      "grad_norm": 0.6488112807273865,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 15891
+    },
+    {
+      "epoch": 0.15892,
+      "grad_norm": 0.5491712093353271,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 15892
+    },
+    {
+      "epoch": 0.15893,
+      "grad_norm": 0.5083919763565063,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 15893
+    },
+    {
+      "epoch": 0.15894,
+      "grad_norm": 0.5192022323608398,
+      "learning_rate": 0.003,
+      "loss": 3.9885,
+      "step": 15894
+    },
+    {
+      "epoch": 0.15895,
+      "grad_norm": 0.5186970829963684,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 15895
+    },
+    {
+      "epoch": 0.15896,
+      "grad_norm": 0.6246152520179749,
+      "learning_rate": 0.003,
+      "loss": 3.9771,
+      "step": 15896
+    },
+    {
+      "epoch": 0.15897,
+      "grad_norm": 0.788753092288971,
+      "learning_rate": 0.003,
+      "loss": 3.9679,
+      "step": 15897
+    },
+    {
+      "epoch": 0.15898,
+      "grad_norm": 0.9366752505302429,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 15898
+    },
+    {
+      "epoch": 0.15899,
+      "grad_norm": 1.1852505207061768,
+      "learning_rate": 0.003,
+      "loss": 3.976,
+      "step": 15899
+    },
+    {
+      "epoch": 0.159,
+      "grad_norm": 0.8160002827644348,
+      "learning_rate": 0.003,
+      "loss": 3.9726,
+      "step": 15900
+    },
+    {
+      "epoch": 0.15901,
+      "grad_norm": 0.7241095900535583,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 15901
+    },
+    {
+      "epoch": 0.15902,
+      "grad_norm": 0.7413921356201172,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 15902
+    },
+    {
+      "epoch": 0.15903,
+      "grad_norm": 0.7482760548591614,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 15903
+    },
+    {
+      "epoch": 0.15904,
+      "grad_norm": 0.7704154253005981,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 15904
+    },
+    {
+      "epoch": 0.15905,
+      "grad_norm": 0.8344385027885437,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 15905
+    },
+    {
+      "epoch": 0.15906,
+      "grad_norm": 0.7981237173080444,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 15906
+    },
+    {
+      "epoch": 0.15907,
+      "grad_norm": 0.7326487898826599,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 15907
+    },
+    {
+      "epoch": 0.15908,
+      "grad_norm": 0.7611872553825378,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 15908
+    },
+    {
+      "epoch": 0.15909,
+      "grad_norm": 0.9336971044540405,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 15909
+    },
+    {
+      "epoch": 0.1591,
+      "grad_norm": 1.031105637550354,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 15910
+    },
+    {
+      "epoch": 0.15911,
+      "grad_norm": 0.9963960647583008,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 15911
+    },
+    {
+      "epoch": 0.15912,
+      "grad_norm": 0.822158932685852,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 15912
+    },
+    {
+      "epoch": 0.15913,
+      "grad_norm": 0.7185021042823792,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 15913
+    },
+    {
+      "epoch": 0.15914,
+      "grad_norm": 0.6723746061325073,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 15914
+    },
+    {
+      "epoch": 0.15915,
+      "grad_norm": 0.6805190443992615,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 15915
+    },
+    {
+      "epoch": 0.15916,
+      "grad_norm": 0.7104006409645081,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 15916
+    },
+    {
+      "epoch": 0.15917,
+      "grad_norm": 0.8186883926391602,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 15917
+    },
+    {
+      "epoch": 0.15918,
+      "grad_norm": 0.9097923636436462,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 15918
+    },
+    {
+      "epoch": 0.15919,
+      "grad_norm": 0.849427342414856,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 15919
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.8371151685714722,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 15920
+    },
+    {
+      "epoch": 0.15921,
+      "grad_norm": 0.8601378798484802,
+      "learning_rate": 0.003,
+      "loss": 4.0491,
+      "step": 15921
+    },
+    {
+      "epoch": 0.15922,
+      "grad_norm": 0.8923529982566833,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 15922
+    },
+    {
+      "epoch": 0.15923,
+      "grad_norm": 1.1591925621032715,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 15923
+    },
+    {
+      "epoch": 0.15924,
+      "grad_norm": 1.07288658618927,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 15924
+    },
+    {
+      "epoch": 0.15925,
+      "grad_norm": 0.9240902066230774,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 15925
+    },
+    {
+      "epoch": 0.15926,
+      "grad_norm": 0.9564006924629211,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 15926
+    },
+    {
+      "epoch": 0.15927,
+      "grad_norm": 0.9875341057777405,
+      "learning_rate": 0.003,
+      "loss": 3.9816,
+      "step": 15927
+    },
+    {
+      "epoch": 0.15928,
+      "grad_norm": 0.8872766494750977,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 15928
+    },
+    {
+      "epoch": 0.15929,
+      "grad_norm": 0.7050520777702332,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 15929
+    },
+    {
+      "epoch": 0.1593,
+      "grad_norm": 0.675094485282898,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 15930
+    },
+    {
+      "epoch": 0.15931,
+      "grad_norm": 0.6591386795043945,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 15931
+    },
+    {
+      "epoch": 0.15932,
+      "grad_norm": 0.6734553575515747,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 15932
+    },
+    {
+      "epoch": 0.15933,
+      "grad_norm": 0.8183740377426147,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 15933
+    },
+    {
+      "epoch": 0.15934,
+      "grad_norm": 1.0227583646774292,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 15934
+    },
+    {
+      "epoch": 0.15935,
+      "grad_norm": 1.2014424800872803,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 15935
+    },
+    {
+      "epoch": 0.15936,
+      "grad_norm": 0.6601992845535278,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 15936
+    },
+    {
+      "epoch": 0.15937,
+      "grad_norm": 0.5800651907920837,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 15937
+    },
+    {
+      "epoch": 0.15938,
+      "grad_norm": 0.7739246487617493,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 15938
+    },
+    {
+      "epoch": 0.15939,
+      "grad_norm": 0.7693057656288147,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 15939
+    },
+    {
+      "epoch": 0.1594,
+      "grad_norm": 0.7467857599258423,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 15940
+    },
+    {
+      "epoch": 0.15941,
+      "grad_norm": 0.7646198868751526,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 15941
+    },
+    {
+      "epoch": 0.15942,
+      "grad_norm": 0.7261171936988831,
+      "learning_rate": 0.003,
+      "loss": 3.9682,
+      "step": 15942
+    },
+    {
+      "epoch": 0.15943,
+      "grad_norm": 0.7164345383644104,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 15943
+    },
+    {
+      "epoch": 0.15944,
+      "grad_norm": 0.7011243104934692,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 15944
+    },
+    {
+      "epoch": 0.15945,
+      "grad_norm": 0.8210936784744263,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 15945
+    },
+    {
+      "epoch": 0.15946,
+      "grad_norm": 0.8095575571060181,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 15946
+    },
+    {
+      "epoch": 0.15947,
+      "grad_norm": 0.674490749835968,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 15947
+    },
+    {
+      "epoch": 0.15948,
+      "grad_norm": 0.6978493928909302,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 15948
+    },
+    {
+      "epoch": 0.15949,
+      "grad_norm": 0.7934913039207458,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 15949
+    },
+    {
+      "epoch": 0.1595,
+      "grad_norm": 0.9713555574417114,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 15950
+    },
+    {
+      "epoch": 0.15951,
+      "grad_norm": 1.0996856689453125,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 15951
+    },
+    {
+      "epoch": 0.15952,
+      "grad_norm": 0.8714984655380249,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 15952
+    },
+    {
+      "epoch": 0.15953,
+      "grad_norm": 0.8879398107528687,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 15953
+    },
+    {
+      "epoch": 0.15954,
+      "grad_norm": 1.0821666717529297,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 15954
+    },
+    {
+      "epoch": 0.15955,
+      "grad_norm": 0.8716177344322205,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 15955
+    },
+    {
+      "epoch": 0.15956,
+      "grad_norm": 0.8655083179473877,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 15956
+    },
+    {
+      "epoch": 0.15957,
+      "grad_norm": 0.902826726436615,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 15957
+    },
+    {
+      "epoch": 0.15958,
+      "grad_norm": 0.8519112467765808,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 15958
+    },
+    {
+      "epoch": 0.15959,
+      "grad_norm": 0.7480038404464722,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 15959
+    },
+    {
+      "epoch": 0.1596,
+      "grad_norm": 0.8103182911872864,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 15960
+    },
+    {
+      "epoch": 0.15961,
+      "grad_norm": 0.9224309921264648,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 15961
+    },
+    {
+      "epoch": 0.15962,
+      "grad_norm": 0.8068740367889404,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 15962
+    },
+    {
+      "epoch": 0.15963,
+      "grad_norm": 0.9216637015342712,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 15963
+    },
+    {
+      "epoch": 0.15964,
+      "grad_norm": 1.3789762258529663,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 15964
+    },
+    {
+      "epoch": 0.15965,
+      "grad_norm": 0.8822488784790039,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 15965
+    },
+    {
+      "epoch": 0.15966,
+      "grad_norm": 0.852074921131134,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 15966
+    },
+    {
+      "epoch": 0.15967,
+      "grad_norm": 0.842533528804779,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 15967
+    },
+    {
+      "epoch": 0.15968,
+      "grad_norm": 0.8331539034843445,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 15968
+    },
+    {
+      "epoch": 0.15969,
+      "grad_norm": 0.6840645670890808,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 15969
+    },
+    {
+      "epoch": 0.1597,
+      "grad_norm": 0.712909460067749,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 15970
+    },
+    {
+      "epoch": 0.15971,
+      "grad_norm": 0.7712053060531616,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 15971
+    },
+    {
+      "epoch": 0.15972,
+      "grad_norm": 0.7960266470909119,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 15972
+    },
+    {
+      "epoch": 0.15973,
+      "grad_norm": 0.8820586204528809,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 15973
+    },
+    {
+      "epoch": 0.15974,
+      "grad_norm": 1.0755890607833862,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 15974
+    },
+    {
+      "epoch": 0.15975,
+      "grad_norm": 0.9829635620117188,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 15975
+    },
+    {
+      "epoch": 0.15976,
+      "grad_norm": 0.9537001848220825,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 15976
+    },
+    {
+      "epoch": 0.15977,
+      "grad_norm": 0.8291124105453491,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 15977
+    },
+    {
+      "epoch": 0.15978,
+      "grad_norm": 0.7226710319519043,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 15978
+    },
+    {
+      "epoch": 0.15979,
+      "grad_norm": 0.6324989795684814,
+      "learning_rate": 0.003,
+      "loss": 3.9748,
+      "step": 15979
+    },
+    {
+      "epoch": 0.1598,
+      "grad_norm": 0.6973319053649902,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 15980
+    },
+    {
+      "epoch": 0.15981,
+      "grad_norm": 0.7571434378623962,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 15981
+    },
+    {
+      "epoch": 0.15982,
+      "grad_norm": 0.7698507905006409,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 15982
+    },
+    {
+      "epoch": 0.15983,
+      "grad_norm": 0.8050060272216797,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 15983
+    },
+    {
+      "epoch": 0.15984,
+      "grad_norm": 1.0299527645111084,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 15984
+    },
+    {
+      "epoch": 0.15985,
+      "grad_norm": 1.2539191246032715,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 15985
+    },
+    {
+      "epoch": 0.15986,
+      "grad_norm": 0.7665183544158936,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 15986
+    },
+    {
+      "epoch": 0.15987,
+      "grad_norm": 0.6669110655784607,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 15987
+    },
+    {
+      "epoch": 0.15988,
+      "grad_norm": 0.7086806297302246,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 15988
+    },
+    {
+      "epoch": 0.15989,
+      "grad_norm": 0.7054063677787781,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 15989
+    },
+    {
+      "epoch": 0.1599,
+      "grad_norm": 0.7590455412864685,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 15990
+    },
+    {
+      "epoch": 0.15991,
+      "grad_norm": 0.753837525844574,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 15991
+    },
+    {
+      "epoch": 0.15992,
+      "grad_norm": 0.8459493517875671,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 15992
+    },
+    {
+      "epoch": 0.15993,
+      "grad_norm": 1.016892433166504,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 15993
+    },
+    {
+      "epoch": 0.15994,
+      "grad_norm": 1.1503039598464966,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 15994
+    },
+    {
+      "epoch": 0.15995,
+      "grad_norm": 0.8415044546127319,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 15995
+    },
+    {
+      "epoch": 0.15996,
+      "grad_norm": 0.806657075881958,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 15996
+    },
+    {
+      "epoch": 0.15997,
+      "grad_norm": 0.7540598511695862,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 15997
+    },
+    {
+      "epoch": 0.15998,
+      "grad_norm": 0.6794475317001343,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 15998
+    },
+    {
+      "epoch": 0.15999,
+      "grad_norm": 0.7693151831626892,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 15999
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9738240242004395,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 16000
+    },
+    {
+      "epoch": 0.16001,
+      "grad_norm": 1.1086915731430054,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 16001
+    },
+    {
+      "epoch": 0.16002,
+      "grad_norm": 0.867268443107605,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 16002
+    },
+    {
+      "epoch": 0.16003,
+      "grad_norm": 0.8266265392303467,
+      "learning_rate": 0.003,
+      "loss": 4.043,
+      "step": 16003
+    },
+    {
+      "epoch": 0.16004,
+      "grad_norm": 0.7528146505355835,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 16004
+    },
+    {
+      "epoch": 0.16005,
+      "grad_norm": 0.8397828340530396,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 16005
+    },
+    {
+      "epoch": 0.16006,
+      "grad_norm": 0.9907081127166748,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 16006
+    },
+    {
+      "epoch": 0.16007,
+      "grad_norm": 1.0469940900802612,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 16007
+    },
+    {
+      "epoch": 0.16008,
+      "grad_norm": 1.0257529020309448,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 16008
+    },
+    {
+      "epoch": 0.16009,
+      "grad_norm": 1.0527772903442383,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 16009
+    },
+    {
+      "epoch": 0.1601,
+      "grad_norm": 0.8561283349990845,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 16010
+    },
+    {
+      "epoch": 0.16011,
+      "grad_norm": 0.8803778886795044,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 16011
+    },
+    {
+      "epoch": 0.16012,
+      "grad_norm": 1.0238463878631592,
+      "learning_rate": 0.003,
+      "loss": 4.0439,
+      "step": 16012
+    },
+    {
+      "epoch": 0.16013,
+      "grad_norm": 0.9067773818969727,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 16013
+    },
+    {
+      "epoch": 0.16014,
+      "grad_norm": 0.9412308931350708,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 16014
+    },
+    {
+      "epoch": 0.16015,
+      "grad_norm": 0.956563413143158,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 16015
+    },
+    {
+      "epoch": 0.16016,
+      "grad_norm": 1.1569287776947021,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 16016
+    },
+    {
+      "epoch": 0.16017,
+      "grad_norm": 0.8871514797210693,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 16017
+    },
+    {
+      "epoch": 0.16018,
+      "grad_norm": 0.8447285890579224,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16018
+    },
+    {
+      "epoch": 0.16019,
+      "grad_norm": 0.9038500785827637,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 16019
+    },
+    {
+      "epoch": 0.1602,
+      "grad_norm": 0.8824516534805298,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 16020
+    },
+    {
+      "epoch": 0.16021,
+      "grad_norm": 0.9204214811325073,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 16021
+    },
+    {
+      "epoch": 0.16022,
+      "grad_norm": 0.9645723104476929,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 16022
+    },
+    {
+      "epoch": 0.16023,
+      "grad_norm": 0.8048455119132996,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 16023
+    },
+    {
+      "epoch": 0.16024,
+      "grad_norm": 0.7482070326805115,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 16024
+    },
+    {
+      "epoch": 0.16025,
+      "grad_norm": 0.9110950231552124,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 16025
+    },
+    {
+      "epoch": 0.16026,
+      "grad_norm": 1.1130304336547852,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 16026
+    },
+    {
+      "epoch": 0.16027,
+      "grad_norm": 0.8323398232460022,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 16027
+    },
+    {
+      "epoch": 0.16028,
+      "grad_norm": 0.7672797441482544,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 16028
+    },
+    {
+      "epoch": 0.16029,
+      "grad_norm": 0.7083603143692017,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 16029
+    },
+    {
+      "epoch": 0.1603,
+      "grad_norm": 0.6222543716430664,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 16030
+    },
+    {
+      "epoch": 0.16031,
+      "grad_norm": 0.5898983478546143,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 16031
+    },
+    {
+      "epoch": 0.16032,
+      "grad_norm": 0.6412619948387146,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 16032
+    },
+    {
+      "epoch": 0.16033,
+      "grad_norm": 0.6066755056381226,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 16033
+    },
+    {
+      "epoch": 0.16034,
+      "grad_norm": 0.5793968439102173,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 16034
+    },
+    {
+      "epoch": 0.16035,
+      "grad_norm": 0.5370006561279297,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 16035
+    },
+    {
+      "epoch": 0.16036,
+      "grad_norm": 0.44992175698280334,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 16036
+    },
+    {
+      "epoch": 0.16037,
+      "grad_norm": 0.6282742023468018,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 16037
+    },
+    {
+      "epoch": 0.16038,
+      "grad_norm": 0.9777520298957825,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16038
+    },
+    {
+      "epoch": 0.16039,
+      "grad_norm": 1.5354535579681396,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 16039
+    },
+    {
+      "epoch": 0.1604,
+      "grad_norm": 0.4473283588886261,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 16040
+    },
+    {
+      "epoch": 0.16041,
+      "grad_norm": 0.9890369772911072,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16041
+    },
+    {
+      "epoch": 0.16042,
+      "grad_norm": 1.2385085821151733,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 16042
+    },
+    {
+      "epoch": 0.16043,
+      "grad_norm": 0.6593718528747559,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16043
+    },
+    {
+      "epoch": 0.16044,
+      "grad_norm": 0.744037389755249,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 16044
+    },
+    {
+      "epoch": 0.16045,
+      "grad_norm": 0.7767122387886047,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 16045
+    },
+    {
+      "epoch": 0.16046,
+      "grad_norm": 0.8072572946548462,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 16046
+    },
+    {
+      "epoch": 0.16047,
+      "grad_norm": 0.8254396915435791,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 16047
+    },
+    {
+      "epoch": 0.16048,
+      "grad_norm": 0.7795100212097168,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 16048
+    },
+    {
+      "epoch": 0.16049,
+      "grad_norm": 0.7616690397262573,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 16049
+    },
+    {
+      "epoch": 0.1605,
+      "grad_norm": 0.8673222064971924,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 16050
+    },
+    {
+      "epoch": 0.16051,
+      "grad_norm": 0.8658856153488159,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 16051
+    },
+    {
+      "epoch": 0.16052,
+      "grad_norm": 0.7555408477783203,
+      "learning_rate": 0.003,
+      "loss": 3.9904,
+      "step": 16052
+    },
+    {
+      "epoch": 0.16053,
+      "grad_norm": 0.7539330720901489,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 16053
+    },
+    {
+      "epoch": 0.16054,
+      "grad_norm": 0.8005216121673584,
+      "learning_rate": 0.003,
+      "loss": 3.9805,
+      "step": 16054
+    },
+    {
+      "epoch": 0.16055,
+      "grad_norm": 0.7881186604499817,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 16055
+    },
+    {
+      "epoch": 0.16056,
+      "grad_norm": 0.7905475497245789,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 16056
+    },
+    {
+      "epoch": 0.16057,
+      "grad_norm": 0.7403662204742432,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 16057
+    },
+    {
+      "epoch": 0.16058,
+      "grad_norm": 0.9307411313056946,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 16058
+    },
+    {
+      "epoch": 0.16059,
+      "grad_norm": 1.0022494792938232,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 16059
+    },
+    {
+      "epoch": 0.1606,
+      "grad_norm": 0.9395740628242493,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 16060
+    },
+    {
+      "epoch": 0.16061,
+      "grad_norm": 0.8792284727096558,
+      "learning_rate": 0.003,
+      "loss": 4.0383,
+      "step": 16061
+    },
+    {
+      "epoch": 0.16062,
+      "grad_norm": 0.879179835319519,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 16062
+    },
+    {
+      "epoch": 0.16063,
+      "grad_norm": 1.2204158306121826,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 16063
+    },
+    {
+      "epoch": 0.16064,
+      "grad_norm": 1.0658458471298218,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 16064
+    },
+    {
+      "epoch": 0.16065,
+      "grad_norm": 0.7353061437606812,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 16065
+    },
+    {
+      "epoch": 0.16066,
+      "grad_norm": 0.5892605781555176,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 16066
+    },
+    {
+      "epoch": 0.16067,
+      "grad_norm": 0.5376728773117065,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 16067
+    },
+    {
+      "epoch": 0.16068,
+      "grad_norm": 0.5359166264533997,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 16068
+    },
+    {
+      "epoch": 0.16069,
+      "grad_norm": 0.6350721120834351,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 16069
+    },
+    {
+      "epoch": 0.1607,
+      "grad_norm": 0.8995853066444397,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 16070
+    },
+    {
+      "epoch": 0.16071,
+      "grad_norm": 1.21028733253479,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 16071
+    },
+    {
+      "epoch": 0.16072,
+      "grad_norm": 0.6996341347694397,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 16072
+    },
+    {
+      "epoch": 0.16073,
+      "grad_norm": 0.6284819841384888,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 16073
+    },
+    {
+      "epoch": 0.16074,
+      "grad_norm": 0.8318029046058655,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 16074
+    },
+    {
+      "epoch": 0.16075,
+      "grad_norm": 0.9144317507743835,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 16075
+    },
+    {
+      "epoch": 0.16076,
+      "grad_norm": 0.935553252696991,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 16076
+    },
+    {
+      "epoch": 0.16077,
+      "grad_norm": 0.8554702997207642,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 16077
+    },
+    {
+      "epoch": 0.16078,
+      "grad_norm": 0.7261465787887573,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 16078
+    },
+    {
+      "epoch": 0.16079,
+      "grad_norm": 0.6737211346626282,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 16079
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.6472561955451965,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 16080
+    },
+    {
+      "epoch": 0.16081,
+      "grad_norm": 0.7049079537391663,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 16081
+    },
+    {
+      "epoch": 0.16082,
+      "grad_norm": 0.8032575845718384,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 16082
+    },
+    {
+      "epoch": 0.16083,
+      "grad_norm": 0.8006021976470947,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 16083
+    },
+    {
+      "epoch": 0.16084,
+      "grad_norm": 0.5820706486701965,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 16084
+    },
+    {
+      "epoch": 0.16085,
+      "grad_norm": 0.5111738443374634,
+      "learning_rate": 0.003,
+      "loss": 3.9715,
+      "step": 16085
+    },
+    {
+      "epoch": 0.16086,
+      "grad_norm": 0.6005224585533142,
+      "learning_rate": 0.003,
+      "loss": 3.9698,
+      "step": 16086
+    },
+    {
+      "epoch": 0.16087,
+      "grad_norm": 0.8387770056724548,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 16087
+    },
+    {
+      "epoch": 0.16088,
+      "grad_norm": 1.1707104444503784,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 16088
+    },
+    {
+      "epoch": 0.16089,
+      "grad_norm": 0.8848904371261597,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 16089
+    },
+    {
+      "epoch": 0.1609,
+      "grad_norm": 0.76739102602005,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 16090
+    },
+    {
+      "epoch": 0.16091,
+      "grad_norm": 0.8427786827087402,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 16091
+    },
+    {
+      "epoch": 0.16092,
+      "grad_norm": 1.0178136825561523,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 16092
+    },
+    {
+      "epoch": 0.16093,
+      "grad_norm": 1.1659822463989258,
+      "learning_rate": 0.003,
+      "loss": 3.9792,
+      "step": 16093
+    },
+    {
+      "epoch": 0.16094,
+      "grad_norm": 1.002729892730713,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 16094
+    },
+    {
+      "epoch": 0.16095,
+      "grad_norm": 0.9544771909713745,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 16095
+    },
+    {
+      "epoch": 0.16096,
+      "grad_norm": 1.047904372215271,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 16096
+    },
+    {
+      "epoch": 0.16097,
+      "grad_norm": 1.0617247819900513,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 16097
+    },
+    {
+      "epoch": 0.16098,
+      "grad_norm": 0.9110809564590454,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 16098
+    },
+    {
+      "epoch": 0.16099,
+      "grad_norm": 0.942016065120697,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 16099
+    },
+    {
+      "epoch": 0.161,
+      "grad_norm": 1.09463369846344,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 16100
+    },
+    {
+      "epoch": 0.16101,
+      "grad_norm": 0.9544133543968201,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 16101
+    },
+    {
+      "epoch": 0.16102,
+      "grad_norm": 0.9932712316513062,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 16102
+    },
+    {
+      "epoch": 0.16103,
+      "grad_norm": 0.9824539422988892,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 16103
+    },
+    {
+      "epoch": 0.16104,
+      "grad_norm": 0.9473547339439392,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 16104
+    },
+    {
+      "epoch": 0.16105,
+      "grad_norm": 0.8918814659118652,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 16105
+    },
+    {
+      "epoch": 0.16106,
+      "grad_norm": 0.9244639277458191,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 16106
+    },
+    {
+      "epoch": 0.16107,
+      "grad_norm": 0.8934754133224487,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 16107
+    },
+    {
+      "epoch": 0.16108,
+      "grad_norm": 0.9675315618515015,
+      "learning_rate": 0.003,
+      "loss": 4.0443,
+      "step": 16108
+    },
+    {
+      "epoch": 0.16109,
+      "grad_norm": 1.0206494331359863,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 16109
+    },
+    {
+      "epoch": 0.1611,
+      "grad_norm": 0.8940903544425964,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 16110
+    },
+    {
+      "epoch": 0.16111,
+      "grad_norm": 0.9600245952606201,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 16111
+    },
+    {
+      "epoch": 0.16112,
+      "grad_norm": 0.8340402245521545,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 16112
+    },
+    {
+      "epoch": 0.16113,
+      "grad_norm": 0.693452775478363,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 16113
+    },
+    {
+      "epoch": 0.16114,
+      "grad_norm": 0.6798891425132751,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 16114
+    },
+    {
+      "epoch": 0.16115,
+      "grad_norm": 0.6488301157951355,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 16115
+    },
+    {
+      "epoch": 0.16116,
+      "grad_norm": 0.6416028141975403,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 16116
+    },
+    {
+      "epoch": 0.16117,
+      "grad_norm": 0.7598521113395691,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 16117
+    },
+    {
+      "epoch": 0.16118,
+      "grad_norm": 1.1467598676681519,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 16118
+    },
+    {
+      "epoch": 0.16119,
+      "grad_norm": 1.098025918006897,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 16119
+    },
+    {
+      "epoch": 0.1612,
+      "grad_norm": 0.8007514476776123,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 16120
+    },
+    {
+      "epoch": 0.16121,
+      "grad_norm": 0.6835646629333496,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 16121
+    },
+    {
+      "epoch": 0.16122,
+      "grad_norm": 0.7399167418479919,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 16122
+    },
+    {
+      "epoch": 0.16123,
+      "grad_norm": 0.7732407450675964,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 16123
+    },
+    {
+      "epoch": 0.16124,
+      "grad_norm": 0.6835975050926208,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 16124
+    },
+    {
+      "epoch": 0.16125,
+      "grad_norm": 0.5874419808387756,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 16125
+    },
+    {
+      "epoch": 0.16126,
+      "grad_norm": 0.6312301754951477,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 16126
+    },
+    {
+      "epoch": 0.16127,
+      "grad_norm": 0.6397377252578735,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 16127
+    },
+    {
+      "epoch": 0.16128,
+      "grad_norm": 0.7009602785110474,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 16128
+    },
+    {
+      "epoch": 0.16129,
+      "grad_norm": 0.6710821390151978,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 16129
+    },
+    {
+      "epoch": 0.1613,
+      "grad_norm": 0.7803601622581482,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 16130
+    },
+    {
+      "epoch": 0.16131,
+      "grad_norm": 1.0563775300979614,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 16131
+    },
+    {
+      "epoch": 0.16132,
+      "grad_norm": 1.0355607271194458,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 16132
+    },
+    {
+      "epoch": 0.16133,
+      "grad_norm": 0.7016539573669434,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 16133
+    },
+    {
+      "epoch": 0.16134,
+      "grad_norm": 0.5129038691520691,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 16134
+    },
+    {
+      "epoch": 0.16135,
+      "grad_norm": 0.5346699357032776,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 16135
+    },
+    {
+      "epoch": 0.16136,
+      "grad_norm": 0.6307773590087891,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 16136
+    },
+    {
+      "epoch": 0.16137,
+      "grad_norm": 0.7481401562690735,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 16137
+    },
+    {
+      "epoch": 0.16138,
+      "grad_norm": 0.7668987512588501,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 16138
+    },
+    {
+      "epoch": 0.16139,
+      "grad_norm": 0.7831903100013733,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 16139
+    },
+    {
+      "epoch": 0.1614,
+      "grad_norm": 0.9852398037910461,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 16140
+    },
+    {
+      "epoch": 0.16141,
+      "grad_norm": 1.1628477573394775,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 16141
+    },
+    {
+      "epoch": 0.16142,
+      "grad_norm": 0.8994024991989136,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 16142
+    },
+    {
+      "epoch": 0.16143,
+      "grad_norm": 0.778788149356842,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 16143
+    },
+    {
+      "epoch": 0.16144,
+      "grad_norm": 0.8439193964004517,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 16144
+    },
+    {
+      "epoch": 0.16145,
+      "grad_norm": 0.7844627499580383,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 16145
+    },
+    {
+      "epoch": 0.16146,
+      "grad_norm": 0.873829185962677,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 16146
+    },
+    {
+      "epoch": 0.16147,
+      "grad_norm": 1.0035282373428345,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 16147
+    },
+    {
+      "epoch": 0.16148,
+      "grad_norm": 0.9474618434906006,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 16148
+    },
+    {
+      "epoch": 0.16149,
+      "grad_norm": 0.9962616562843323,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 16149
+    },
+    {
+      "epoch": 0.1615,
+      "grad_norm": 1.0283039808273315,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 16150
+    },
+    {
+      "epoch": 0.16151,
+      "grad_norm": 1.0023417472839355,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 16151
+    },
+    {
+      "epoch": 0.16152,
+      "grad_norm": 0.9786602854728699,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 16152
+    },
+    {
+      "epoch": 0.16153,
+      "grad_norm": 0.9277042746543884,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 16153
+    },
+    {
+      "epoch": 0.16154,
+      "grad_norm": 0.8046708703041077,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 16154
+    },
+    {
+      "epoch": 0.16155,
+      "grad_norm": 0.7589371204376221,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 16155
+    },
+    {
+      "epoch": 0.16156,
+      "grad_norm": 0.8702610731124878,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 16156
+    },
+    {
+      "epoch": 0.16157,
+      "grad_norm": 0.9417670369148254,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 16157
+    },
+    {
+      "epoch": 0.16158,
+      "grad_norm": 0.9658819437026978,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 16158
+    },
+    {
+      "epoch": 0.16159,
+      "grad_norm": 0.9872148633003235,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 16159
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.8776973485946655,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 16160
+    },
+    {
+      "epoch": 0.16161,
+      "grad_norm": 0.8566110134124756,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 16161
+    },
+    {
+      "epoch": 0.16162,
+      "grad_norm": 0.9043013453483582,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 16162
+    },
+    {
+      "epoch": 0.16163,
+      "grad_norm": 0.8688957691192627,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 16163
+    },
+    {
+      "epoch": 0.16164,
+      "grad_norm": 0.8106199502944946,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 16164
+    },
+    {
+      "epoch": 0.16165,
+      "grad_norm": 0.8550721406936646,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 16165
+    },
+    {
+      "epoch": 0.16166,
+      "grad_norm": 0.9837195873260498,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 16166
+    },
+    {
+      "epoch": 0.16167,
+      "grad_norm": 1.1086894273757935,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 16167
+    },
+    {
+      "epoch": 0.16168,
+      "grad_norm": 0.8541070222854614,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 16168
+    },
+    {
+      "epoch": 0.16169,
+      "grad_norm": 0.6221586465835571,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 16169
+    },
+    {
+      "epoch": 0.1617,
+      "grad_norm": 0.6150177717208862,
+      "learning_rate": 0.003,
+      "loss": 3.9739,
+      "step": 16170
+    },
+    {
+      "epoch": 0.16171,
+      "grad_norm": 0.6094152927398682,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 16171
+    },
+    {
+      "epoch": 0.16172,
+      "grad_norm": 0.5822041630744934,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 16172
+    },
+    {
+      "epoch": 0.16173,
+      "grad_norm": 0.6120102405548096,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 16173
+    },
+    {
+      "epoch": 0.16174,
+      "grad_norm": 0.6903099417686462,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 16174
+    },
+    {
+      "epoch": 0.16175,
+      "grad_norm": 0.6956382393836975,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 16175
+    },
+    {
+      "epoch": 0.16176,
+      "grad_norm": 0.6636068224906921,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 16176
+    },
+    {
+      "epoch": 0.16177,
+      "grad_norm": 0.675407886505127,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 16177
+    },
+    {
+      "epoch": 0.16178,
+      "grad_norm": 0.9279387593269348,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 16178
+    },
+    {
+      "epoch": 0.16179,
+      "grad_norm": 1.267047643661499,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 16179
+    },
+    {
+      "epoch": 0.1618,
+      "grad_norm": 0.8936948180198669,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 16180
+    },
+    {
+      "epoch": 0.16181,
+      "grad_norm": 0.9198494553565979,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 16181
+    },
+    {
+      "epoch": 0.16182,
+      "grad_norm": 0.9184507131576538,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 16182
+    },
+    {
+      "epoch": 0.16183,
+      "grad_norm": 0.8332173824310303,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 16183
+    },
+    {
+      "epoch": 0.16184,
+      "grad_norm": 0.8280366659164429,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 16184
+    },
+    {
+      "epoch": 0.16185,
+      "grad_norm": 0.8992508053779602,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 16185
+    },
+    {
+      "epoch": 0.16186,
+      "grad_norm": 0.960120677947998,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 16186
+    },
+    {
+      "epoch": 0.16187,
+      "grad_norm": 0.903139591217041,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 16187
+    },
+    {
+      "epoch": 0.16188,
+      "grad_norm": 0.8598777055740356,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 16188
+    },
+    {
+      "epoch": 0.16189,
+      "grad_norm": 0.796144962310791,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 16189
+    },
+    {
+      "epoch": 0.1619,
+      "grad_norm": 0.8381752967834473,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 16190
+    },
+    {
+      "epoch": 0.16191,
+      "grad_norm": 0.802821695804596,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 16191
+    },
+    {
+      "epoch": 0.16192,
+      "grad_norm": 0.8790308237075806,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 16192
+    },
+    {
+      "epoch": 0.16193,
+      "grad_norm": 1.0374481678009033,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 16193
+    },
+    {
+      "epoch": 0.16194,
+      "grad_norm": 0.9121679663658142,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 16194
+    },
+    {
+      "epoch": 0.16195,
+      "grad_norm": 0.8900026082992554,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 16195
+    },
+    {
+      "epoch": 0.16196,
+      "grad_norm": 1.0195919275283813,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 16196
+    },
+    {
+      "epoch": 0.16197,
+      "grad_norm": 1.005298376083374,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 16197
+    },
+    {
+      "epoch": 0.16198,
+      "grad_norm": 0.8747978210449219,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 16198
+    },
+    {
+      "epoch": 0.16199,
+      "grad_norm": 0.9359422326087952,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 16199
+    },
+    {
+      "epoch": 0.162,
+      "grad_norm": 0.8763812184333801,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 16200
+    },
+    {
+      "epoch": 0.16201,
+      "grad_norm": 0.7494961023330688,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 16201
+    },
+    {
+      "epoch": 0.16202,
+      "grad_norm": 0.7147844433784485,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 16202
+    },
+    {
+      "epoch": 0.16203,
+      "grad_norm": 0.7433773875236511,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 16203
+    },
+    {
+      "epoch": 0.16204,
+      "grad_norm": 0.9177911281585693,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 16204
+    },
+    {
+      "epoch": 0.16205,
+      "grad_norm": 1.0803254842758179,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 16205
+    },
+    {
+      "epoch": 0.16206,
+      "grad_norm": 1.1402027606964111,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 16206
+    },
+    {
+      "epoch": 0.16207,
+      "grad_norm": 0.8355861306190491,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 16207
+    },
+    {
+      "epoch": 0.16208,
+      "grad_norm": 0.6533427834510803,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 16208
+    },
+    {
+      "epoch": 0.16209,
+      "grad_norm": 0.6891869902610779,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 16209
+    },
+    {
+      "epoch": 0.1621,
+      "grad_norm": 0.7308468818664551,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 16210
+    },
+    {
+      "epoch": 0.16211,
+      "grad_norm": 0.7914701700210571,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16211
+    },
+    {
+      "epoch": 0.16212,
+      "grad_norm": 0.839141845703125,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 16212
+    },
+    {
+      "epoch": 0.16213,
+      "grad_norm": 0.888842761516571,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 16213
+    },
+    {
+      "epoch": 0.16214,
+      "grad_norm": 0.9561465382575989,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 16214
+    },
+    {
+      "epoch": 0.16215,
+      "grad_norm": 0.8719993829727173,
+      "learning_rate": 0.003,
+      "loss": 4.0415,
+      "step": 16215
+    },
+    {
+      "epoch": 0.16216,
+      "grad_norm": 0.8226259350776672,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 16216
+    },
+    {
+      "epoch": 0.16217,
+      "grad_norm": 0.7822552919387817,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 16217
+    },
+    {
+      "epoch": 0.16218,
+      "grad_norm": 0.7415698766708374,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 16218
+    },
+    {
+      "epoch": 0.16219,
+      "grad_norm": 0.7511895298957825,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 16219
+    },
+    {
+      "epoch": 0.1622,
+      "grad_norm": 0.8057510852813721,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 16220
+    },
+    {
+      "epoch": 0.16221,
+      "grad_norm": 0.7829014658927917,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 16221
+    },
+    {
+      "epoch": 0.16222,
+      "grad_norm": 0.7896765470504761,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 16222
+    },
+    {
+      "epoch": 0.16223,
+      "grad_norm": 0.7292823195457458,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 16223
+    },
+    {
+      "epoch": 0.16224,
+      "grad_norm": 0.7166668772697449,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 16224
+    },
+    {
+      "epoch": 0.16225,
+      "grad_norm": 0.7180719971656799,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 16225
+    },
+    {
+      "epoch": 0.16226,
+      "grad_norm": 0.8296489119529724,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16226
+    },
+    {
+      "epoch": 0.16227,
+      "grad_norm": 1.0246996879577637,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 16227
+    },
+    {
+      "epoch": 0.16228,
+      "grad_norm": 1.2064777612686157,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 16228
+    },
+    {
+      "epoch": 0.16229,
+      "grad_norm": 0.7835697531700134,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 16229
+    },
+    {
+      "epoch": 0.1623,
+      "grad_norm": 0.7163366079330444,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 16230
+    },
+    {
+      "epoch": 0.16231,
+      "grad_norm": 0.8635886907577515,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 16231
+    },
+    {
+      "epoch": 0.16232,
+      "grad_norm": 0.9135334491729736,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 16232
+    },
+    {
+      "epoch": 0.16233,
+      "grad_norm": 1.0937687158584595,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 16233
+    },
+    {
+      "epoch": 0.16234,
+      "grad_norm": 0.9035110473632812,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 16234
+    },
+    {
+      "epoch": 0.16235,
+      "grad_norm": 0.8154218196868896,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 16235
+    },
+    {
+      "epoch": 0.16236,
+      "grad_norm": 0.717538058757782,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 16236
+    },
+    {
+      "epoch": 0.16237,
+      "grad_norm": 0.8207351565361023,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 16237
+    },
+    {
+      "epoch": 0.16238,
+      "grad_norm": 0.9286295771598816,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 16238
+    },
+    {
+      "epoch": 0.16239,
+      "grad_norm": 0.8026152849197388,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 16239
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.7241533398628235,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 16240
+    },
+    {
+      "epoch": 0.16241,
+      "grad_norm": 0.7309879064559937,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 16241
+    },
+    {
+      "epoch": 0.16242,
+      "grad_norm": 0.8363540172576904,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 16242
+    },
+    {
+      "epoch": 0.16243,
+      "grad_norm": 0.8413676023483276,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 16243
+    },
+    {
+      "epoch": 0.16244,
+      "grad_norm": 0.9204793572425842,
+      "learning_rate": 0.003,
+      "loss": 3.9826,
+      "step": 16244
+    },
+    {
+      "epoch": 0.16245,
+      "grad_norm": 1.1601897478103638,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 16245
+    },
+    {
+      "epoch": 0.16246,
+      "grad_norm": 0.9563637375831604,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 16246
+    },
+    {
+      "epoch": 0.16247,
+      "grad_norm": 0.9307214617729187,
+      "learning_rate": 0.003,
+      "loss": 4.0161,
+      "step": 16247
+    },
+    {
+      "epoch": 0.16248,
+      "grad_norm": 1.002216100692749,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 16248
+    },
+    {
+      "epoch": 0.16249,
+      "grad_norm": 1.1238837242126465,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 16249
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.8523120880126953,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 16250
+    },
+    {
+      "epoch": 0.16251,
+      "grad_norm": 0.7996931076049805,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 16251
+    },
+    {
+      "epoch": 0.16252,
+      "grad_norm": 0.8345876932144165,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 16252
+    },
+    {
+      "epoch": 0.16253,
+      "grad_norm": 0.9343923926353455,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 16253
+    },
+    {
+      "epoch": 0.16254,
+      "grad_norm": 0.8420714735984802,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 16254
+    },
+    {
+      "epoch": 0.16255,
+      "grad_norm": 0.6601258516311646,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 16255
+    },
+    {
+      "epoch": 0.16256,
+      "grad_norm": 0.66865074634552,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 16256
+    },
+    {
+      "epoch": 0.16257,
+      "grad_norm": 0.7216544151306152,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 16257
+    },
+    {
+      "epoch": 0.16258,
+      "grad_norm": 0.7510634064674377,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 16258
+    },
+    {
+      "epoch": 0.16259,
+      "grad_norm": 1.0844184160232544,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 16259
+    },
+    {
+      "epoch": 0.1626,
+      "grad_norm": 1.138572096824646,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 16260
+    },
+    {
+      "epoch": 0.16261,
+      "grad_norm": 0.8396537899971008,
+      "learning_rate": 0.003,
+      "loss": 3.9792,
+      "step": 16261
+    },
+    {
+      "epoch": 0.16262,
+      "grad_norm": 0.7281916737556458,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 16262
+    },
+    {
+      "epoch": 0.16263,
+      "grad_norm": 0.6730375289916992,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 16263
+    },
+    {
+      "epoch": 0.16264,
+      "grad_norm": 0.6375163197517395,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 16264
+    },
+    {
+      "epoch": 0.16265,
+      "grad_norm": 0.6660574674606323,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 16265
+    },
+    {
+      "epoch": 0.16266,
+      "grad_norm": 0.7589208483695984,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 16266
+    },
+    {
+      "epoch": 0.16267,
+      "grad_norm": 0.9495667219161987,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 16267
+    },
+    {
+      "epoch": 0.16268,
+      "grad_norm": 1.158148169517517,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 16268
+    },
+    {
+      "epoch": 0.16269,
+      "grad_norm": 0.7837066650390625,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 16269
+    },
+    {
+      "epoch": 0.1627,
+      "grad_norm": 0.7256262898445129,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 16270
+    },
+    {
+      "epoch": 0.16271,
+      "grad_norm": 0.8901933431625366,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 16271
+    },
+    {
+      "epoch": 0.16272,
+      "grad_norm": 0.9590361714363098,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 16272
+    },
+    {
+      "epoch": 0.16273,
+      "grad_norm": 1.023372769355774,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 16273
+    },
+    {
+      "epoch": 0.16274,
+      "grad_norm": 0.9545578360557556,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 16274
+    },
+    {
+      "epoch": 0.16275,
+      "grad_norm": 0.7929362654685974,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 16275
+    },
+    {
+      "epoch": 0.16276,
+      "grad_norm": 0.6679825186729431,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 16276
+    },
+    {
+      "epoch": 0.16277,
+      "grad_norm": 0.58383709192276,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 16277
+    },
+    {
+      "epoch": 0.16278,
+      "grad_norm": 0.7074006199836731,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 16278
+    },
+    {
+      "epoch": 0.16279,
+      "grad_norm": 0.774600625038147,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 16279
+    },
+    {
+      "epoch": 0.1628,
+      "grad_norm": 0.6628326177597046,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 16280
+    },
+    {
+      "epoch": 0.16281,
+      "grad_norm": 0.6247139573097229,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 16281
+    },
+    {
+      "epoch": 0.16282,
+      "grad_norm": 0.5879446864128113,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 16282
+    },
+    {
+      "epoch": 0.16283,
+      "grad_norm": 0.6044084429740906,
+      "learning_rate": 0.003,
+      "loss": 3.9626,
+      "step": 16283
+    },
+    {
+      "epoch": 0.16284,
+      "grad_norm": 0.6138390898704529,
+      "learning_rate": 0.003,
+      "loss": 3.9824,
+      "step": 16284
+    },
+    {
+      "epoch": 0.16285,
+      "grad_norm": 0.6722159385681152,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 16285
+    },
+    {
+      "epoch": 0.16286,
+      "grad_norm": 0.7046880722045898,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 16286
+    },
+    {
+      "epoch": 0.16287,
+      "grad_norm": 0.7353987097740173,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 16287
+    },
+    {
+      "epoch": 0.16288,
+      "grad_norm": 0.9673696756362915,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 16288
+    },
+    {
+      "epoch": 0.16289,
+      "grad_norm": 1.1569815874099731,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 16289
+    },
+    {
+      "epoch": 0.1629,
+      "grad_norm": 0.7916101217269897,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 16290
+    },
+    {
+      "epoch": 0.16291,
+      "grad_norm": 0.8873964548110962,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 16291
+    },
+    {
+      "epoch": 0.16292,
+      "grad_norm": 0.9866530299186707,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 16292
+    },
+    {
+      "epoch": 0.16293,
+      "grad_norm": 1.2426506280899048,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 16293
+    },
+    {
+      "epoch": 0.16294,
+      "grad_norm": 0.674839973449707,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 16294
+    },
+    {
+      "epoch": 0.16295,
+      "grad_norm": 0.697393536567688,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 16295
+    },
+    {
+      "epoch": 0.16296,
+      "grad_norm": 0.8670492768287659,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 16296
+    },
+    {
+      "epoch": 0.16297,
+      "grad_norm": 1.480337381362915,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 16297
+    },
+    {
+      "epoch": 0.16298,
+      "grad_norm": 0.5470685362815857,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 16298
+    },
+    {
+      "epoch": 0.16299,
+      "grad_norm": 0.7625333070755005,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 16299
+    },
+    {
+      "epoch": 0.163,
+      "grad_norm": 0.8565612435340881,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 16300
+    },
+    {
+      "epoch": 0.16301,
+      "grad_norm": 0.8694688677787781,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 16301
+    },
+    {
+      "epoch": 0.16302,
+      "grad_norm": 0.8741586804389954,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 16302
+    },
+    {
+      "epoch": 0.16303,
+      "grad_norm": 0.9137376546859741,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 16303
+    },
+    {
+      "epoch": 0.16304,
+      "grad_norm": 1.0089272260665894,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 16304
+    },
+    {
+      "epoch": 0.16305,
+      "grad_norm": 1.0015945434570312,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 16305
+    },
+    {
+      "epoch": 0.16306,
+      "grad_norm": 1.0515681505203247,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 16306
+    },
+    {
+      "epoch": 0.16307,
+      "grad_norm": 1.0251307487487793,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 16307
+    },
+    {
+      "epoch": 0.16308,
+      "grad_norm": 0.9413036704063416,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 16308
+    },
+    {
+      "epoch": 0.16309,
+      "grad_norm": 1.1088650226593018,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 16309
+    },
+    {
+      "epoch": 0.1631,
+      "grad_norm": 0.9975844025611877,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 16310
+    },
+    {
+      "epoch": 0.16311,
+      "grad_norm": 1.0978097915649414,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 16311
+    },
+    {
+      "epoch": 0.16312,
+      "grad_norm": 0.9465668201446533,
+      "learning_rate": 0.003,
+      "loss": 4.0681,
+      "step": 16312
+    },
+    {
+      "epoch": 0.16313,
+      "grad_norm": 0.9554963111877441,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 16313
+    },
+    {
+      "epoch": 0.16314,
+      "grad_norm": 0.9088976979255676,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 16314
+    },
+    {
+      "epoch": 0.16315,
+      "grad_norm": 0.9877479076385498,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 16315
+    },
+    {
+      "epoch": 0.16316,
+      "grad_norm": 0.7756491899490356,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 16316
+    },
+    {
+      "epoch": 0.16317,
+      "grad_norm": 0.705436110496521,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 16317
+    },
+    {
+      "epoch": 0.16318,
+      "grad_norm": 0.7293632626533508,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 16318
+    },
+    {
+      "epoch": 0.16319,
+      "grad_norm": 0.6516294479370117,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 16319
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.6582109928131104,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 16320
+    },
+    {
+      "epoch": 0.16321,
+      "grad_norm": 0.7615460157394409,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 16321
+    },
+    {
+      "epoch": 0.16322,
+      "grad_norm": 0.9899001121520996,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 16322
+    },
+    {
+      "epoch": 0.16323,
+      "grad_norm": 1.1688170433044434,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 16323
+    },
+    {
+      "epoch": 0.16324,
+      "grad_norm": 1.1293865442276,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 16324
+    },
+    {
+      "epoch": 0.16325,
+      "grad_norm": 0.7544232606887817,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 16325
+    },
+    {
+      "epoch": 0.16326,
+      "grad_norm": 0.5888630151748657,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 16326
+    },
+    {
+      "epoch": 0.16327,
+      "grad_norm": 0.7046893835067749,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 16327
+    },
+    {
+      "epoch": 0.16328,
+      "grad_norm": 0.7926900386810303,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 16328
+    },
+    {
+      "epoch": 0.16329,
+      "grad_norm": 0.9033036231994629,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 16329
+    },
+    {
+      "epoch": 0.1633,
+      "grad_norm": 0.8790760040283203,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 16330
+    },
+    {
+      "epoch": 0.16331,
+      "grad_norm": 0.8309141397476196,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 16331
+    },
+    {
+      "epoch": 0.16332,
+      "grad_norm": 0.8339505195617676,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 16332
+    },
+    {
+      "epoch": 0.16333,
+      "grad_norm": 0.8661202192306519,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 16333
+    },
+    {
+      "epoch": 0.16334,
+      "grad_norm": 0.7818444967269897,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 16334
+    },
+    {
+      "epoch": 0.16335,
+      "grad_norm": 0.7368418574333191,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 16335
+    },
+    {
+      "epoch": 0.16336,
+      "grad_norm": 0.6508287191390991,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 16336
+    },
+    {
+      "epoch": 0.16337,
+      "grad_norm": 0.7051012516021729,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 16337
+    },
+    {
+      "epoch": 0.16338,
+      "grad_norm": 0.7545062303543091,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 16338
+    },
+    {
+      "epoch": 0.16339,
+      "grad_norm": 0.8058009147644043,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 16339
+    },
+    {
+      "epoch": 0.1634,
+      "grad_norm": 0.9874696135520935,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 16340
+    },
+    {
+      "epoch": 0.16341,
+      "grad_norm": 0.9253709316253662,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 16341
+    },
+    {
+      "epoch": 0.16342,
+      "grad_norm": 0.6806486248970032,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 16342
+    },
+    {
+      "epoch": 0.16343,
+      "grad_norm": 0.6025413274765015,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 16343
+    },
+    {
+      "epoch": 0.16344,
+      "grad_norm": 0.6132237315177917,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 16344
+    },
+    {
+      "epoch": 0.16345,
+      "grad_norm": 0.6393827795982361,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 16345
+    },
+    {
+      "epoch": 0.16346,
+      "grad_norm": 0.615208625793457,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 16346
+    },
+    {
+      "epoch": 0.16347,
+      "grad_norm": 0.7005764842033386,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 16347
+    },
+    {
+      "epoch": 0.16348,
+      "grad_norm": 0.7764422297477722,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 16348
+    },
+    {
+      "epoch": 0.16349,
+      "grad_norm": 0.7720112800598145,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 16349
+    },
+    {
+      "epoch": 0.1635,
+      "grad_norm": 0.838747501373291,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 16350
+    },
+    {
+      "epoch": 0.16351,
+      "grad_norm": 0.8589238524436951,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 16351
+    },
+    {
+      "epoch": 0.16352,
+      "grad_norm": 1.0065206289291382,
+      "learning_rate": 0.003,
+      "loss": 3.9711,
+      "step": 16352
+    },
+    {
+      "epoch": 0.16353,
+      "grad_norm": 1.0837733745574951,
+      "learning_rate": 0.003,
+      "loss": 3.9747,
+      "step": 16353
+    },
+    {
+      "epoch": 0.16354,
+      "grad_norm": 0.788565456867218,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 16354
+    },
+    {
+      "epoch": 0.16355,
+      "grad_norm": 0.6954969167709351,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 16355
+    },
+    {
+      "epoch": 0.16356,
+      "grad_norm": 0.7750489115715027,
+      "learning_rate": 0.003,
+      "loss": 3.9743,
+      "step": 16356
+    },
+    {
+      "epoch": 0.16357,
+      "grad_norm": 0.9297155141830444,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 16357
+    },
+    {
+      "epoch": 0.16358,
+      "grad_norm": 0.9632990956306458,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 16358
+    },
+    {
+      "epoch": 0.16359,
+      "grad_norm": 0.8834934830665588,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 16359
+    },
+    {
+      "epoch": 0.1636,
+      "grad_norm": 0.9617258906364441,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 16360
+    },
+    {
+      "epoch": 0.16361,
+      "grad_norm": 0.9705206155776978,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 16361
+    },
+    {
+      "epoch": 0.16362,
+      "grad_norm": 0.8674737215042114,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 16362
+    },
+    {
+      "epoch": 0.16363,
+      "grad_norm": 0.8213301301002502,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 16363
+    },
+    {
+      "epoch": 0.16364,
+      "grad_norm": 0.8834892511367798,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 16364
+    },
+    {
+      "epoch": 0.16365,
+      "grad_norm": 0.8041810393333435,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 16365
+    },
+    {
+      "epoch": 0.16366,
+      "grad_norm": 0.6862834692001343,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 16366
+    },
+    {
+      "epoch": 0.16367,
+      "grad_norm": 0.7433396577835083,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 16367
+    },
+    {
+      "epoch": 0.16368,
+      "grad_norm": 0.8846827149391174,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 16368
+    },
+    {
+      "epoch": 0.16369,
+      "grad_norm": 1.0452004671096802,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 16369
+    },
+    {
+      "epoch": 0.1637,
+      "grad_norm": 1.139094352722168,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 16370
+    },
+    {
+      "epoch": 0.16371,
+      "grad_norm": 0.9088010787963867,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 16371
+    },
+    {
+      "epoch": 0.16372,
+      "grad_norm": 0.8271872997283936,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 16372
+    },
+    {
+      "epoch": 0.16373,
+      "grad_norm": 0.7686737179756165,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 16373
+    },
+    {
+      "epoch": 0.16374,
+      "grad_norm": 0.6929585337638855,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 16374
+    },
+    {
+      "epoch": 0.16375,
+      "grad_norm": 0.8070968985557556,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 16375
+    },
+    {
+      "epoch": 0.16376,
+      "grad_norm": 0.7533002495765686,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 16376
+    },
+    {
+      "epoch": 0.16377,
+      "grad_norm": 0.7526688575744629,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 16377
+    },
+    {
+      "epoch": 0.16378,
+      "grad_norm": 0.8268585801124573,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 16378
+    },
+    {
+      "epoch": 0.16379,
+      "grad_norm": 1.0098788738250732,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 16379
+    },
+    {
+      "epoch": 0.1638,
+      "grad_norm": 1.2174149751663208,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 16380
+    },
+    {
+      "epoch": 0.16381,
+      "grad_norm": 0.6912074685096741,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 16381
+    },
+    {
+      "epoch": 0.16382,
+      "grad_norm": 0.7847248315811157,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 16382
+    },
+    {
+      "epoch": 0.16383,
+      "grad_norm": 0.8508479595184326,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 16383
+    },
+    {
+      "epoch": 0.16384,
+      "grad_norm": 0.7754083275794983,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 16384
+    },
+    {
+      "epoch": 0.16385,
+      "grad_norm": 0.7800403237342834,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 16385
+    },
+    {
+      "epoch": 0.16386,
+      "grad_norm": 0.7742740511894226,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 16386
+    },
+    {
+      "epoch": 0.16387,
+      "grad_norm": 0.8291677832603455,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 16387
+    },
+    {
+      "epoch": 0.16388,
+      "grad_norm": 0.8471203446388245,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 16388
+    },
+    {
+      "epoch": 0.16389,
+      "grad_norm": 0.7955463528633118,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 16389
+    },
+    {
+      "epoch": 0.1639,
+      "grad_norm": 0.7575664520263672,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 16390
+    },
+    {
+      "epoch": 0.16391,
+      "grad_norm": 0.6484594941139221,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 16391
+    },
+    {
+      "epoch": 0.16392,
+      "grad_norm": 0.6634478569030762,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 16392
+    },
+    {
+      "epoch": 0.16393,
+      "grad_norm": 0.5433070659637451,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 16393
+    },
+    {
+      "epoch": 0.16394,
+      "grad_norm": 0.49226635694503784,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 16394
+    },
+    {
+      "epoch": 0.16395,
+      "grad_norm": 0.5242682695388794,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 16395
+    },
+    {
+      "epoch": 0.16396,
+      "grad_norm": 0.4910825490951538,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 16396
+    },
+    {
+      "epoch": 0.16397,
+      "grad_norm": 0.6202237010002136,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 16397
+    },
+    {
+      "epoch": 0.16398,
+      "grad_norm": 0.8250946998596191,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 16398
+    },
+    {
+      "epoch": 0.16399,
+      "grad_norm": 1.031864047050476,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 16399
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 1.2869422435760498,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 16400
+    },
+    {
+      "epoch": 0.16401,
+      "grad_norm": 0.6132083535194397,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 16401
+    },
+    {
+      "epoch": 0.16402,
+      "grad_norm": 0.6066889762878418,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 16402
+    },
+    {
+      "epoch": 0.16403,
+      "grad_norm": 0.7775403261184692,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 16403
+    },
+    {
+      "epoch": 0.16404,
+      "grad_norm": 0.8699544668197632,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 16404
+    },
+    {
+      "epoch": 0.16405,
+      "grad_norm": 0.9804795384407043,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 16405
+    },
+    {
+      "epoch": 0.16406,
+      "grad_norm": 0.8409727811813354,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 16406
+    },
+    {
+      "epoch": 0.16407,
+      "grad_norm": 0.6656890511512756,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 16407
+    },
+    {
+      "epoch": 0.16408,
+      "grad_norm": 0.8278241157531738,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 16408
+    },
+    {
+      "epoch": 0.16409,
+      "grad_norm": 1.022950291633606,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 16409
+    },
+    {
+      "epoch": 0.1641,
+      "grad_norm": 1.1753534078598022,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 16410
+    },
+    {
+      "epoch": 0.16411,
+      "grad_norm": 0.8887650370597839,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 16411
+    },
+    {
+      "epoch": 0.16412,
+      "grad_norm": 1.007218599319458,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 16412
+    },
+    {
+      "epoch": 0.16413,
+      "grad_norm": 1.1239910125732422,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 16413
+    },
+    {
+      "epoch": 0.16414,
+      "grad_norm": 0.863362193107605,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 16414
+    },
+    {
+      "epoch": 0.16415,
+      "grad_norm": 0.9276494979858398,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 16415
+    },
+    {
+      "epoch": 0.16416,
+      "grad_norm": 1.1005187034606934,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 16416
+    },
+    {
+      "epoch": 0.16417,
+      "grad_norm": 0.9491989016532898,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 16417
+    },
+    {
+      "epoch": 0.16418,
+      "grad_norm": 0.9228178262710571,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 16418
+    },
+    {
+      "epoch": 0.16419,
+      "grad_norm": 0.9737118482589722,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 16419
+    },
+    {
+      "epoch": 0.1642,
+      "grad_norm": 0.9858505725860596,
+      "learning_rate": 0.003,
+      "loss": 4.0628,
+      "step": 16420
+    },
+    {
+      "epoch": 0.16421,
+      "grad_norm": 1.0404865741729736,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 16421
+    },
+    {
+      "epoch": 0.16422,
+      "grad_norm": 1.0032793283462524,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 16422
+    },
+    {
+      "epoch": 0.16423,
+      "grad_norm": 1.0093377828598022,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 16423
+    },
+    {
+      "epoch": 0.16424,
+      "grad_norm": 1.0854952335357666,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 16424
+    },
+    {
+      "epoch": 0.16425,
+      "grad_norm": 0.910970151424408,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 16425
+    },
+    {
+      "epoch": 0.16426,
+      "grad_norm": 0.8588374257087708,
+      "learning_rate": 0.003,
+      "loss": 4.0501,
+      "step": 16426
+    },
+    {
+      "epoch": 0.16427,
+      "grad_norm": 0.9977859854698181,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 16427
+    },
+    {
+      "epoch": 0.16428,
+      "grad_norm": 1.202165126800537,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 16428
+    },
+    {
+      "epoch": 0.16429,
+      "grad_norm": 0.7841586470603943,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 16429
+    },
+    {
+      "epoch": 0.1643,
+      "grad_norm": 0.8664776682853699,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 16430
+    },
+    {
+      "epoch": 0.16431,
+      "grad_norm": 0.7448684573173523,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 16431
+    },
+    {
+      "epoch": 0.16432,
+      "grad_norm": 0.8166662454605103,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 16432
+    },
+    {
+      "epoch": 0.16433,
+      "grad_norm": 0.8164058327674866,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 16433
+    },
+    {
+      "epoch": 0.16434,
+      "grad_norm": 0.7713918685913086,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 16434
+    },
+    {
+      "epoch": 0.16435,
+      "grad_norm": 0.7069947719573975,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 16435
+    },
+    {
+      "epoch": 0.16436,
+      "grad_norm": 0.6676192879676819,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 16436
+    },
+    {
+      "epoch": 0.16437,
+      "grad_norm": 0.7678779363632202,
+      "learning_rate": 0.003,
+      "loss": 4.0319,
+      "step": 16437
+    },
+    {
+      "epoch": 0.16438,
+      "grad_norm": 0.7208015322685242,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 16438
+    },
+    {
+      "epoch": 0.16439,
+      "grad_norm": 0.7070453763008118,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 16439
+    },
+    {
+      "epoch": 0.1644,
+      "grad_norm": 0.9220455884933472,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 16440
+    },
+    {
+      "epoch": 0.16441,
+      "grad_norm": 1.2636245489120483,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 16441
+    },
+    {
+      "epoch": 0.16442,
+      "grad_norm": 0.678617537021637,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 16442
+    },
+    {
+      "epoch": 0.16443,
+      "grad_norm": 0.49778828024864197,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 16443
+    },
+    {
+      "epoch": 0.16444,
+      "grad_norm": 0.5420036315917969,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 16444
+    },
+    {
+      "epoch": 0.16445,
+      "grad_norm": 0.727878212928772,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 16445
+    },
+    {
+      "epoch": 0.16446,
+      "grad_norm": 0.7359219789505005,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 16446
+    },
+    {
+      "epoch": 0.16447,
+      "grad_norm": 0.5902706980705261,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 16447
+    },
+    {
+      "epoch": 0.16448,
+      "grad_norm": 0.5670353770256042,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 16448
+    },
+    {
+      "epoch": 0.16449,
+      "grad_norm": 0.6856237053871155,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 16449
+    },
+    {
+      "epoch": 0.1645,
+      "grad_norm": 0.8142359852790833,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 16450
+    },
+    {
+      "epoch": 0.16451,
+      "grad_norm": 0.8532720804214478,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 16451
+    },
+    {
+      "epoch": 0.16452,
+      "grad_norm": 1.0314480066299438,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 16452
+    },
+    {
+      "epoch": 0.16453,
+      "grad_norm": 1.0708210468292236,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 16453
+    },
+    {
+      "epoch": 0.16454,
+      "grad_norm": 0.8132475018501282,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 16454
+    },
+    {
+      "epoch": 0.16455,
+      "grad_norm": 0.7674928307533264,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 16455
+    },
+    {
+      "epoch": 0.16456,
+      "grad_norm": 0.8469036221504211,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 16456
+    },
+    {
+      "epoch": 0.16457,
+      "grad_norm": 0.852376401424408,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 16457
+    },
+    {
+      "epoch": 0.16458,
+      "grad_norm": 0.8676538467407227,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 16458
+    },
+    {
+      "epoch": 0.16459,
+      "grad_norm": 1.0059013366699219,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 16459
+    },
+    {
+      "epoch": 0.1646,
+      "grad_norm": 1.1781777143478394,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 16460
+    },
+    {
+      "epoch": 0.16461,
+      "grad_norm": 0.795784056186676,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 16461
+    },
+    {
+      "epoch": 0.16462,
+      "grad_norm": 0.6251187920570374,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 16462
+    },
+    {
+      "epoch": 0.16463,
+      "grad_norm": 0.5951520204544067,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 16463
+    },
+    {
+      "epoch": 0.16464,
+      "grad_norm": 0.6764260530471802,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 16464
+    },
+    {
+      "epoch": 0.16465,
+      "grad_norm": 0.7524860501289368,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 16465
+    },
+    {
+      "epoch": 0.16466,
+      "grad_norm": 0.9263510704040527,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 16466
+    },
+    {
+      "epoch": 0.16467,
+      "grad_norm": 1.0180227756500244,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 16467
+    },
+    {
+      "epoch": 0.16468,
+      "grad_norm": 0.9685873985290527,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 16468
+    },
+    {
+      "epoch": 0.16469,
+      "grad_norm": 1.026856780052185,
+      "learning_rate": 0.003,
+      "loss": 4.0321,
+      "step": 16469
+    },
+    {
+      "epoch": 0.1647,
+      "grad_norm": 0.895588755607605,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 16470
+    },
+    {
+      "epoch": 0.16471,
+      "grad_norm": 0.8129714727401733,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 16471
+    },
+    {
+      "epoch": 0.16472,
+      "grad_norm": 1.013070821762085,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 16472
+    },
+    {
+      "epoch": 0.16473,
+      "grad_norm": 1.2826343774795532,
+      "learning_rate": 0.003,
+      "loss": 4.0393,
+      "step": 16473
+    },
+    {
+      "epoch": 0.16474,
+      "grad_norm": 0.7825457453727722,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 16474
+    },
+    {
+      "epoch": 0.16475,
+      "grad_norm": 0.6030428409576416,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 16475
+    },
+    {
+      "epoch": 0.16476,
+      "grad_norm": 0.6580950617790222,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 16476
+    },
+    {
+      "epoch": 0.16477,
+      "grad_norm": 0.6692718267440796,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 16477
+    },
+    {
+      "epoch": 0.16478,
+      "grad_norm": 0.7654966711997986,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 16478
+    },
+    {
+      "epoch": 0.16479,
+      "grad_norm": 0.7372002601623535,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 16479
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.7242112755775452,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 16480
+    },
+    {
+      "epoch": 0.16481,
+      "grad_norm": 0.8388112783432007,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 16481
+    },
+    {
+      "epoch": 0.16482,
+      "grad_norm": 0.8586564660072327,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 16482
+    },
+    {
+      "epoch": 0.16483,
+      "grad_norm": 0.9531363844871521,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 16483
+    },
+    {
+      "epoch": 0.16484,
+      "grad_norm": 1.023642897605896,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 16484
+    },
+    {
+      "epoch": 0.16485,
+      "grad_norm": 0.9483336806297302,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 16485
+    },
+    {
+      "epoch": 0.16486,
+      "grad_norm": 0.8429510593414307,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 16486
+    },
+    {
+      "epoch": 0.16487,
+      "grad_norm": 0.8054298162460327,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 16487
+    },
+    {
+      "epoch": 0.16488,
+      "grad_norm": 1.0099833011627197,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 16488
+    },
+    {
+      "epoch": 0.16489,
+      "grad_norm": 1.0440032482147217,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 16489
+    },
+    {
+      "epoch": 0.1649,
+      "grad_norm": 0.9935981035232544,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 16490
+    },
+    {
+      "epoch": 0.16491,
+      "grad_norm": 0.8320918083190918,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 16491
+    },
+    {
+      "epoch": 0.16492,
+      "grad_norm": 0.6321850419044495,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 16492
+    },
+    {
+      "epoch": 0.16493,
+      "grad_norm": 0.5548147559165955,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 16493
+    },
+    {
+      "epoch": 0.16494,
+      "grad_norm": 0.6225996017456055,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 16494
+    },
+    {
+      "epoch": 0.16495,
+      "grad_norm": 0.7261919975280762,
+      "learning_rate": 0.003,
+      "loss": 4.0346,
+      "step": 16495
+    },
+    {
+      "epoch": 0.16496,
+      "grad_norm": 0.774777352809906,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 16496
+    },
+    {
+      "epoch": 0.16497,
+      "grad_norm": 0.6230810284614563,
+      "learning_rate": 0.003,
+      "loss": 3.9653,
+      "step": 16497
+    },
+    {
+      "epoch": 0.16498,
+      "grad_norm": 0.5928381085395813,
+      "learning_rate": 0.003,
+      "loss": 3.9545,
+      "step": 16498
+    },
+    {
+      "epoch": 0.16499,
+      "grad_norm": 0.6339703798294067,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 16499
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.593946099281311,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 16500
+    },
+    {
+      "epoch": 0.16501,
+      "grad_norm": 0.7014836072921753,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 16501
+    },
+    {
+      "epoch": 0.16502,
+      "grad_norm": 0.9443208575248718,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 16502
+    },
+    {
+      "epoch": 0.16503,
+      "grad_norm": 1.3297762870788574,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 16503
+    },
+    {
+      "epoch": 0.16504,
+      "grad_norm": 0.6993222236633301,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 16504
+    },
+    {
+      "epoch": 0.16505,
+      "grad_norm": 0.7262223958969116,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 16505
+    },
+    {
+      "epoch": 0.16506,
+      "grad_norm": 0.7761648893356323,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 16506
+    },
+    {
+      "epoch": 0.16507,
+      "grad_norm": 0.6623278260231018,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 16507
+    },
+    {
+      "epoch": 0.16508,
+      "grad_norm": 0.6675810813903809,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 16508
+    },
+    {
+      "epoch": 0.16509,
+      "grad_norm": 0.6640301942825317,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 16509
+    },
+    {
+      "epoch": 0.1651,
+      "grad_norm": 0.7328985929489136,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 16510
+    },
+    {
+      "epoch": 0.16511,
+      "grad_norm": 0.861212968826294,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 16511
+    },
+    {
+      "epoch": 0.16512,
+      "grad_norm": 0.9499975442886353,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 16512
+    },
+    {
+      "epoch": 0.16513,
+      "grad_norm": 1.1305769681930542,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 16513
+    },
+    {
+      "epoch": 0.16514,
+      "grad_norm": 0.7863883376121521,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 16514
+    },
+    {
+      "epoch": 0.16515,
+      "grad_norm": 0.7641639113426208,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 16515
+    },
+    {
+      "epoch": 0.16516,
+      "grad_norm": 0.8611436486244202,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 16516
+    },
+    {
+      "epoch": 0.16517,
+      "grad_norm": 0.9765501022338867,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 16517
+    },
+    {
+      "epoch": 0.16518,
+      "grad_norm": 0.8455554246902466,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 16518
+    },
+    {
+      "epoch": 0.16519,
+      "grad_norm": 0.692939043045044,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 16519
+    },
+    {
+      "epoch": 0.1652,
+      "grad_norm": 0.7636265754699707,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 16520
+    },
+    {
+      "epoch": 0.16521,
+      "grad_norm": 0.7926337122917175,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 16521
+    },
+    {
+      "epoch": 0.16522,
+      "grad_norm": 0.7739710211753845,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 16522
+    },
+    {
+      "epoch": 0.16523,
+      "grad_norm": 0.8571917414665222,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 16523
+    },
+    {
+      "epoch": 0.16524,
+      "grad_norm": 0.9024523496627808,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 16524
+    },
+    {
+      "epoch": 0.16525,
+      "grad_norm": 0.8396733999252319,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 16525
+    },
+    {
+      "epoch": 0.16526,
+      "grad_norm": 0.9109805226325989,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 16526
+    },
+    {
+      "epoch": 0.16527,
+      "grad_norm": 0.8133218884468079,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 16527
+    },
+    {
+      "epoch": 0.16528,
+      "grad_norm": 0.7340258955955505,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 16528
+    },
+    {
+      "epoch": 0.16529,
+      "grad_norm": 0.6597772836685181,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 16529
+    },
+    {
+      "epoch": 0.1653,
+      "grad_norm": 0.7231074571609497,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 16530
+    },
+    {
+      "epoch": 0.16531,
+      "grad_norm": 0.704673707485199,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 16531
+    },
+    {
+      "epoch": 0.16532,
+      "grad_norm": 0.6683825850486755,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 16532
+    },
+    {
+      "epoch": 0.16533,
+      "grad_norm": 0.7330564260482788,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 16533
+    },
+    {
+      "epoch": 0.16534,
+      "grad_norm": 0.8109389543533325,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 16534
+    },
+    {
+      "epoch": 0.16535,
+      "grad_norm": 0.9890372157096863,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 16535
+    },
+    {
+      "epoch": 0.16536,
+      "grad_norm": 1.5045017004013062,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 16536
+    },
+    {
+      "epoch": 0.16537,
+      "grad_norm": 0.7912871241569519,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 16537
+    },
+    {
+      "epoch": 0.16538,
+      "grad_norm": 0.8432285189628601,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 16538
+    },
+    {
+      "epoch": 0.16539,
+      "grad_norm": 1.0448075532913208,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 16539
+    },
+    {
+      "epoch": 0.1654,
+      "grad_norm": 0.9978594779968262,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 16540
+    },
+    {
+      "epoch": 0.16541,
+      "grad_norm": 0.9974130988121033,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 16541
+    },
+    {
+      "epoch": 0.16542,
+      "grad_norm": 1.2907527685165405,
+      "learning_rate": 0.003,
+      "loss": 4.0409,
+      "step": 16542
+    },
+    {
+      "epoch": 0.16543,
+      "grad_norm": 0.981609046459198,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 16543
+    },
+    {
+      "epoch": 0.16544,
+      "grad_norm": 1.1068936586380005,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 16544
+    },
+    {
+      "epoch": 0.16545,
+      "grad_norm": 1.0436385869979858,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 16545
+    },
+    {
+      "epoch": 0.16546,
+      "grad_norm": 0.9572142958641052,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 16546
+    },
+    {
+      "epoch": 0.16547,
+      "grad_norm": 1.1582400798797607,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 16547
+    },
+    {
+      "epoch": 0.16548,
+      "grad_norm": 1.0990976095199585,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 16548
+    },
+    {
+      "epoch": 0.16549,
+      "grad_norm": 0.7857990264892578,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16549
+    },
+    {
+      "epoch": 0.1655,
+      "grad_norm": 0.8027048110961914,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 16550
+    },
+    {
+      "epoch": 0.16551,
+      "grad_norm": 0.73116534948349,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 16551
+    },
+    {
+      "epoch": 0.16552,
+      "grad_norm": 0.6601966619491577,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 16552
+    },
+    {
+      "epoch": 0.16553,
+      "grad_norm": 0.6313268542289734,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 16553
+    },
+    {
+      "epoch": 0.16554,
+      "grad_norm": 0.82774418592453,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 16554
+    },
+    {
+      "epoch": 0.16555,
+      "grad_norm": 1.204330563545227,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 16555
+    },
+    {
+      "epoch": 0.16556,
+      "grad_norm": 0.6298305988311768,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 16556
+    },
+    {
+      "epoch": 0.16557,
+      "grad_norm": 0.6001787185668945,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 16557
+    },
+    {
+      "epoch": 0.16558,
+      "grad_norm": 0.665256142616272,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 16558
+    },
+    {
+      "epoch": 0.16559,
+      "grad_norm": 0.7350388765335083,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 16559
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.9374911785125732,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 16560
+    },
+    {
+      "epoch": 0.16561,
+      "grad_norm": 0.9782078266143799,
+      "learning_rate": 0.003,
+      "loss": 4.0326,
+      "step": 16561
+    },
+    {
+      "epoch": 0.16562,
+      "grad_norm": 0.7695868015289307,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 16562
+    },
+    {
+      "epoch": 0.16563,
+      "grad_norm": 0.7391797304153442,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 16563
+    },
+    {
+      "epoch": 0.16564,
+      "grad_norm": 0.7675747275352478,
+      "learning_rate": 0.003,
+      "loss": 3.9744,
+      "step": 16564
+    },
+    {
+      "epoch": 0.16565,
+      "grad_norm": 0.7832668423652649,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 16565
+    },
+    {
+      "epoch": 0.16566,
+      "grad_norm": 0.6910805702209473,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 16566
+    },
+    {
+      "epoch": 0.16567,
+      "grad_norm": 0.7149823307991028,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 16567
+    },
+    {
+      "epoch": 0.16568,
+      "grad_norm": 0.6877647638320923,
+      "learning_rate": 0.003,
+      "loss": 3.9765,
+      "step": 16568
+    },
+    {
+      "epoch": 0.16569,
+      "grad_norm": 0.7742563486099243,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 16569
+    },
+    {
+      "epoch": 0.1657,
+      "grad_norm": 0.8138877153396606,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 16570
+    },
+    {
+      "epoch": 0.16571,
+      "grad_norm": 0.7903016805648804,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 16571
+    },
+    {
+      "epoch": 0.16572,
+      "grad_norm": 0.861447811126709,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 16572
+    },
+    {
+      "epoch": 0.16573,
+      "grad_norm": 0.8637712001800537,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 16573
+    },
+    {
+      "epoch": 0.16574,
+      "grad_norm": 0.8199882507324219,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 16574
+    },
+    {
+      "epoch": 0.16575,
+      "grad_norm": 0.7627727389335632,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 16575
+    },
+    {
+      "epoch": 0.16576,
+      "grad_norm": 0.9434555768966675,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 16576
+    },
+    {
+      "epoch": 0.16577,
+      "grad_norm": 1.17056143283844,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 16577
+    },
+    {
+      "epoch": 0.16578,
+      "grad_norm": 0.8861210346221924,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 16578
+    },
+    {
+      "epoch": 0.16579,
+      "grad_norm": 0.977642834186554,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 16579
+    },
+    {
+      "epoch": 0.1658,
+      "grad_norm": 1.097947120666504,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 16580
+    },
+    {
+      "epoch": 0.16581,
+      "grad_norm": 0.8989746570587158,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 16581
+    },
+    {
+      "epoch": 0.16582,
+      "grad_norm": 0.8438913822174072,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 16582
+    },
+    {
+      "epoch": 0.16583,
+      "grad_norm": 1.0367250442504883,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 16583
+    },
+    {
+      "epoch": 0.16584,
+      "grad_norm": 0.9683894515037537,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 16584
+    },
+    {
+      "epoch": 0.16585,
+      "grad_norm": 0.8536483645439148,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 16585
+    },
+    {
+      "epoch": 0.16586,
+      "grad_norm": 0.8029671907424927,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 16586
+    },
+    {
+      "epoch": 0.16587,
+      "grad_norm": 0.7941018342971802,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 16587
+    },
+    {
+      "epoch": 0.16588,
+      "grad_norm": 0.7110337615013123,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 16588
+    },
+    {
+      "epoch": 0.16589,
+      "grad_norm": 0.6337109804153442,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 16589
+    },
+    {
+      "epoch": 0.1659,
+      "grad_norm": 0.5970135927200317,
+      "learning_rate": 0.003,
+      "loss": 3.9791,
+      "step": 16590
+    },
+    {
+      "epoch": 0.16591,
+      "grad_norm": 0.7086123824119568,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 16591
+    },
+    {
+      "epoch": 0.16592,
+      "grad_norm": 0.7931826114654541,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 16592
+    },
+    {
+      "epoch": 0.16593,
+      "grad_norm": 0.9041086435317993,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 16593
+    },
+    {
+      "epoch": 0.16594,
+      "grad_norm": 0.8759629726409912,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 16594
+    },
+    {
+      "epoch": 0.16595,
+      "grad_norm": 0.7272791862487793,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 16595
+    },
+    {
+      "epoch": 0.16596,
+      "grad_norm": 0.6055447459220886,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 16596
+    },
+    {
+      "epoch": 0.16597,
+      "grad_norm": 0.65510493516922,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 16597
+    },
+    {
+      "epoch": 0.16598,
+      "grad_norm": 0.7917929887771606,
+      "learning_rate": 0.003,
+      "loss": 3.9688,
+      "step": 16598
+    },
+    {
+      "epoch": 0.16599,
+      "grad_norm": 1.0322705507278442,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 16599
+    },
+    {
+      "epoch": 0.166,
+      "grad_norm": 1.1653863191604614,
+      "learning_rate": 0.003,
+      "loss": 3.9752,
+      "step": 16600
+    },
+    {
+      "epoch": 0.16601,
+      "grad_norm": 0.7524556517601013,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 16601
+    },
+    {
+      "epoch": 0.16602,
+      "grad_norm": 0.7310243248939514,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 16602
+    },
+    {
+      "epoch": 0.16603,
+      "grad_norm": 0.7169476747512817,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 16603
+    },
+    {
+      "epoch": 0.16604,
+      "grad_norm": 0.7632569670677185,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 16604
+    },
+    {
+      "epoch": 0.16605,
+      "grad_norm": 0.7261447906494141,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 16605
+    },
+    {
+      "epoch": 0.16606,
+      "grad_norm": 0.7755059599876404,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 16606
+    },
+    {
+      "epoch": 0.16607,
+      "grad_norm": 0.7412046790122986,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 16607
+    },
+    {
+      "epoch": 0.16608,
+      "grad_norm": 0.9550292491912842,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 16608
+    },
+    {
+      "epoch": 0.16609,
+      "grad_norm": 1.1417499780654907,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 16609
+    },
+    {
+      "epoch": 0.1661,
+      "grad_norm": 0.9677286744117737,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 16610
+    },
+    {
+      "epoch": 0.16611,
+      "grad_norm": 0.9584037065505981,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 16611
+    },
+    {
+      "epoch": 0.16612,
+      "grad_norm": 0.834425687789917,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 16612
+    },
+    {
+      "epoch": 0.16613,
+      "grad_norm": 0.8633244037628174,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 16613
+    },
+    {
+      "epoch": 0.16614,
+      "grad_norm": 0.9459644556045532,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 16614
+    },
+    {
+      "epoch": 0.16615,
+      "grad_norm": 1.3256431818008423,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 16615
+    },
+    {
+      "epoch": 0.16616,
+      "grad_norm": 0.7476480603218079,
+      "learning_rate": 0.003,
+      "loss": 4.0584,
+      "step": 16616
+    },
+    {
+      "epoch": 0.16617,
+      "grad_norm": 0.7218948006629944,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 16617
+    },
+    {
+      "epoch": 0.16618,
+      "grad_norm": 0.9353631734848022,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 16618
+    },
+    {
+      "epoch": 0.16619,
+      "grad_norm": 0.949628472328186,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16619
+    },
+    {
+      "epoch": 0.1662,
+      "grad_norm": 1.0367956161499023,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 16620
+    },
+    {
+      "epoch": 0.16621,
+      "grad_norm": 1.2297966480255127,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 16621
+    },
+    {
+      "epoch": 0.16622,
+      "grad_norm": 0.6927123069763184,
+      "learning_rate": 0.003,
+      "loss": 3.9726,
+      "step": 16622
+    },
+    {
+      "epoch": 0.16623,
+      "grad_norm": 0.6782436966896057,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 16623
+    },
+    {
+      "epoch": 0.16624,
+      "grad_norm": 0.645195722579956,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 16624
+    },
+    {
+      "epoch": 0.16625,
+      "grad_norm": 0.6785486340522766,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 16625
+    },
+    {
+      "epoch": 0.16626,
+      "grad_norm": 0.8116571307182312,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 16626
+    },
+    {
+      "epoch": 0.16627,
+      "grad_norm": 0.9605870842933655,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 16627
+    },
+    {
+      "epoch": 0.16628,
+      "grad_norm": 1.026713490486145,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 16628
+    },
+    {
+      "epoch": 0.16629,
+      "grad_norm": 0.8040949106216431,
+      "learning_rate": 0.003,
+      "loss": 4.0475,
+      "step": 16629
+    },
+    {
+      "epoch": 0.1663,
+      "grad_norm": 0.6801572442054749,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 16630
+    },
+    {
+      "epoch": 0.16631,
+      "grad_norm": 0.7101470232009888,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 16631
+    },
+    {
+      "epoch": 0.16632,
+      "grad_norm": 0.78169846534729,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 16632
+    },
+    {
+      "epoch": 0.16633,
+      "grad_norm": 0.7497729659080505,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 16633
+    },
+    {
+      "epoch": 0.16634,
+      "grad_norm": 0.7496538758277893,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 16634
+    },
+    {
+      "epoch": 0.16635,
+      "grad_norm": 0.7790054082870483,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 16635
+    },
+    {
+      "epoch": 0.16636,
+      "grad_norm": 0.8766295313835144,
+      "learning_rate": 0.003,
+      "loss": 3.9589,
+      "step": 16636
+    },
+    {
+      "epoch": 0.16637,
+      "grad_norm": 0.895172119140625,
+      "learning_rate": 0.003,
+      "loss": 3.9737,
+      "step": 16637
+    },
+    {
+      "epoch": 0.16638,
+      "grad_norm": 0.9995051026344299,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 16638
+    },
+    {
+      "epoch": 0.16639,
+      "grad_norm": 0.9869672656059265,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 16639
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.9185425043106079,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 16640
+    },
+    {
+      "epoch": 0.16641,
+      "grad_norm": 0.9075988531112671,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 16641
+    },
+    {
+      "epoch": 0.16642,
+      "grad_norm": 0.8588910698890686,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 16642
+    },
+    {
+      "epoch": 0.16643,
+      "grad_norm": 0.8271640539169312,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 16643
+    },
+    {
+      "epoch": 0.16644,
+      "grad_norm": 0.7942617535591125,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 16644
+    },
+    {
+      "epoch": 0.16645,
+      "grad_norm": 0.8908455967903137,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 16645
+    },
+    {
+      "epoch": 0.16646,
+      "grad_norm": 0.8527781963348389,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 16646
+    },
+    {
+      "epoch": 0.16647,
+      "grad_norm": 0.9275540113449097,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 16647
+    },
+    {
+      "epoch": 0.16648,
+      "grad_norm": 0.8698004484176636,
+      "learning_rate": 0.003,
+      "loss": 3.9794,
+      "step": 16648
+    },
+    {
+      "epoch": 0.16649,
+      "grad_norm": 0.7929522395133972,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 16649
+    },
+    {
+      "epoch": 0.1665,
+      "grad_norm": 0.7279219031333923,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 16650
+    },
+    {
+      "epoch": 0.16651,
+      "grad_norm": 0.6361696720123291,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 16651
+    },
+    {
+      "epoch": 0.16652,
+      "grad_norm": 0.7483872175216675,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 16652
+    },
+    {
+      "epoch": 0.16653,
+      "grad_norm": 0.8242842555046082,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 16653
+    },
+    {
+      "epoch": 0.16654,
+      "grad_norm": 0.7325317859649658,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 16654
+    },
+    {
+      "epoch": 0.16655,
+      "grad_norm": 0.6815661787986755,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 16655
+    },
+    {
+      "epoch": 0.16656,
+      "grad_norm": 0.8499429821968079,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 16656
+    },
+    {
+      "epoch": 0.16657,
+      "grad_norm": 1.0942624807357788,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 16657
+    },
+    {
+      "epoch": 0.16658,
+      "grad_norm": 1.0901638269424438,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 16658
+    },
+    {
+      "epoch": 0.16659,
+      "grad_norm": 0.8472113013267517,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 16659
+    },
+    {
+      "epoch": 0.1666,
+      "grad_norm": 0.6041252613067627,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 16660
+    },
+    {
+      "epoch": 0.16661,
+      "grad_norm": 0.8856112957000732,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 16661
+    },
+    {
+      "epoch": 0.16662,
+      "grad_norm": 1.2002215385437012,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 16662
+    },
+    {
+      "epoch": 0.16663,
+      "grad_norm": 0.8653046488761902,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 16663
+    },
+    {
+      "epoch": 0.16664,
+      "grad_norm": 0.8390600085258484,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 16664
+    },
+    {
+      "epoch": 0.16665,
+      "grad_norm": 0.8085476160049438,
+      "learning_rate": 0.003,
+      "loss": 4.0335,
+      "step": 16665
+    },
+    {
+      "epoch": 0.16666,
+      "grad_norm": 0.8118124604225159,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 16666
+    },
+    {
+      "epoch": 0.16667,
+      "grad_norm": 0.8476701378822327,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 16667
+    },
+    {
+      "epoch": 0.16668,
+      "grad_norm": 0.6915823817253113,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 16668
+    },
+    {
+      "epoch": 0.16669,
+      "grad_norm": 0.6230525374412537,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 16669
+    },
+    {
+      "epoch": 0.1667,
+      "grad_norm": 0.6813180446624756,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 16670
+    },
+    {
+      "epoch": 0.16671,
+      "grad_norm": 0.7478421926498413,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 16671
+    },
+    {
+      "epoch": 0.16672,
+      "grad_norm": 0.7379879951477051,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 16672
+    },
+    {
+      "epoch": 0.16673,
+      "grad_norm": 0.9299353361129761,
+      "learning_rate": 0.003,
+      "loss": 3.9689,
+      "step": 16673
+    },
+    {
+      "epoch": 0.16674,
+      "grad_norm": 1.1519417762756348,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 16674
+    },
+    {
+      "epoch": 0.16675,
+      "grad_norm": 0.934518575668335,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 16675
+    },
+    {
+      "epoch": 0.16676,
+      "grad_norm": 0.9274440407752991,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 16676
+    },
+    {
+      "epoch": 0.16677,
+      "grad_norm": 0.9019926190376282,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 16677
+    },
+    {
+      "epoch": 0.16678,
+      "grad_norm": 0.9301829934120178,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 16678
+    },
+    {
+      "epoch": 0.16679,
+      "grad_norm": 1.0301803350448608,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 16679
+    },
+    {
+      "epoch": 0.1668,
+      "grad_norm": 0.9349686503410339,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 16680
+    },
+    {
+      "epoch": 0.16681,
+      "grad_norm": 1.0394617319107056,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 16681
+    },
+    {
+      "epoch": 0.16682,
+      "grad_norm": 1.0207794904708862,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 16682
+    },
+    {
+      "epoch": 0.16683,
+      "grad_norm": 0.8925315737724304,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 16683
+    },
+    {
+      "epoch": 0.16684,
+      "grad_norm": 0.7841940522193909,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 16684
+    },
+    {
+      "epoch": 0.16685,
+      "grad_norm": 0.7427353262901306,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 16685
+    },
+    {
+      "epoch": 0.16686,
+      "grad_norm": 0.6968230605125427,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 16686
+    },
+    {
+      "epoch": 0.16687,
+      "grad_norm": 0.5778324007987976,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 16687
+    },
+    {
+      "epoch": 0.16688,
+      "grad_norm": 0.6392512321472168,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 16688
+    },
+    {
+      "epoch": 0.16689,
+      "grad_norm": 0.7645626664161682,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 16689
+    },
+    {
+      "epoch": 0.1669,
+      "grad_norm": 0.8958572745323181,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 16690
+    },
+    {
+      "epoch": 0.16691,
+      "grad_norm": 0.8632878065109253,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 16691
+    },
+    {
+      "epoch": 0.16692,
+      "grad_norm": 0.75998854637146,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 16692
+    },
+    {
+      "epoch": 0.16693,
+      "grad_norm": 0.7607892155647278,
+      "learning_rate": 0.003,
+      "loss": 3.9817,
+      "step": 16693
+    },
+    {
+      "epoch": 0.16694,
+      "grad_norm": 0.8183885216712952,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 16694
+    },
+    {
+      "epoch": 0.16695,
+      "grad_norm": 0.8750672936439514,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 16695
+    },
+    {
+      "epoch": 0.16696,
+      "grad_norm": 0.9015554189682007,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 16696
+    },
+    {
+      "epoch": 0.16697,
+      "grad_norm": 1.065500259399414,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 16697
+    },
+    {
+      "epoch": 0.16698,
+      "grad_norm": 1.045125961303711,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 16698
+    },
+    {
+      "epoch": 0.16699,
+      "grad_norm": 0.9421795606613159,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 16699
+    },
+    {
+      "epoch": 0.167,
+      "grad_norm": 0.8270700573921204,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 16700
+    },
+    {
+      "epoch": 0.16701,
+      "grad_norm": 0.771172821521759,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16701
+    },
+    {
+      "epoch": 0.16702,
+      "grad_norm": 0.7797114849090576,
+      "learning_rate": 0.003,
+      "loss": 3.9778,
+      "step": 16702
+    },
+    {
+      "epoch": 0.16703,
+      "grad_norm": 0.7795373797416687,
+      "learning_rate": 0.003,
+      "loss": 3.9736,
+      "step": 16703
+    },
+    {
+      "epoch": 0.16704,
+      "grad_norm": 0.8451706171035767,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 16704
+    },
+    {
+      "epoch": 0.16705,
+      "grad_norm": 0.8600301146507263,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16705
+    },
+    {
+      "epoch": 0.16706,
+      "grad_norm": 0.8787464499473572,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 16706
+    },
+    {
+      "epoch": 0.16707,
+      "grad_norm": 0.8668013215065002,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16707
+    },
+    {
+      "epoch": 0.16708,
+      "grad_norm": 0.8679298758506775,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 16708
+    },
+    {
+      "epoch": 0.16709,
+      "grad_norm": 0.8530462980270386,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 16709
+    },
+    {
+      "epoch": 0.1671,
+      "grad_norm": 0.8643242716789246,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 16710
+    },
+    {
+      "epoch": 0.16711,
+      "grad_norm": 0.8478008508682251,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 16711
+    },
+    {
+      "epoch": 0.16712,
+      "grad_norm": 0.7355970144271851,
+      "learning_rate": 0.003,
+      "loss": 3.9622,
+      "step": 16712
+    },
+    {
+      "epoch": 0.16713,
+      "grad_norm": 0.6986602544784546,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 16713
+    },
+    {
+      "epoch": 0.16714,
+      "grad_norm": 0.7060433030128479,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 16714
+    },
+    {
+      "epoch": 0.16715,
+      "grad_norm": 0.7944496273994446,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 16715
+    },
+    {
+      "epoch": 0.16716,
+      "grad_norm": 0.8400862812995911,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 16716
+    },
+    {
+      "epoch": 0.16717,
+      "grad_norm": 0.7830513715744019,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 16717
+    },
+    {
+      "epoch": 0.16718,
+      "grad_norm": 0.8651089072227478,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 16718
+    },
+    {
+      "epoch": 0.16719,
+      "grad_norm": 0.9211392402648926,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 16719
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.8560593724250793,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 16720
+    },
+    {
+      "epoch": 0.16721,
+      "grad_norm": 0.7442037463188171,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 16721
+    },
+    {
+      "epoch": 0.16722,
+      "grad_norm": 0.783574104309082,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 16722
+    },
+    {
+      "epoch": 0.16723,
+      "grad_norm": 0.8946051001548767,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 16723
+    },
+    {
+      "epoch": 0.16724,
+      "grad_norm": 0.8144287467002869,
+      "learning_rate": 0.003,
+      "loss": 4.0387,
+      "step": 16724
+    },
+    {
+      "epoch": 0.16725,
+      "grad_norm": 0.6272501349449158,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 16725
+    },
+    {
+      "epoch": 0.16726,
+      "grad_norm": 0.6548783183097839,
+      "learning_rate": 0.003,
+      "loss": 3.9769,
+      "step": 16726
+    },
+    {
+      "epoch": 0.16727,
+      "grad_norm": 0.7995438575744629,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 16727
+    },
+    {
+      "epoch": 0.16728,
+      "grad_norm": 0.9519756436347961,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 16728
+    },
+    {
+      "epoch": 0.16729,
+      "grad_norm": 1.064491868019104,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 16729
+    },
+    {
+      "epoch": 0.1673,
+      "grad_norm": 0.819176197052002,
+      "learning_rate": 0.003,
+      "loss": 4.0508,
+      "step": 16730
+    },
+    {
+      "epoch": 0.16731,
+      "grad_norm": 0.6019748449325562,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 16731
+    },
+    {
+      "epoch": 0.16732,
+      "grad_norm": 0.7550683617591858,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 16732
+    },
+    {
+      "epoch": 0.16733,
+      "grad_norm": 0.9951023459434509,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 16733
+    },
+    {
+      "epoch": 0.16734,
+      "grad_norm": 1.3040541410446167,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 16734
+    },
+    {
+      "epoch": 0.16735,
+      "grad_norm": 0.8052306175231934,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 16735
+    },
+    {
+      "epoch": 0.16736,
+      "grad_norm": 0.7533053159713745,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 16736
+    },
+    {
+      "epoch": 0.16737,
+      "grad_norm": 0.7331545948982239,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 16737
+    },
+    {
+      "epoch": 0.16738,
+      "grad_norm": 0.7126925587654114,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 16738
+    },
+    {
+      "epoch": 0.16739,
+      "grad_norm": 0.707188606262207,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 16739
+    },
+    {
+      "epoch": 0.1674,
+      "grad_norm": 0.6312726736068726,
+      "learning_rate": 0.003,
+      "loss": 3.9826,
+      "step": 16740
+    },
+    {
+      "epoch": 0.16741,
+      "grad_norm": 0.631080150604248,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 16741
+    },
+    {
+      "epoch": 0.16742,
+      "grad_norm": 0.6698132157325745,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 16742
+    },
+    {
+      "epoch": 0.16743,
+      "grad_norm": 0.8329685926437378,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 16743
+    },
+    {
+      "epoch": 0.16744,
+      "grad_norm": 1.090051293373108,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 16744
+    },
+    {
+      "epoch": 0.16745,
+      "grad_norm": 1.2471741437911987,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 16745
+    },
+    {
+      "epoch": 0.16746,
+      "grad_norm": 0.7032690048217773,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 16746
+    },
+    {
+      "epoch": 0.16747,
+      "grad_norm": 0.5997713804244995,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 16747
+    },
+    {
+      "epoch": 0.16748,
+      "grad_norm": 0.7260429263114929,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 16748
+    },
+    {
+      "epoch": 0.16749,
+      "grad_norm": 0.8784670829772949,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 16749
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.969279944896698,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 16750
+    },
+    {
+      "epoch": 0.16751,
+      "grad_norm": 0.830579936504364,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 16751
+    },
+    {
+      "epoch": 0.16752,
+      "grad_norm": 0.7464983463287354,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 16752
+    },
+    {
+      "epoch": 0.16753,
+      "grad_norm": 0.7157581448554993,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 16753
+    },
+    {
+      "epoch": 0.16754,
+      "grad_norm": 0.8800062537193298,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 16754
+    },
+    {
+      "epoch": 0.16755,
+      "grad_norm": 0.9574975371360779,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 16755
+    },
+    {
+      "epoch": 0.16756,
+      "grad_norm": 0.9339281916618347,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 16756
+    },
+    {
+      "epoch": 0.16757,
+      "grad_norm": 1.0285277366638184,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 16757
+    },
+    {
+      "epoch": 0.16758,
+      "grad_norm": 1.0120329856872559,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 16758
+    },
+    {
+      "epoch": 0.16759,
+      "grad_norm": 0.9101712107658386,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 16759
+    },
+    {
+      "epoch": 0.1676,
+      "grad_norm": 1.0395931005477905,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 16760
+    },
+    {
+      "epoch": 0.16761,
+      "grad_norm": 1.0496395826339722,
+      "learning_rate": 0.003,
+      "loss": 4.0487,
+      "step": 16761
+    },
+    {
+      "epoch": 0.16762,
+      "grad_norm": 1.0535210371017456,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 16762
+    },
+    {
+      "epoch": 0.16763,
+      "grad_norm": 1.2106959819793701,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 16763
+    },
+    {
+      "epoch": 0.16764,
+      "grad_norm": 0.9122406840324402,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 16764
+    },
+    {
+      "epoch": 0.16765,
+      "grad_norm": 0.9126542210578918,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 16765
+    },
+    {
+      "epoch": 0.16766,
+      "grad_norm": 0.9665969014167786,
+      "learning_rate": 0.003,
+      "loss": 4.062,
+      "step": 16766
+    },
+    {
+      "epoch": 0.16767,
+      "grad_norm": 0.9059301614761353,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 16767
+    },
+    {
+      "epoch": 0.16768,
+      "grad_norm": 0.8744325637817383,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 16768
+    },
+    {
+      "epoch": 0.16769,
+      "grad_norm": 0.9320008754730225,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 16769
+    },
+    {
+      "epoch": 0.1677,
+      "grad_norm": 0.8519328236579895,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 16770
+    },
+    {
+      "epoch": 0.16771,
+      "grad_norm": 0.6243749856948853,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 16771
+    },
+    {
+      "epoch": 0.16772,
+      "grad_norm": 0.702466607093811,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 16772
+    },
+    {
+      "epoch": 0.16773,
+      "grad_norm": 0.7545448541641235,
+      "learning_rate": 0.003,
+      "loss": 3.9725,
+      "step": 16773
+    },
+    {
+      "epoch": 0.16774,
+      "grad_norm": 0.8205435872077942,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 16774
+    },
+    {
+      "epoch": 0.16775,
+      "grad_norm": 1.0039379596710205,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 16775
+    },
+    {
+      "epoch": 0.16776,
+      "grad_norm": 1.189281702041626,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 16776
+    },
+    {
+      "epoch": 0.16777,
+      "grad_norm": 0.6821681261062622,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 16777
+    },
+    {
+      "epoch": 0.16778,
+      "grad_norm": 0.6866998076438904,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 16778
+    },
+    {
+      "epoch": 0.16779,
+      "grad_norm": 0.813926637172699,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 16779
+    },
+    {
+      "epoch": 0.1678,
+      "grad_norm": 0.9408845901489258,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 16780
+    },
+    {
+      "epoch": 0.16781,
+      "grad_norm": 0.8010312914848328,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 16781
+    },
+    {
+      "epoch": 0.16782,
+      "grad_norm": 0.6520599126815796,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 16782
+    },
+    {
+      "epoch": 0.16783,
+      "grad_norm": 0.5568283200263977,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 16783
+    },
+    {
+      "epoch": 0.16784,
+      "grad_norm": 0.5546129941940308,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 16784
+    },
+    {
+      "epoch": 0.16785,
+      "grad_norm": 0.5724064707756042,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 16785
+    },
+    {
+      "epoch": 0.16786,
+      "grad_norm": 0.5485352873802185,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 16786
+    },
+    {
+      "epoch": 0.16787,
+      "grad_norm": 0.6238598227500916,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 16787
+    },
+    {
+      "epoch": 0.16788,
+      "grad_norm": 0.8130132555961609,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 16788
+    },
+    {
+      "epoch": 0.16789,
+      "grad_norm": 0.9928077459335327,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 16789
+    },
+    {
+      "epoch": 0.1679,
+      "grad_norm": 1.0398330688476562,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 16790
+    },
+    {
+      "epoch": 0.16791,
+      "grad_norm": 0.8814020752906799,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 16791
+    },
+    {
+      "epoch": 0.16792,
+      "grad_norm": 0.8732738494873047,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 16792
+    },
+    {
+      "epoch": 0.16793,
+      "grad_norm": 0.848568856716156,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 16793
+    },
+    {
+      "epoch": 0.16794,
+      "grad_norm": 0.9220749735832214,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 16794
+    },
+    {
+      "epoch": 0.16795,
+      "grad_norm": 1.1375749111175537,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 16795
+    },
+    {
+      "epoch": 0.16796,
+      "grad_norm": 0.8748759031295776,
+      "learning_rate": 0.003,
+      "loss": 3.9736,
+      "step": 16796
+    },
+    {
+      "epoch": 0.16797,
+      "grad_norm": 0.7760801315307617,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 16797
+    },
+    {
+      "epoch": 0.16798,
+      "grad_norm": 0.7592577338218689,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 16798
+    },
+    {
+      "epoch": 0.16799,
+      "grad_norm": 0.634211003780365,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 16799
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.5644124150276184,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 16800
+    },
+    {
+      "epoch": 0.16801,
+      "grad_norm": 0.5643258690834045,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 16801
+    },
+    {
+      "epoch": 0.16802,
+      "grad_norm": 0.6898741126060486,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 16802
+    },
+    {
+      "epoch": 0.16803,
+      "grad_norm": 0.7783620357513428,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 16803
+    },
+    {
+      "epoch": 0.16804,
+      "grad_norm": 0.9071454405784607,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 16804
+    },
+    {
+      "epoch": 0.16805,
+      "grad_norm": 1.0606682300567627,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 16805
+    },
+    {
+      "epoch": 0.16806,
+      "grad_norm": 0.7918750643730164,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 16806
+    },
+    {
+      "epoch": 0.16807,
+      "grad_norm": 0.5339258909225464,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 16807
+    },
+    {
+      "epoch": 0.16808,
+      "grad_norm": 0.5802764296531677,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 16808
+    },
+    {
+      "epoch": 0.16809,
+      "grad_norm": 0.613307774066925,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 16809
+    },
+    {
+      "epoch": 0.1681,
+      "grad_norm": 0.6131017804145813,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 16810
+    },
+    {
+      "epoch": 0.16811,
+      "grad_norm": 0.6089868545532227,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 16811
+    },
+    {
+      "epoch": 0.16812,
+      "grad_norm": 0.6131261587142944,
+      "learning_rate": 0.003,
+      "loss": 3.9787,
+      "step": 16812
+    },
+    {
+      "epoch": 0.16813,
+      "grad_norm": 0.6756606698036194,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 16813
+    },
+    {
+      "epoch": 0.16814,
+      "grad_norm": 0.8853565454483032,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 16814
+    },
+    {
+      "epoch": 0.16815,
+      "grad_norm": 1.149666428565979,
+      "learning_rate": 0.003,
+      "loss": 4.0531,
+      "step": 16815
+    },
+    {
+      "epoch": 0.16816,
+      "grad_norm": 1.1497527360916138,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 16816
+    },
+    {
+      "epoch": 0.16817,
+      "grad_norm": 0.889690637588501,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 16817
+    },
+    {
+      "epoch": 0.16818,
+      "grad_norm": 0.818009078502655,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 16818
+    },
+    {
+      "epoch": 0.16819,
+      "grad_norm": 0.9402246475219727,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 16819
+    },
+    {
+      "epoch": 0.1682,
+      "grad_norm": 1.109734058380127,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 16820
+    },
+    {
+      "epoch": 0.16821,
+      "grad_norm": 1.1249537467956543,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 16821
+    },
+    {
+      "epoch": 0.16822,
+      "grad_norm": 0.8454878926277161,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 16822
+    },
+    {
+      "epoch": 0.16823,
+      "grad_norm": 0.7891453504562378,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 16823
+    },
+    {
+      "epoch": 0.16824,
+      "grad_norm": 0.9035543203353882,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 16824
+    },
+    {
+      "epoch": 0.16825,
+      "grad_norm": 1.0534626245498657,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 16825
+    },
+    {
+      "epoch": 0.16826,
+      "grad_norm": 0.8612683415412903,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 16826
+    },
+    {
+      "epoch": 0.16827,
+      "grad_norm": 0.8174473643302917,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 16827
+    },
+    {
+      "epoch": 0.16828,
+      "grad_norm": 0.9313732385635376,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 16828
+    },
+    {
+      "epoch": 0.16829,
+      "grad_norm": 0.9340553283691406,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 16829
+    },
+    {
+      "epoch": 0.1683,
+      "grad_norm": 0.9243738651275635,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 16830
+    },
+    {
+      "epoch": 0.16831,
+      "grad_norm": 0.9216597080230713,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 16831
+    },
+    {
+      "epoch": 0.16832,
+      "grad_norm": 0.8400397300720215,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 16832
+    },
+    {
+      "epoch": 0.16833,
+      "grad_norm": 0.7118893265724182,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 16833
+    },
+    {
+      "epoch": 0.16834,
+      "grad_norm": 0.7463409900665283,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 16834
+    },
+    {
+      "epoch": 0.16835,
+      "grad_norm": 0.798879086971283,
+      "learning_rate": 0.003,
+      "loss": 3.9623,
+      "step": 16835
+    },
+    {
+      "epoch": 0.16836,
+      "grad_norm": 0.7490203976631165,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 16836
+    },
+    {
+      "epoch": 0.16837,
+      "grad_norm": 0.7311861515045166,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 16837
+    },
+    {
+      "epoch": 0.16838,
+      "grad_norm": 0.7826172113418579,
+      "learning_rate": 0.003,
+      "loss": 4.0357,
+      "step": 16838
+    },
+    {
+      "epoch": 0.16839,
+      "grad_norm": 0.7967138886451721,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 16839
+    },
+    {
+      "epoch": 0.1684,
+      "grad_norm": 0.8535910248756409,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 16840
+    },
+    {
+      "epoch": 0.16841,
+      "grad_norm": 0.9957929253578186,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 16841
+    },
+    {
+      "epoch": 0.16842,
+      "grad_norm": 1.0661824941635132,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 16842
+    },
+    {
+      "epoch": 0.16843,
+      "grad_norm": 0.8872292637825012,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 16843
+    },
+    {
+      "epoch": 0.16844,
+      "grad_norm": 0.641571581363678,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 16844
+    },
+    {
+      "epoch": 0.16845,
+      "grad_norm": 0.6273783445358276,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 16845
+    },
+    {
+      "epoch": 0.16846,
+      "grad_norm": 0.738833487033844,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 16846
+    },
+    {
+      "epoch": 0.16847,
+      "grad_norm": 0.9297849535942078,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 16847
+    },
+    {
+      "epoch": 0.16848,
+      "grad_norm": 1.1222742795944214,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 16848
+    },
+    {
+      "epoch": 0.16849,
+      "grad_norm": 0.9273313283920288,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 16849
+    },
+    {
+      "epoch": 0.1685,
+      "grad_norm": 0.8675814867019653,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 16850
+    },
+    {
+      "epoch": 0.16851,
+      "grad_norm": 0.8588464856147766,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 16851
+    },
+    {
+      "epoch": 0.16852,
+      "grad_norm": 0.8039618730545044,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 16852
+    },
+    {
+      "epoch": 0.16853,
+      "grad_norm": 0.8666275143623352,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 16853
+    },
+    {
+      "epoch": 0.16854,
+      "grad_norm": 0.8161850571632385,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 16854
+    },
+    {
+      "epoch": 0.16855,
+      "grad_norm": 0.6379879713058472,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 16855
+    },
+    {
+      "epoch": 0.16856,
+      "grad_norm": 0.7508741021156311,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 16856
+    },
+    {
+      "epoch": 0.16857,
+      "grad_norm": 0.7264812588691711,
+      "learning_rate": 0.003,
+      "loss": 3.9619,
+      "step": 16857
+    },
+    {
+      "epoch": 0.16858,
+      "grad_norm": 0.6979163885116577,
+      "learning_rate": 0.003,
+      "loss": 3.9725,
+      "step": 16858
+    },
+    {
+      "epoch": 0.16859,
+      "grad_norm": 0.7951305508613586,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 16859
+    },
+    {
+      "epoch": 0.1686,
+      "grad_norm": 0.9455273151397705,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 16860
+    },
+    {
+      "epoch": 0.16861,
+      "grad_norm": 0.9702208042144775,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 16861
+    },
+    {
+      "epoch": 0.16862,
+      "grad_norm": 0.8496451377868652,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 16862
+    },
+    {
+      "epoch": 0.16863,
+      "grad_norm": 0.9306349754333496,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 16863
+    },
+    {
+      "epoch": 0.16864,
+      "grad_norm": 0.916183590888977,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 16864
+    },
+    {
+      "epoch": 0.16865,
+      "grad_norm": 0.8136289715766907,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 16865
+    },
+    {
+      "epoch": 0.16866,
+      "grad_norm": 0.7357578873634338,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 16866
+    },
+    {
+      "epoch": 0.16867,
+      "grad_norm": 0.6378408074378967,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 16867
+    },
+    {
+      "epoch": 0.16868,
+      "grad_norm": 0.614768385887146,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 16868
+    },
+    {
+      "epoch": 0.16869,
+      "grad_norm": 0.5955229997634888,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 16869
+    },
+    {
+      "epoch": 0.1687,
+      "grad_norm": 0.5942284464836121,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 16870
+    },
+    {
+      "epoch": 0.16871,
+      "grad_norm": 0.6663064956665039,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 16871
+    },
+    {
+      "epoch": 0.16872,
+      "grad_norm": 0.7257936000823975,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 16872
+    },
+    {
+      "epoch": 0.16873,
+      "grad_norm": 0.7455652952194214,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 16873
+    },
+    {
+      "epoch": 0.16874,
+      "grad_norm": 0.8603283762931824,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 16874
+    },
+    {
+      "epoch": 0.16875,
+      "grad_norm": 1.2169922590255737,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 16875
+    },
+    {
+      "epoch": 0.16876,
+      "grad_norm": 0.9302839040756226,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 16876
+    },
+    {
+      "epoch": 0.16877,
+      "grad_norm": 0.9746444821357727,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 16877
+    },
+    {
+      "epoch": 0.16878,
+      "grad_norm": 1.0206416845321655,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 16878
+    },
+    {
+      "epoch": 0.16879,
+      "grad_norm": 1.0273840427398682,
+      "learning_rate": 0.003,
+      "loss": 3.9818,
+      "step": 16879
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.8442237973213196,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 16880
+    },
+    {
+      "epoch": 0.16881,
+      "grad_norm": 0.6956433057785034,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 16881
+    },
+    {
+      "epoch": 0.16882,
+      "grad_norm": 0.7793794870376587,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 16882
+    },
+    {
+      "epoch": 0.16883,
+      "grad_norm": 0.8479170799255371,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 16883
+    },
+    {
+      "epoch": 0.16884,
+      "grad_norm": 0.8433241248130798,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 16884
+    },
+    {
+      "epoch": 0.16885,
+      "grad_norm": 0.7989548444747925,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 16885
+    },
+    {
+      "epoch": 0.16886,
+      "grad_norm": 0.8560846447944641,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 16886
+    },
+    {
+      "epoch": 0.16887,
+      "grad_norm": 0.8832461833953857,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 16887
+    },
+    {
+      "epoch": 0.16888,
+      "grad_norm": 0.9571264982223511,
+      "learning_rate": 0.003,
+      "loss": 4.0403,
+      "step": 16888
+    },
+    {
+      "epoch": 0.16889,
+      "grad_norm": 0.9143347144126892,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 16889
+    },
+    {
+      "epoch": 0.1689,
+      "grad_norm": 0.8070544600486755,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 16890
+    },
+    {
+      "epoch": 0.16891,
+      "grad_norm": 0.7932570576667786,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 16891
+    },
+    {
+      "epoch": 0.16892,
+      "grad_norm": 0.7867026925086975,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 16892
+    },
+    {
+      "epoch": 0.16893,
+      "grad_norm": 0.7658373117446899,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 16893
+    },
+    {
+      "epoch": 0.16894,
+      "grad_norm": 0.7852148413658142,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 16894
+    },
+    {
+      "epoch": 0.16895,
+      "grad_norm": 0.7895588278770447,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 16895
+    },
+    {
+      "epoch": 0.16896,
+      "grad_norm": 0.8559044599533081,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 16896
+    },
+    {
+      "epoch": 0.16897,
+      "grad_norm": 1.0634825229644775,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 16897
+    },
+    {
+      "epoch": 0.16898,
+      "grad_norm": 0.9409705996513367,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 16898
+    },
+    {
+      "epoch": 0.16899,
+      "grad_norm": 0.8026694655418396,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 16899
+    },
+    {
+      "epoch": 0.169,
+      "grad_norm": 0.7315845489501953,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 16900
+    },
+    {
+      "epoch": 0.16901,
+      "grad_norm": 0.7260497808456421,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 16901
+    },
+    {
+      "epoch": 0.16902,
+      "grad_norm": 0.6549625396728516,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 16902
+    },
+    {
+      "epoch": 0.16903,
+      "grad_norm": 0.607406735420227,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 16903
+    },
+    {
+      "epoch": 0.16904,
+      "grad_norm": 0.6448104381561279,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 16904
+    },
+    {
+      "epoch": 0.16905,
+      "grad_norm": 0.8571560978889465,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 16905
+    },
+    {
+      "epoch": 0.16906,
+      "grad_norm": 0.9251641035079956,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 16906
+    },
+    {
+      "epoch": 0.16907,
+      "grad_norm": 0.9457172751426697,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 16907
+    },
+    {
+      "epoch": 0.16908,
+      "grad_norm": 0.8236315250396729,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 16908
+    },
+    {
+      "epoch": 0.16909,
+      "grad_norm": 0.8186244368553162,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 16909
+    },
+    {
+      "epoch": 0.1691,
+      "grad_norm": 0.8274247646331787,
+      "learning_rate": 0.003,
+      "loss": 3.9652,
+      "step": 16910
+    },
+    {
+      "epoch": 0.16911,
+      "grad_norm": 0.9622073173522949,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 16911
+    },
+    {
+      "epoch": 0.16912,
+      "grad_norm": 1.0782620906829834,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 16912
+    },
+    {
+      "epoch": 0.16913,
+      "grad_norm": 1.0125151872634888,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 16913
+    },
+    {
+      "epoch": 0.16914,
+      "grad_norm": 0.798346757888794,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 16914
+    },
+    {
+      "epoch": 0.16915,
+      "grad_norm": 0.672568142414093,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 16915
+    },
+    {
+      "epoch": 0.16916,
+      "grad_norm": 0.6295229196548462,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 16916
+    },
+    {
+      "epoch": 0.16917,
+      "grad_norm": 0.6785711050033569,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 16917
+    },
+    {
+      "epoch": 0.16918,
+      "grad_norm": 0.7268692851066589,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 16918
+    },
+    {
+      "epoch": 0.16919,
+      "grad_norm": 0.7384181022644043,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 16919
+    },
+    {
+      "epoch": 0.1692,
+      "grad_norm": 0.9648261070251465,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 16920
+    },
+    {
+      "epoch": 0.16921,
+      "grad_norm": 1.593048334121704,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 16921
+    },
+    {
+      "epoch": 0.16922,
+      "grad_norm": 0.5928442478179932,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 16922
+    },
+    {
+      "epoch": 0.16923,
+      "grad_norm": 0.7853564620018005,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 16923
+    },
+    {
+      "epoch": 0.16924,
+      "grad_norm": 1.0473350286483765,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 16924
+    },
+    {
+      "epoch": 0.16925,
+      "grad_norm": 1.0236759185791016,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 16925
+    },
+    {
+      "epoch": 0.16926,
+      "grad_norm": 0.8573715090751648,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 16926
+    },
+    {
+      "epoch": 0.16927,
+      "grad_norm": 0.7441908717155457,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 16927
+    },
+    {
+      "epoch": 0.16928,
+      "grad_norm": 0.8400618433952332,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 16928
+    },
+    {
+      "epoch": 0.16929,
+      "grad_norm": 1.0212396383285522,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16929
+    },
+    {
+      "epoch": 0.1693,
+      "grad_norm": 1.0390324592590332,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 16930
+    },
+    {
+      "epoch": 0.16931,
+      "grad_norm": 0.9738713502883911,
+      "learning_rate": 0.003,
+      "loss": 4.0363,
+      "step": 16931
+    },
+    {
+      "epoch": 0.16932,
+      "grad_norm": 1.1137603521347046,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 16932
+    },
+    {
+      "epoch": 0.16933,
+      "grad_norm": 1.0305066108703613,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 16933
+    },
+    {
+      "epoch": 0.16934,
+      "grad_norm": 0.9579649567604065,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 16934
+    },
+    {
+      "epoch": 0.16935,
+      "grad_norm": 0.7607799768447876,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 16935
+    },
+    {
+      "epoch": 0.16936,
+      "grad_norm": 0.7850809693336487,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 16936
+    },
+    {
+      "epoch": 0.16937,
+      "grad_norm": 0.8200135231018066,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 16937
+    },
+    {
+      "epoch": 0.16938,
+      "grad_norm": 0.7746044397354126,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 16938
+    },
+    {
+      "epoch": 0.16939,
+      "grad_norm": 0.8466827273368835,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 16939
+    },
+    {
+      "epoch": 0.1694,
+      "grad_norm": 0.7908685207366943,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 16940
+    },
+    {
+      "epoch": 0.16941,
+      "grad_norm": 0.6856575608253479,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 16941
+    },
+    {
+      "epoch": 0.16942,
+      "grad_norm": 0.6925505995750427,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 16942
+    },
+    {
+      "epoch": 0.16943,
+      "grad_norm": 0.8004992008209229,
+      "learning_rate": 0.003,
+      "loss": 3.9721,
+      "step": 16943
+    },
+    {
+      "epoch": 0.16944,
+      "grad_norm": 0.822459876537323,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 16944
+    },
+    {
+      "epoch": 0.16945,
+      "grad_norm": 0.8093191981315613,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 16945
+    },
+    {
+      "epoch": 0.16946,
+      "grad_norm": 0.8074596524238586,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 16946
+    },
+    {
+      "epoch": 0.16947,
+      "grad_norm": 0.9427773356437683,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 16947
+    },
+    {
+      "epoch": 0.16948,
+      "grad_norm": 1.1139631271362305,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 16948
+    },
+    {
+      "epoch": 0.16949,
+      "grad_norm": 0.9527763724327087,
+      "learning_rate": 0.003,
+      "loss": 3.9667,
+      "step": 16949
+    },
+    {
+      "epoch": 0.1695,
+      "grad_norm": 0.7660547494888306,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 16950
+    },
+    {
+      "epoch": 0.16951,
+      "grad_norm": 0.6911223530769348,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 16951
+    },
+    {
+      "epoch": 0.16952,
+      "grad_norm": 0.6668634414672852,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 16952
+    },
+    {
+      "epoch": 0.16953,
+      "grad_norm": 0.687681257724762,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 16953
+    },
+    {
+      "epoch": 0.16954,
+      "grad_norm": 0.6563870310783386,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 16954
+    },
+    {
+      "epoch": 0.16955,
+      "grad_norm": 0.6711004972457886,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 16955
+    },
+    {
+      "epoch": 0.16956,
+      "grad_norm": 0.7026411294937134,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 16956
+    },
+    {
+      "epoch": 0.16957,
+      "grad_norm": 0.704535186290741,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 16957
+    },
+    {
+      "epoch": 0.16958,
+      "grad_norm": 0.6693637371063232,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 16958
+    },
+    {
+      "epoch": 0.16959,
+      "grad_norm": 0.6984379887580872,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 16959
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.750443696975708,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 16960
+    },
+    {
+      "epoch": 0.16961,
+      "grad_norm": 0.8894288539886475,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 16961
+    },
+    {
+      "epoch": 0.16962,
+      "grad_norm": 1.128329873085022,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 16962
+    },
+    {
+      "epoch": 0.16963,
+      "grad_norm": 0.7519640922546387,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 16963
+    },
+    {
+      "epoch": 0.16964,
+      "grad_norm": 0.552846372127533,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 16964
+    },
+    {
+      "epoch": 0.16965,
+      "grad_norm": 0.5697802901268005,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 16965
+    },
+    {
+      "epoch": 0.16966,
+      "grad_norm": 0.6609813570976257,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 16966
+    },
+    {
+      "epoch": 0.16967,
+      "grad_norm": 0.7198853492736816,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 16967
+    },
+    {
+      "epoch": 0.16968,
+      "grad_norm": 0.7277693748474121,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 16968
+    },
+    {
+      "epoch": 0.16969,
+      "grad_norm": 0.7942861914634705,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 16969
+    },
+    {
+      "epoch": 0.1697,
+      "grad_norm": 0.7987948656082153,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 16970
+    },
+    {
+      "epoch": 0.16971,
+      "grad_norm": 0.8817940354347229,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 16971
+    },
+    {
+      "epoch": 0.16972,
+      "grad_norm": 1.0301092863082886,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 16972
+    },
+    {
+      "epoch": 0.16973,
+      "grad_norm": 0.9825281500816345,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 16973
+    },
+    {
+      "epoch": 0.16974,
+      "grad_norm": 0.9258411526679993,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 16974
+    },
+    {
+      "epoch": 0.16975,
+      "grad_norm": 1.0217163562774658,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 16975
+    },
+    {
+      "epoch": 0.16976,
+      "grad_norm": 1.1775392293930054,
+      "learning_rate": 0.003,
+      "loss": 3.977,
+      "step": 16976
+    },
+    {
+      "epoch": 0.16977,
+      "grad_norm": 0.9654213786125183,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 16977
+    },
+    {
+      "epoch": 0.16978,
+      "grad_norm": 1.1151385307312012,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 16978
+    },
+    {
+      "epoch": 0.16979,
+      "grad_norm": 1.012771487236023,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 16979
+    },
+    {
+      "epoch": 0.1698,
+      "grad_norm": 0.8385891914367676,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 16980
+    },
+    {
+      "epoch": 0.16981,
+      "grad_norm": 0.7704838514328003,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 16981
+    },
+    {
+      "epoch": 0.16982,
+      "grad_norm": 0.9659813046455383,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 16982
+    },
+    {
+      "epoch": 0.16983,
+      "grad_norm": 1.3794572353363037,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 16983
+    },
+    {
+      "epoch": 0.16984,
+      "grad_norm": 0.6425350904464722,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 16984
+    },
+    {
+      "epoch": 0.16985,
+      "grad_norm": 0.7135833501815796,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 16985
+    },
+    {
+      "epoch": 0.16986,
+      "grad_norm": 0.7565526962280273,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 16986
+    },
+    {
+      "epoch": 0.16987,
+      "grad_norm": 0.7284208536148071,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 16987
+    },
+    {
+      "epoch": 0.16988,
+      "grad_norm": 0.7384746074676514,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 16988
+    },
+    {
+      "epoch": 0.16989,
+      "grad_norm": 0.7987564206123352,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 16989
+    },
+    {
+      "epoch": 0.1699,
+      "grad_norm": 0.6784647107124329,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 16990
+    },
+    {
+      "epoch": 0.16991,
+      "grad_norm": 0.6576104760169983,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 16991
+    },
+    {
+      "epoch": 0.16992,
+      "grad_norm": 0.818426251411438,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 16992
+    },
+    {
+      "epoch": 0.16993,
+      "grad_norm": 1.1293405294418335,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 16993
+    },
+    {
+      "epoch": 0.16994,
+      "grad_norm": 1.1304106712341309,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 16994
+    },
+    {
+      "epoch": 0.16995,
+      "grad_norm": 0.7580990791320801,
+      "learning_rate": 0.003,
+      "loss": 3.973,
+      "step": 16995
+    },
+    {
+      "epoch": 0.16996,
+      "grad_norm": 0.6400741934776306,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 16996
+    },
+    {
+      "epoch": 0.16997,
+      "grad_norm": 0.6320247650146484,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 16997
+    },
+    {
+      "epoch": 0.16998,
+      "grad_norm": 0.6405655145645142,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 16998
+    },
+    {
+      "epoch": 0.16999,
+      "grad_norm": 0.7054705023765564,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 16999
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.7184433937072754,
+      "learning_rate": 0.003,
+      "loss": 3.9673,
+      "step": 17000
+    },
+    {
+      "epoch": 0.17001,
+      "grad_norm": 0.7353405952453613,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 17001
+    },
+    {
+      "epoch": 0.17002,
+      "grad_norm": 0.7217193841934204,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17002
+    },
+    {
+      "epoch": 0.17003,
+      "grad_norm": 0.9274918437004089,
+      "learning_rate": 0.003,
+      "loss": 3.9758,
+      "step": 17003
+    },
+    {
+      "epoch": 0.17004,
+      "grad_norm": 1.2057504653930664,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 17004
+    },
+    {
+      "epoch": 0.17005,
+      "grad_norm": 0.794136643409729,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 17005
+    },
+    {
+      "epoch": 0.17006,
+      "grad_norm": 0.6768636107444763,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 17006
+    },
+    {
+      "epoch": 0.17007,
+      "grad_norm": 0.725312352180481,
+      "learning_rate": 0.003,
+      "loss": 3.9843,
+      "step": 17007
+    },
+    {
+      "epoch": 0.17008,
+      "grad_norm": 0.7155160903930664,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 17008
+    },
+    {
+      "epoch": 0.17009,
+      "grad_norm": 0.6952875852584839,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 17009
+    },
+    {
+      "epoch": 0.1701,
+      "grad_norm": 0.7479856014251709,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 17010
+    },
+    {
+      "epoch": 0.17011,
+      "grad_norm": 0.8017983436584473,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 17011
+    },
+    {
+      "epoch": 0.17012,
+      "grad_norm": 1.0040581226348877,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 17012
+    },
+    {
+      "epoch": 0.17013,
+      "grad_norm": 1.1289502382278442,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17013
+    },
+    {
+      "epoch": 0.17014,
+      "grad_norm": 0.9964608550071716,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 17014
+    },
+    {
+      "epoch": 0.17015,
+      "grad_norm": 1.200086236000061,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 17015
+    },
+    {
+      "epoch": 0.17016,
+      "grad_norm": 0.8935628533363342,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17016
+    },
+    {
+      "epoch": 0.17017,
+      "grad_norm": 0.835453450679779,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 17017
+    },
+    {
+      "epoch": 0.17018,
+      "grad_norm": 0.9155903458595276,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 17018
+    },
+    {
+      "epoch": 0.17019,
+      "grad_norm": 0.8293262720108032,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 17019
+    },
+    {
+      "epoch": 0.1702,
+      "grad_norm": 0.7839001417160034,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 17020
+    },
+    {
+      "epoch": 0.17021,
+      "grad_norm": 0.833850622177124,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 17021
+    },
+    {
+      "epoch": 0.17022,
+      "grad_norm": 0.7615661025047302,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17022
+    },
+    {
+      "epoch": 0.17023,
+      "grad_norm": 0.7861036062240601,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 17023
+    },
+    {
+      "epoch": 0.17024,
+      "grad_norm": 0.9979937672615051,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 17024
+    },
+    {
+      "epoch": 0.17025,
+      "grad_norm": 1.267797589302063,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 17025
+    },
+    {
+      "epoch": 0.17026,
+      "grad_norm": 1.0182993412017822,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 17026
+    },
+    {
+      "epoch": 0.17027,
+      "grad_norm": 0.9555907845497131,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 17027
+    },
+    {
+      "epoch": 0.17028,
+      "grad_norm": 1.3939357995986938,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 17028
+    },
+    {
+      "epoch": 0.17029,
+      "grad_norm": 1.0221747159957886,
+      "learning_rate": 0.003,
+      "loss": 4.0422,
+      "step": 17029
+    },
+    {
+      "epoch": 0.1703,
+      "grad_norm": 0.7883939146995544,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 17030
+    },
+    {
+      "epoch": 0.17031,
+      "grad_norm": 0.8329399228096008,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 17031
+    },
+    {
+      "epoch": 0.17032,
+      "grad_norm": 0.9606727957725525,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 17032
+    },
+    {
+      "epoch": 0.17033,
+      "grad_norm": 0.8926258683204651,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 17033
+    },
+    {
+      "epoch": 0.17034,
+      "grad_norm": 0.837275505065918,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 17034
+    },
+    {
+      "epoch": 0.17035,
+      "grad_norm": 0.8217374682426453,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 17035
+    },
+    {
+      "epoch": 0.17036,
+      "grad_norm": 0.7080724239349365,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 17036
+    },
+    {
+      "epoch": 0.17037,
+      "grad_norm": 0.759881317615509,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 17037
+    },
+    {
+      "epoch": 0.17038,
+      "grad_norm": 0.6957381963729858,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 17038
+    },
+    {
+      "epoch": 0.17039,
+      "grad_norm": 0.6571491956710815,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 17039
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.5505919456481934,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 17040
+    },
+    {
+      "epoch": 0.17041,
+      "grad_norm": 0.5832685232162476,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 17041
+    },
+    {
+      "epoch": 0.17042,
+      "grad_norm": 0.6317238807678223,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 17042
+    },
+    {
+      "epoch": 0.17043,
+      "grad_norm": 0.8054482936859131,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 17043
+    },
+    {
+      "epoch": 0.17044,
+      "grad_norm": 1.0918316841125488,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 17044
+    },
+    {
+      "epoch": 0.17045,
+      "grad_norm": 1.0211206674575806,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 17045
+    },
+    {
+      "epoch": 0.17046,
+      "grad_norm": 0.8288164734840393,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 17046
+    },
+    {
+      "epoch": 0.17047,
+      "grad_norm": 0.6238090991973877,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 17047
+    },
+    {
+      "epoch": 0.17048,
+      "grad_norm": 0.6770544052124023,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 17048
+    },
+    {
+      "epoch": 0.17049,
+      "grad_norm": 0.8049353957176208,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 17049
+    },
+    {
+      "epoch": 0.1705,
+      "grad_norm": 0.8502638339996338,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 17050
+    },
+    {
+      "epoch": 0.17051,
+      "grad_norm": 0.8071856498718262,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 17051
+    },
+    {
+      "epoch": 0.17052,
+      "grad_norm": 0.7418029308319092,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 17052
+    },
+    {
+      "epoch": 0.17053,
+      "grad_norm": 0.883597195148468,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 17053
+    },
+    {
+      "epoch": 0.17054,
+      "grad_norm": 1.0723153352737427,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 17054
+    },
+    {
+      "epoch": 0.17055,
+      "grad_norm": 1.1307390928268433,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 17055
+    },
+    {
+      "epoch": 0.17056,
+      "grad_norm": 0.7959531545639038,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 17056
+    },
+    {
+      "epoch": 0.17057,
+      "grad_norm": 0.7517292499542236,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 17057
+    },
+    {
+      "epoch": 0.17058,
+      "grad_norm": 0.8400816917419434,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 17058
+    },
+    {
+      "epoch": 0.17059,
+      "grad_norm": 0.8377096652984619,
+      "learning_rate": 0.003,
+      "loss": 3.9753,
+      "step": 17059
+    },
+    {
+      "epoch": 0.1706,
+      "grad_norm": 0.8193461298942566,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17060
+    },
+    {
+      "epoch": 0.17061,
+      "grad_norm": 0.845766007900238,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 17061
+    },
+    {
+      "epoch": 0.17062,
+      "grad_norm": 0.8220046162605286,
+      "learning_rate": 0.003,
+      "loss": 3.9652,
+      "step": 17062
+    },
+    {
+      "epoch": 0.17063,
+      "grad_norm": 0.6880900859832764,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 17063
+    },
+    {
+      "epoch": 0.17064,
+      "grad_norm": 0.6444327235221863,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 17064
+    },
+    {
+      "epoch": 0.17065,
+      "grad_norm": 0.7091438174247742,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 17065
+    },
+    {
+      "epoch": 0.17066,
+      "grad_norm": 0.744178831577301,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 17066
+    },
+    {
+      "epoch": 0.17067,
+      "grad_norm": 0.7807066440582275,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 17067
+    },
+    {
+      "epoch": 0.17068,
+      "grad_norm": 0.7357064485549927,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 17068
+    },
+    {
+      "epoch": 0.17069,
+      "grad_norm": 0.8833433985710144,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 17069
+    },
+    {
+      "epoch": 0.1707,
+      "grad_norm": 1.200100064277649,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17070
+    },
+    {
+      "epoch": 0.17071,
+      "grad_norm": 0.8609374165534973,
+      "learning_rate": 0.003,
+      "loss": 3.9885,
+      "step": 17071
+    },
+    {
+      "epoch": 0.17072,
+      "grad_norm": 0.7197414636611938,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 17072
+    },
+    {
+      "epoch": 0.17073,
+      "grad_norm": 0.6767376065254211,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 17073
+    },
+    {
+      "epoch": 0.17074,
+      "grad_norm": 0.7874352335929871,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 17074
+    },
+    {
+      "epoch": 0.17075,
+      "grad_norm": 0.8052144646644592,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 17075
+    },
+    {
+      "epoch": 0.17076,
+      "grad_norm": 0.8144032955169678,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 17076
+    },
+    {
+      "epoch": 0.17077,
+      "grad_norm": 0.9464995861053467,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 17077
+    },
+    {
+      "epoch": 0.17078,
+      "grad_norm": 1.0766133069992065,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 17078
+    },
+    {
+      "epoch": 0.17079,
+      "grad_norm": 1.0682998895645142,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 17079
+    },
+    {
+      "epoch": 0.1708,
+      "grad_norm": 1.1437551975250244,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 17080
+    },
+    {
+      "epoch": 0.17081,
+      "grad_norm": 0.9291083812713623,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17081
+    },
+    {
+      "epoch": 0.17082,
+      "grad_norm": 0.7274229526519775,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 17082
+    },
+    {
+      "epoch": 0.17083,
+      "grad_norm": 0.7047369480133057,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 17083
+    },
+    {
+      "epoch": 0.17084,
+      "grad_norm": 0.7894013524055481,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 17084
+    },
+    {
+      "epoch": 0.17085,
+      "grad_norm": 0.9227026700973511,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 17085
+    },
+    {
+      "epoch": 0.17086,
+      "grad_norm": 0.8737115263938904,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 17086
+    },
+    {
+      "epoch": 0.17087,
+      "grad_norm": 0.8501720428466797,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 17087
+    },
+    {
+      "epoch": 0.17088,
+      "grad_norm": 0.8258364796638489,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 17088
+    },
+    {
+      "epoch": 0.17089,
+      "grad_norm": 0.8701712489128113,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 17089
+    },
+    {
+      "epoch": 0.1709,
+      "grad_norm": 0.8529579639434814,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 17090
+    },
+    {
+      "epoch": 0.17091,
+      "grad_norm": 0.876214325428009,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 17091
+    },
+    {
+      "epoch": 0.17092,
+      "grad_norm": 0.9395101070404053,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 17092
+    },
+    {
+      "epoch": 0.17093,
+      "grad_norm": 1.09381103515625,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 17093
+    },
+    {
+      "epoch": 0.17094,
+      "grad_norm": 0.8975692391395569,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 17094
+    },
+    {
+      "epoch": 0.17095,
+      "grad_norm": 1.1432256698608398,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 17095
+    },
+    {
+      "epoch": 0.17096,
+      "grad_norm": 0.9737615585327148,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 17096
+    },
+    {
+      "epoch": 0.17097,
+      "grad_norm": 0.8718317747116089,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 17097
+    },
+    {
+      "epoch": 0.17098,
+      "grad_norm": 0.8652920126914978,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 17098
+    },
+    {
+      "epoch": 0.17099,
+      "grad_norm": 0.9636838436126709,
+      "learning_rate": 0.003,
+      "loss": 3.9813,
+      "step": 17099
+    },
+    {
+      "epoch": 0.171,
+      "grad_norm": 1.0163567066192627,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 17100
+    },
+    {
+      "epoch": 0.17101,
+      "grad_norm": 0.9739014506340027,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 17101
+    },
+    {
+      "epoch": 0.17102,
+      "grad_norm": 0.8297356963157654,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 17102
+    },
+    {
+      "epoch": 0.17103,
+      "grad_norm": 0.7012573480606079,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 17103
+    },
+    {
+      "epoch": 0.17104,
+      "grad_norm": 0.6135175228118896,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 17104
+    },
+    {
+      "epoch": 0.17105,
+      "grad_norm": 0.7124812602996826,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 17105
+    },
+    {
+      "epoch": 0.17106,
+      "grad_norm": 0.8142275214195251,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 17106
+    },
+    {
+      "epoch": 0.17107,
+      "grad_norm": 0.9401968717575073,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 17107
+    },
+    {
+      "epoch": 0.17108,
+      "grad_norm": 0.9679330587387085,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 17108
+    },
+    {
+      "epoch": 0.17109,
+      "grad_norm": 0.8321598172187805,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 17109
+    },
+    {
+      "epoch": 0.1711,
+      "grad_norm": 0.675540566444397,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 17110
+    },
+    {
+      "epoch": 0.17111,
+      "grad_norm": 0.7044729590415955,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 17111
+    },
+    {
+      "epoch": 0.17112,
+      "grad_norm": 0.8573277592658997,
+      "learning_rate": 0.003,
+      "loss": 3.9904,
+      "step": 17112
+    },
+    {
+      "epoch": 0.17113,
+      "grad_norm": 0.9914045333862305,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 17113
+    },
+    {
+      "epoch": 0.17114,
+      "grad_norm": 1.0339431762695312,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 17114
+    },
+    {
+      "epoch": 0.17115,
+      "grad_norm": 0.9043312668800354,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 17115
+    },
+    {
+      "epoch": 0.17116,
+      "grad_norm": 0.9394837021827698,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 17116
+    },
+    {
+      "epoch": 0.17117,
+      "grad_norm": 1.0119091272354126,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 17117
+    },
+    {
+      "epoch": 0.17118,
+      "grad_norm": 0.8556259274482727,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 17118
+    },
+    {
+      "epoch": 0.17119,
+      "grad_norm": 0.7036623358726501,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 17119
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.7242422699928284,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 17120
+    },
+    {
+      "epoch": 0.17121,
+      "grad_norm": 0.8174219131469727,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 17121
+    },
+    {
+      "epoch": 0.17122,
+      "grad_norm": 0.9132465124130249,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 17122
+    },
+    {
+      "epoch": 0.17123,
+      "grad_norm": 0.9040917158126831,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 17123
+    },
+    {
+      "epoch": 0.17124,
+      "grad_norm": 0.8150893449783325,
+      "learning_rate": 0.003,
+      "loss": 3.9696,
+      "step": 17124
+    },
+    {
+      "epoch": 0.17125,
+      "grad_norm": 0.7183358073234558,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 17125
+    },
+    {
+      "epoch": 0.17126,
+      "grad_norm": 0.7179135680198669,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 17126
+    },
+    {
+      "epoch": 0.17127,
+      "grad_norm": 0.7805479168891907,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 17127
+    },
+    {
+      "epoch": 0.17128,
+      "grad_norm": 0.8239583969116211,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 17128
+    },
+    {
+      "epoch": 0.17129,
+      "grad_norm": 0.8165447115898132,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 17129
+    },
+    {
+      "epoch": 0.1713,
+      "grad_norm": 0.7133782505989075,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 17130
+    },
+    {
+      "epoch": 0.17131,
+      "grad_norm": 0.6737720370292664,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 17131
+    },
+    {
+      "epoch": 0.17132,
+      "grad_norm": 0.65351402759552,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 17132
+    },
+    {
+      "epoch": 0.17133,
+      "grad_norm": 0.7062671780586243,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 17133
+    },
+    {
+      "epoch": 0.17134,
+      "grad_norm": 0.7021439075469971,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 17134
+    },
+    {
+      "epoch": 0.17135,
+      "grad_norm": 0.6557730436325073,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 17135
+    },
+    {
+      "epoch": 0.17136,
+      "grad_norm": 0.6355788707733154,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 17136
+    },
+    {
+      "epoch": 0.17137,
+      "grad_norm": 0.6955825686454773,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17137
+    },
+    {
+      "epoch": 0.17138,
+      "grad_norm": 0.7713178992271423,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 17138
+    },
+    {
+      "epoch": 0.17139,
+      "grad_norm": 0.8570329546928406,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 17139
+    },
+    {
+      "epoch": 0.1714,
+      "grad_norm": 1.0380743741989136,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 17140
+    },
+    {
+      "epoch": 0.17141,
+      "grad_norm": 1.0252989530563354,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 17141
+    },
+    {
+      "epoch": 0.17142,
+      "grad_norm": 0.8437224626541138,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 17142
+    },
+    {
+      "epoch": 0.17143,
+      "grad_norm": 0.7631940245628357,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 17143
+    },
+    {
+      "epoch": 0.17144,
+      "grad_norm": 0.8039262890815735,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 17144
+    },
+    {
+      "epoch": 0.17145,
+      "grad_norm": 0.9135908484458923,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 17145
+    },
+    {
+      "epoch": 0.17146,
+      "grad_norm": 0.8931128978729248,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 17146
+    },
+    {
+      "epoch": 0.17147,
+      "grad_norm": 0.9946384429931641,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 17147
+    },
+    {
+      "epoch": 0.17148,
+      "grad_norm": 1.0222218036651611,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 17148
+    },
+    {
+      "epoch": 0.17149,
+      "grad_norm": 0.7616196274757385,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 17149
+    },
+    {
+      "epoch": 0.1715,
+      "grad_norm": 0.7758917212486267,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 17150
+    },
+    {
+      "epoch": 0.17151,
+      "grad_norm": 0.7636691331863403,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 17151
+    },
+    {
+      "epoch": 0.17152,
+      "grad_norm": 0.8937175273895264,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 17152
+    },
+    {
+      "epoch": 0.17153,
+      "grad_norm": 0.8333864212036133,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 17153
+    },
+    {
+      "epoch": 0.17154,
+      "grad_norm": 0.6168978214263916,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 17154
+    },
+    {
+      "epoch": 0.17155,
+      "grad_norm": 0.6731457710266113,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 17155
+    },
+    {
+      "epoch": 0.17156,
+      "grad_norm": 0.8194024562835693,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 17156
+    },
+    {
+      "epoch": 0.17157,
+      "grad_norm": 0.8781210780143738,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 17157
+    },
+    {
+      "epoch": 0.17158,
+      "grad_norm": 0.9894959926605225,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 17158
+    },
+    {
+      "epoch": 0.17159,
+      "grad_norm": 1.1849749088287354,
+      "learning_rate": 0.003,
+      "loss": 4.033,
+      "step": 17159
+    },
+    {
+      "epoch": 0.1716,
+      "grad_norm": 1.0248125791549683,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 17160
+    },
+    {
+      "epoch": 0.17161,
+      "grad_norm": 0.9380013942718506,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 17161
+    },
+    {
+      "epoch": 0.17162,
+      "grad_norm": 0.9853338003158569,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 17162
+    },
+    {
+      "epoch": 0.17163,
+      "grad_norm": 1.119477391242981,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 17163
+    },
+    {
+      "epoch": 0.17164,
+      "grad_norm": 1.0069806575775146,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 17164
+    },
+    {
+      "epoch": 0.17165,
+      "grad_norm": 1.0723400115966797,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 17165
+    },
+    {
+      "epoch": 0.17166,
+      "grad_norm": 1.010719656944275,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 17166
+    },
+    {
+      "epoch": 0.17167,
+      "grad_norm": 0.8696228861808777,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 17167
+    },
+    {
+      "epoch": 0.17168,
+      "grad_norm": 0.7739221453666687,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 17168
+    },
+    {
+      "epoch": 0.17169,
+      "grad_norm": 0.7838475704193115,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 17169
+    },
+    {
+      "epoch": 0.1717,
+      "grad_norm": 0.7348381280899048,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 17170
+    },
+    {
+      "epoch": 0.17171,
+      "grad_norm": 0.801749050617218,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 17171
+    },
+    {
+      "epoch": 0.17172,
+      "grad_norm": 0.8654886484146118,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 17172
+    },
+    {
+      "epoch": 0.17173,
+      "grad_norm": 0.8426519632339478,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 17173
+    },
+    {
+      "epoch": 0.17174,
+      "grad_norm": 0.754588782787323,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 17174
+    },
+    {
+      "epoch": 0.17175,
+      "grad_norm": 0.7090778350830078,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 17175
+    },
+    {
+      "epoch": 0.17176,
+      "grad_norm": 0.6298714280128479,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 17176
+    },
+    {
+      "epoch": 0.17177,
+      "grad_norm": 0.5926030278205872,
+      "learning_rate": 0.003,
+      "loss": 3.9751,
+      "step": 17177
+    },
+    {
+      "epoch": 0.17178,
+      "grad_norm": 0.6813340783119202,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 17178
+    },
+    {
+      "epoch": 0.17179,
+      "grad_norm": 0.9781045913696289,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 17179
+    },
+    {
+      "epoch": 0.1718,
+      "grad_norm": 1.287935733795166,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 17180
+    },
+    {
+      "epoch": 0.17181,
+      "grad_norm": 0.6834552884101868,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 17181
+    },
+    {
+      "epoch": 0.17182,
+      "grad_norm": 0.9143573641777039,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 17182
+    },
+    {
+      "epoch": 0.17183,
+      "grad_norm": 1.3454179763793945,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 17183
+    },
+    {
+      "epoch": 0.17184,
+      "grad_norm": 0.5757114291191101,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 17184
+    },
+    {
+      "epoch": 0.17185,
+      "grad_norm": 0.6277640461921692,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 17185
+    },
+    {
+      "epoch": 0.17186,
+      "grad_norm": 0.7600275874137878,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 17186
+    },
+    {
+      "epoch": 0.17187,
+      "grad_norm": 0.7353819608688354,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 17187
+    },
+    {
+      "epoch": 0.17188,
+      "grad_norm": 0.6280838847160339,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 17188
+    },
+    {
+      "epoch": 0.17189,
+      "grad_norm": 0.6756056547164917,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 17189
+    },
+    {
+      "epoch": 0.1719,
+      "grad_norm": 0.7582512497901917,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 17190
+    },
+    {
+      "epoch": 0.17191,
+      "grad_norm": 0.9030353426933289,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 17191
+    },
+    {
+      "epoch": 0.17192,
+      "grad_norm": 0.9921842217445374,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 17192
+    },
+    {
+      "epoch": 0.17193,
+      "grad_norm": 0.9811756014823914,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 17193
+    },
+    {
+      "epoch": 0.17194,
+      "grad_norm": 0.9288318753242493,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 17194
+    },
+    {
+      "epoch": 0.17195,
+      "grad_norm": 1.0272241830825806,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 17195
+    },
+    {
+      "epoch": 0.17196,
+      "grad_norm": 0.9172542095184326,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 17196
+    },
+    {
+      "epoch": 0.17197,
+      "grad_norm": 0.8295038342475891,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 17197
+    },
+    {
+      "epoch": 0.17198,
+      "grad_norm": 0.8457723259925842,
+      "learning_rate": 0.003,
+      "loss": 4.0529,
+      "step": 17198
+    },
+    {
+      "epoch": 0.17199,
+      "grad_norm": 0.9383748173713684,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 17199
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.847207248210907,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 17200
+    },
+    {
+      "epoch": 0.17201,
+      "grad_norm": 0.6900160312652588,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17201
+    },
+    {
+      "epoch": 0.17202,
+      "grad_norm": 0.8649606704711914,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 17202
+    },
+    {
+      "epoch": 0.17203,
+      "grad_norm": 1.037705421447754,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 17203
+    },
+    {
+      "epoch": 0.17204,
+      "grad_norm": 1.084540605545044,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 17204
+    },
+    {
+      "epoch": 0.17205,
+      "grad_norm": 0.826554000377655,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 17205
+    },
+    {
+      "epoch": 0.17206,
+      "grad_norm": 0.9185588359832764,
+      "learning_rate": 0.003,
+      "loss": 4.0279,
+      "step": 17206
+    },
+    {
+      "epoch": 0.17207,
+      "grad_norm": 0.9008485078811646,
+      "learning_rate": 0.003,
+      "loss": 4.0276,
+      "step": 17207
+    },
+    {
+      "epoch": 0.17208,
+      "grad_norm": 0.8682863116264343,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 17208
+    },
+    {
+      "epoch": 0.17209,
+      "grad_norm": 0.895900309085846,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 17209
+    },
+    {
+      "epoch": 0.1721,
+      "grad_norm": 0.7852314114570618,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 17210
+    },
+    {
+      "epoch": 0.17211,
+      "grad_norm": 0.9408220648765564,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 17211
+    },
+    {
+      "epoch": 0.17212,
+      "grad_norm": 1.2642772197723389,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 17212
+    },
+    {
+      "epoch": 0.17213,
+      "grad_norm": 0.8733190298080444,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 17213
+    },
+    {
+      "epoch": 0.17214,
+      "grad_norm": 0.7148447632789612,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 17214
+    },
+    {
+      "epoch": 0.17215,
+      "grad_norm": 0.6363114714622498,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 17215
+    },
+    {
+      "epoch": 0.17216,
+      "grad_norm": 0.6281545758247375,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 17216
+    },
+    {
+      "epoch": 0.17217,
+      "grad_norm": 0.6153284311294556,
+      "learning_rate": 0.003,
+      "loss": 3.965,
+      "step": 17217
+    },
+    {
+      "epoch": 0.17218,
+      "grad_norm": 0.6405128240585327,
+      "learning_rate": 0.003,
+      "loss": 3.9768,
+      "step": 17218
+    },
+    {
+      "epoch": 0.17219,
+      "grad_norm": 0.7058283090591431,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 17219
+    },
+    {
+      "epoch": 0.1722,
+      "grad_norm": 0.7759990096092224,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 17220
+    },
+    {
+      "epoch": 0.17221,
+      "grad_norm": 0.8714685440063477,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 17221
+    },
+    {
+      "epoch": 0.17222,
+      "grad_norm": 0.8724744319915771,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 17222
+    },
+    {
+      "epoch": 0.17223,
+      "grad_norm": 0.7614827752113342,
+      "learning_rate": 0.003,
+      "loss": 3.9683,
+      "step": 17223
+    },
+    {
+      "epoch": 0.17224,
+      "grad_norm": 0.692354142665863,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 17224
+    },
+    {
+      "epoch": 0.17225,
+      "grad_norm": 0.7642661929130554,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 17225
+    },
+    {
+      "epoch": 0.17226,
+      "grad_norm": 0.732671320438385,
+      "learning_rate": 0.003,
+      "loss": 4.03,
+      "step": 17226
+    },
+    {
+      "epoch": 0.17227,
+      "grad_norm": 0.7117012143135071,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17227
+    },
+    {
+      "epoch": 0.17228,
+      "grad_norm": 0.8455045819282532,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 17228
+    },
+    {
+      "epoch": 0.17229,
+      "grad_norm": 0.9891947507858276,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 17229
+    },
+    {
+      "epoch": 0.1723,
+      "grad_norm": 1.20091712474823,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 17230
+    },
+    {
+      "epoch": 0.17231,
+      "grad_norm": 0.8701549768447876,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 17231
+    },
+    {
+      "epoch": 0.17232,
+      "grad_norm": 0.7704989314079285,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 17232
+    },
+    {
+      "epoch": 0.17233,
+      "grad_norm": 0.7895516157150269,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 17233
+    },
+    {
+      "epoch": 0.17234,
+      "grad_norm": 0.7732393145561218,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17234
+    },
+    {
+      "epoch": 0.17235,
+      "grad_norm": 0.7781100273132324,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 17235
+    },
+    {
+      "epoch": 0.17236,
+      "grad_norm": 0.7803873419761658,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 17236
+    },
+    {
+      "epoch": 0.17237,
+      "grad_norm": 0.8356990814208984,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 17237
+    },
+    {
+      "epoch": 0.17238,
+      "grad_norm": 0.7919490933418274,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 17238
+    },
+    {
+      "epoch": 0.17239,
+      "grad_norm": 0.9274510145187378,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 17239
+    },
+    {
+      "epoch": 0.1724,
+      "grad_norm": 1.0621308088302612,
+      "learning_rate": 0.003,
+      "loss": 3.952,
+      "step": 17240
+    },
+    {
+      "epoch": 0.17241,
+      "grad_norm": 0.7842041254043579,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 17241
+    },
+    {
+      "epoch": 0.17242,
+      "grad_norm": 0.6512811779975891,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 17242
+    },
+    {
+      "epoch": 0.17243,
+      "grad_norm": 0.6548734307289124,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 17243
+    },
+    {
+      "epoch": 0.17244,
+      "grad_norm": 0.7729583382606506,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 17244
+    },
+    {
+      "epoch": 0.17245,
+      "grad_norm": 1.075770378112793,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 17245
+    },
+    {
+      "epoch": 0.17246,
+      "grad_norm": 0.8497618436813354,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 17246
+    },
+    {
+      "epoch": 0.17247,
+      "grad_norm": 0.7210960388183594,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 17247
+    },
+    {
+      "epoch": 0.17248,
+      "grad_norm": 0.7670256495475769,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 17248
+    },
+    {
+      "epoch": 0.17249,
+      "grad_norm": 0.7562484741210938,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 17249
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.6801670789718628,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 17250
+    },
+    {
+      "epoch": 0.17251,
+      "grad_norm": 0.7317301034927368,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 17251
+    },
+    {
+      "epoch": 0.17252,
+      "grad_norm": 0.733239471912384,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 17252
+    },
+    {
+      "epoch": 0.17253,
+      "grad_norm": 0.8458531498908997,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 17253
+    },
+    {
+      "epoch": 0.17254,
+      "grad_norm": 1.0262222290039062,
+      "learning_rate": 0.003,
+      "loss": 3.9756,
+      "step": 17254
+    },
+    {
+      "epoch": 0.17255,
+      "grad_norm": 1.158956527709961,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 17255
+    },
+    {
+      "epoch": 0.17256,
+      "grad_norm": 0.7779101729393005,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 17256
+    },
+    {
+      "epoch": 0.17257,
+      "grad_norm": 0.7064163088798523,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 17257
+    },
+    {
+      "epoch": 0.17258,
+      "grad_norm": 0.7622659802436829,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 17258
+    },
+    {
+      "epoch": 0.17259,
+      "grad_norm": 0.9194077849388123,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 17259
+    },
+    {
+      "epoch": 0.1726,
+      "grad_norm": 0.8892446756362915,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 17260
+    },
+    {
+      "epoch": 0.17261,
+      "grad_norm": 0.8862422108650208,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 17261
+    },
+    {
+      "epoch": 0.17262,
+      "grad_norm": 0.937934935092926,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 17262
+    },
+    {
+      "epoch": 0.17263,
+      "grad_norm": 1.2220501899719238,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 17263
+    },
+    {
+      "epoch": 0.17264,
+      "grad_norm": 0.7869848608970642,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 17264
+    },
+    {
+      "epoch": 0.17265,
+      "grad_norm": 0.7467491030693054,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 17265
+    },
+    {
+      "epoch": 0.17266,
+      "grad_norm": 0.6699659824371338,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 17266
+    },
+    {
+      "epoch": 0.17267,
+      "grad_norm": 0.6560962200164795,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 17267
+    },
+    {
+      "epoch": 0.17268,
+      "grad_norm": 0.6503502726554871,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 17268
+    },
+    {
+      "epoch": 0.17269,
+      "grad_norm": 0.7429957985877991,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 17269
+    },
+    {
+      "epoch": 0.1727,
+      "grad_norm": 0.7676070332527161,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 17270
+    },
+    {
+      "epoch": 0.17271,
+      "grad_norm": 0.9311237335205078,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 17271
+    },
+    {
+      "epoch": 0.17272,
+      "grad_norm": 0.8896855711936951,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 17272
+    },
+    {
+      "epoch": 0.17273,
+      "grad_norm": 0.8906723856925964,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 17273
+    },
+    {
+      "epoch": 0.17274,
+      "grad_norm": 0.9185470342636108,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 17274
+    },
+    {
+      "epoch": 0.17275,
+      "grad_norm": 0.9359113574028015,
+      "learning_rate": 0.003,
+      "loss": 3.9813,
+      "step": 17275
+    },
+    {
+      "epoch": 0.17276,
+      "grad_norm": 1.127929449081421,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 17276
+    },
+    {
+      "epoch": 0.17277,
+      "grad_norm": 0.961293637752533,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 17277
+    },
+    {
+      "epoch": 0.17278,
+      "grad_norm": 0.8514905571937561,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 17278
+    },
+    {
+      "epoch": 0.17279,
+      "grad_norm": 0.8976634740829468,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 17279
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 1.0153040885925293,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 17280
+    },
+    {
+      "epoch": 0.17281,
+      "grad_norm": 1.1854093074798584,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 17281
+    },
+    {
+      "epoch": 0.17282,
+      "grad_norm": 0.6242632865905762,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 17282
+    },
+    {
+      "epoch": 0.17283,
+      "grad_norm": 0.7148314118385315,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 17283
+    },
+    {
+      "epoch": 0.17284,
+      "grad_norm": 0.5994605422019958,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 17284
+    },
+    {
+      "epoch": 0.17285,
+      "grad_norm": 0.5930424928665161,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 17285
+    },
+    {
+      "epoch": 0.17286,
+      "grad_norm": 0.6516755223274231,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 17286
+    },
+    {
+      "epoch": 0.17287,
+      "grad_norm": 0.5659880638122559,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 17287
+    },
+    {
+      "epoch": 0.17288,
+      "grad_norm": 0.5813016295433044,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 17288
+    },
+    {
+      "epoch": 0.17289,
+      "grad_norm": 0.6589696407318115,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 17289
+    },
+    {
+      "epoch": 0.1729,
+      "grad_norm": 0.6921236515045166,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 17290
+    },
+    {
+      "epoch": 0.17291,
+      "grad_norm": 0.6518563628196716,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 17291
+    },
+    {
+      "epoch": 0.17292,
+      "grad_norm": 0.638307511806488,
+      "learning_rate": 0.003,
+      "loss": 3.9826,
+      "step": 17292
+    },
+    {
+      "epoch": 0.17293,
+      "grad_norm": 0.7391517162322998,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 17293
+    },
+    {
+      "epoch": 0.17294,
+      "grad_norm": 0.8546097874641418,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 17294
+    },
+    {
+      "epoch": 0.17295,
+      "grad_norm": 0.9340639710426331,
+      "learning_rate": 0.003,
+      "loss": 3.9824,
+      "step": 17295
+    },
+    {
+      "epoch": 0.17296,
+      "grad_norm": 0.9694673418998718,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 17296
+    },
+    {
+      "epoch": 0.17297,
+      "grad_norm": 1.064598560333252,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 17297
+    },
+    {
+      "epoch": 0.17298,
+      "grad_norm": 0.958608865737915,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 17298
+    },
+    {
+      "epoch": 0.17299,
+      "grad_norm": 0.8235632181167603,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 17299
+    },
+    {
+      "epoch": 0.173,
+      "grad_norm": 0.9273446202278137,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 17300
+    },
+    {
+      "epoch": 0.17301,
+      "grad_norm": 1.0050894021987915,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 17301
+    },
+    {
+      "epoch": 0.17302,
+      "grad_norm": 1.0514724254608154,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 17302
+    },
+    {
+      "epoch": 0.17303,
+      "grad_norm": 1.0046824216842651,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 17303
+    },
+    {
+      "epoch": 0.17304,
+      "grad_norm": 1.0234330892562866,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 17304
+    },
+    {
+      "epoch": 0.17305,
+      "grad_norm": 1.0548959970474243,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 17305
+    },
+    {
+      "epoch": 0.17306,
+      "grad_norm": 1.0699856281280518,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 17306
+    },
+    {
+      "epoch": 0.17307,
+      "grad_norm": 0.8867344260215759,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 17307
+    },
+    {
+      "epoch": 0.17308,
+      "grad_norm": 0.8486331105232239,
+      "learning_rate": 0.003,
+      "loss": 4.0281,
+      "step": 17308
+    },
+    {
+      "epoch": 0.17309,
+      "grad_norm": 0.85123211145401,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 17309
+    },
+    {
+      "epoch": 0.1731,
+      "grad_norm": 1.0190993547439575,
+      "learning_rate": 0.003,
+      "loss": 4.0426,
+      "step": 17310
+    },
+    {
+      "epoch": 0.17311,
+      "grad_norm": 1.1102086305618286,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 17311
+    },
+    {
+      "epoch": 0.17312,
+      "grad_norm": 0.920134425163269,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 17312
+    },
+    {
+      "epoch": 0.17313,
+      "grad_norm": 0.8725000023841858,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 17313
+    },
+    {
+      "epoch": 0.17314,
+      "grad_norm": 0.7170385122299194,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 17314
+    },
+    {
+      "epoch": 0.17315,
+      "grad_norm": 0.7331812977790833,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 17315
+    },
+    {
+      "epoch": 0.17316,
+      "grad_norm": 0.9502736926078796,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 17316
+    },
+    {
+      "epoch": 0.17317,
+      "grad_norm": 0.9615503549575806,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 17317
+    },
+    {
+      "epoch": 0.17318,
+      "grad_norm": 0.8293330669403076,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 17318
+    },
+    {
+      "epoch": 0.17319,
+      "grad_norm": 0.7895271182060242,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 17319
+    },
+    {
+      "epoch": 0.1732,
+      "grad_norm": 0.9117076396942139,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 17320
+    },
+    {
+      "epoch": 0.17321,
+      "grad_norm": 0.9139329791069031,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 17321
+    },
+    {
+      "epoch": 0.17322,
+      "grad_norm": 1.0657029151916504,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 17322
+    },
+    {
+      "epoch": 0.17323,
+      "grad_norm": 1.171359896659851,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 17323
+    },
+    {
+      "epoch": 0.17324,
+      "grad_norm": 0.7427003979682922,
+      "learning_rate": 0.003,
+      "loss": 4.0242,
+      "step": 17324
+    },
+    {
+      "epoch": 0.17325,
+      "grad_norm": 0.5160406231880188,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 17325
+    },
+    {
+      "epoch": 0.17326,
+      "grad_norm": 0.7402018308639526,
+      "learning_rate": 0.003,
+      "loss": 3.9889,
+      "step": 17326
+    },
+    {
+      "epoch": 0.17327,
+      "grad_norm": 0.8790965676307678,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 17327
+    },
+    {
+      "epoch": 0.17328,
+      "grad_norm": 0.928956151008606,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 17328
+    },
+    {
+      "epoch": 0.17329,
+      "grad_norm": 0.8769863843917847,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 17329
+    },
+    {
+      "epoch": 0.1733,
+      "grad_norm": 0.7440482378005981,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 17330
+    },
+    {
+      "epoch": 0.17331,
+      "grad_norm": 0.5764461755752563,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 17331
+    },
+    {
+      "epoch": 0.17332,
+      "grad_norm": 0.5576710104942322,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 17332
+    },
+    {
+      "epoch": 0.17333,
+      "grad_norm": 0.695499837398529,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 17333
+    },
+    {
+      "epoch": 0.17334,
+      "grad_norm": 0.7192192077636719,
+      "learning_rate": 0.003,
+      "loss": 3.9738,
+      "step": 17334
+    },
+    {
+      "epoch": 0.17335,
+      "grad_norm": 0.6879513263702393,
+      "learning_rate": 0.003,
+      "loss": 3.9889,
+      "step": 17335
+    },
+    {
+      "epoch": 0.17336,
+      "grad_norm": 0.6746930480003357,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 17336
+    },
+    {
+      "epoch": 0.17337,
+      "grad_norm": 0.7167201042175293,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 17337
+    },
+    {
+      "epoch": 0.17338,
+      "grad_norm": 0.7446791529655457,
+      "learning_rate": 0.003,
+      "loss": 3.9592,
+      "step": 17338
+    },
+    {
+      "epoch": 0.17339,
+      "grad_norm": 0.7925171256065369,
+      "learning_rate": 0.003,
+      "loss": 4.0217,
+      "step": 17339
+    },
+    {
+      "epoch": 0.1734,
+      "grad_norm": 0.9689291715621948,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 17340
+    },
+    {
+      "epoch": 0.17341,
+      "grad_norm": 1.0888067483901978,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 17341
+    },
+    {
+      "epoch": 0.17342,
+      "grad_norm": 0.6682828068733215,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 17342
+    },
+    {
+      "epoch": 0.17343,
+      "grad_norm": 0.66913902759552,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 17343
+    },
+    {
+      "epoch": 0.17344,
+      "grad_norm": 0.6523056030273438,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 17344
+    },
+    {
+      "epoch": 0.17345,
+      "grad_norm": 0.5623226165771484,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 17345
+    },
+    {
+      "epoch": 0.17346,
+      "grad_norm": 0.583284854888916,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 17346
+    },
+    {
+      "epoch": 0.17347,
+      "grad_norm": 0.6016397476196289,
+      "learning_rate": 0.003,
+      "loss": 3.977,
+      "step": 17347
+    },
+    {
+      "epoch": 0.17348,
+      "grad_norm": 0.589728832244873,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 17348
+    },
+    {
+      "epoch": 0.17349,
+      "grad_norm": 0.5602003931999207,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 17349
+    },
+    {
+      "epoch": 0.1735,
+      "grad_norm": 0.6237813234329224,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 17350
+    },
+    {
+      "epoch": 0.17351,
+      "grad_norm": 0.5999209880828857,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 17351
+    },
+    {
+      "epoch": 0.17352,
+      "grad_norm": 0.652887225151062,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 17352
+    },
+    {
+      "epoch": 0.17353,
+      "grad_norm": 0.7345176935195923,
+      "learning_rate": 0.003,
+      "loss": 3.9573,
+      "step": 17353
+    },
+    {
+      "epoch": 0.17354,
+      "grad_norm": 0.7107290029525757,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 17354
+    },
+    {
+      "epoch": 0.17355,
+      "grad_norm": 0.7638047337532043,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 17355
+    },
+    {
+      "epoch": 0.17356,
+      "grad_norm": 0.683320939540863,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 17356
+    },
+    {
+      "epoch": 0.17357,
+      "grad_norm": 0.8579681515693665,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 17357
+    },
+    {
+      "epoch": 0.17358,
+      "grad_norm": 1.1549464464187622,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 17358
+    },
+    {
+      "epoch": 0.17359,
+      "grad_norm": 1.3736201524734497,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 17359
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.847314178943634,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 17360
+    },
+    {
+      "epoch": 0.17361,
+      "grad_norm": 0.7125725150108337,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 17361
+    },
+    {
+      "epoch": 0.17362,
+      "grad_norm": 0.6976441144943237,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 17362
+    },
+    {
+      "epoch": 0.17363,
+      "grad_norm": 0.6987001895904541,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 17363
+    },
+    {
+      "epoch": 0.17364,
+      "grad_norm": 0.6975489258766174,
+      "learning_rate": 0.003,
+      "loss": 3.9576,
+      "step": 17364
+    },
+    {
+      "epoch": 0.17365,
+      "grad_norm": 0.8965944647789001,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 17365
+    },
+    {
+      "epoch": 0.17366,
+      "grad_norm": 1.063056230545044,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 17366
+    },
+    {
+      "epoch": 0.17367,
+      "grad_norm": 1.0388587713241577,
+      "learning_rate": 0.003,
+      "loss": 3.9816,
+      "step": 17367
+    },
+    {
+      "epoch": 0.17368,
+      "grad_norm": 0.911588728427887,
+      "learning_rate": 0.003,
+      "loss": 3.9707,
+      "step": 17368
+    },
+    {
+      "epoch": 0.17369,
+      "grad_norm": 0.8996477723121643,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 17369
+    },
+    {
+      "epoch": 0.1737,
+      "grad_norm": 0.9115211367607117,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 17370
+    },
+    {
+      "epoch": 0.17371,
+      "grad_norm": 0.8117915391921997,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 17371
+    },
+    {
+      "epoch": 0.17372,
+      "grad_norm": 1.026132345199585,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 17372
+    },
+    {
+      "epoch": 0.17373,
+      "grad_norm": 1.2777268886566162,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 17373
+    },
+    {
+      "epoch": 0.17374,
+      "grad_norm": 0.8393291234970093,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 17374
+    },
+    {
+      "epoch": 0.17375,
+      "grad_norm": 0.724292516708374,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 17375
+    },
+    {
+      "epoch": 0.17376,
+      "grad_norm": 0.6945507526397705,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 17376
+    },
+    {
+      "epoch": 0.17377,
+      "grad_norm": 0.693756639957428,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 17377
+    },
+    {
+      "epoch": 0.17378,
+      "grad_norm": 0.8418163061141968,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 17378
+    },
+    {
+      "epoch": 0.17379,
+      "grad_norm": 0.6874514222145081,
+      "learning_rate": 0.003,
+      "loss": 3.9736,
+      "step": 17379
+    },
+    {
+      "epoch": 0.1738,
+      "grad_norm": 0.6427291035652161,
+      "learning_rate": 0.003,
+      "loss": 3.9605,
+      "step": 17380
+    },
+    {
+      "epoch": 0.17381,
+      "grad_norm": 0.6788190007209778,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 17381
+    },
+    {
+      "epoch": 0.17382,
+      "grad_norm": 0.7309727668762207,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 17382
+    },
+    {
+      "epoch": 0.17383,
+      "grad_norm": 0.8250257968902588,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 17383
+    },
+    {
+      "epoch": 0.17384,
+      "grad_norm": 1.0060784816741943,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 17384
+    },
+    {
+      "epoch": 0.17385,
+      "grad_norm": 1.2069662809371948,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 17385
+    },
+    {
+      "epoch": 0.17386,
+      "grad_norm": 0.9430493712425232,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 17386
+    },
+    {
+      "epoch": 0.17387,
+      "grad_norm": 0.9377607107162476,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 17387
+    },
+    {
+      "epoch": 0.17388,
+      "grad_norm": 0.9175955057144165,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 17388
+    },
+    {
+      "epoch": 0.17389,
+      "grad_norm": 0.9722581505775452,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 17389
+    },
+    {
+      "epoch": 0.1739,
+      "grad_norm": 0.8540912866592407,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 17390
+    },
+    {
+      "epoch": 0.17391,
+      "grad_norm": 0.8366203308105469,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 17391
+    },
+    {
+      "epoch": 0.17392,
+      "grad_norm": 0.7991741299629211,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 17392
+    },
+    {
+      "epoch": 0.17393,
+      "grad_norm": 0.7675210237503052,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 17393
+    },
+    {
+      "epoch": 0.17394,
+      "grad_norm": 0.9029843807220459,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 17394
+    },
+    {
+      "epoch": 0.17395,
+      "grad_norm": 0.9093694090843201,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 17395
+    },
+    {
+      "epoch": 0.17396,
+      "grad_norm": 1.017406940460205,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 17396
+    },
+    {
+      "epoch": 0.17397,
+      "grad_norm": 1.1229156255722046,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 17397
+    },
+    {
+      "epoch": 0.17398,
+      "grad_norm": 1.0047029256820679,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 17398
+    },
+    {
+      "epoch": 0.17399,
+      "grad_norm": 0.9616352319717407,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 17399
+    },
+    {
+      "epoch": 0.174,
+      "grad_norm": 0.7757841348648071,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 17400
+    },
+    {
+      "epoch": 0.17401,
+      "grad_norm": 0.8163595795631409,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 17401
+    },
+    {
+      "epoch": 0.17402,
+      "grad_norm": 0.7642260789871216,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17402
+    },
+    {
+      "epoch": 0.17403,
+      "grad_norm": 0.7993416786193848,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 17403
+    },
+    {
+      "epoch": 0.17404,
+      "grad_norm": 0.7023702263832092,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 17404
+    },
+    {
+      "epoch": 0.17405,
+      "grad_norm": 0.6713720560073853,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 17405
+    },
+    {
+      "epoch": 0.17406,
+      "grad_norm": 0.7091724872589111,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 17406
+    },
+    {
+      "epoch": 0.17407,
+      "grad_norm": 0.6860936880111694,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 17407
+    },
+    {
+      "epoch": 0.17408,
+      "grad_norm": 0.6985510587692261,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 17408
+    },
+    {
+      "epoch": 0.17409,
+      "grad_norm": 0.753868043422699,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 17409
+    },
+    {
+      "epoch": 0.1741,
+      "grad_norm": 0.9177746176719666,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 17410
+    },
+    {
+      "epoch": 0.17411,
+      "grad_norm": 1.244633436203003,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 17411
+    },
+    {
+      "epoch": 0.17412,
+      "grad_norm": 1.0975927114486694,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 17412
+    },
+    {
+      "epoch": 0.17413,
+      "grad_norm": 1.0281498432159424,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 17413
+    },
+    {
+      "epoch": 0.17414,
+      "grad_norm": 1.0181832313537598,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 17414
+    },
+    {
+      "epoch": 0.17415,
+      "grad_norm": 0.9445314407348633,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 17415
+    },
+    {
+      "epoch": 0.17416,
+      "grad_norm": 0.8794584274291992,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 17416
+    },
+    {
+      "epoch": 0.17417,
+      "grad_norm": 0.839174211025238,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 17417
+    },
+    {
+      "epoch": 0.17418,
+      "grad_norm": 0.7632862329483032,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 17418
+    },
+    {
+      "epoch": 0.17419,
+      "grad_norm": 0.7480762004852295,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 17419
+    },
+    {
+      "epoch": 0.1742,
+      "grad_norm": 0.9292443990707397,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 17420
+    },
+    {
+      "epoch": 0.17421,
+      "grad_norm": 1.1653343439102173,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 17421
+    },
+    {
+      "epoch": 0.17422,
+      "grad_norm": 0.8291074633598328,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 17422
+    },
+    {
+      "epoch": 0.17423,
+      "grad_norm": 0.6278573870658875,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17423
+    },
+    {
+      "epoch": 0.17424,
+      "grad_norm": 0.5656075477600098,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 17424
+    },
+    {
+      "epoch": 0.17425,
+      "grad_norm": 0.6245643496513367,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 17425
+    },
+    {
+      "epoch": 0.17426,
+      "grad_norm": 0.5158340930938721,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17426
+    },
+    {
+      "epoch": 0.17427,
+      "grad_norm": 0.5346347689628601,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 17427
+    },
+    {
+      "epoch": 0.17428,
+      "grad_norm": 0.6253666281700134,
+      "learning_rate": 0.003,
+      "loss": 3.9777,
+      "step": 17428
+    },
+    {
+      "epoch": 0.17429,
+      "grad_norm": 0.6902015209197998,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 17429
+    },
+    {
+      "epoch": 0.1743,
+      "grad_norm": 0.7939674854278564,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 17430
+    },
+    {
+      "epoch": 0.17431,
+      "grad_norm": 0.9390441179275513,
+      "learning_rate": 0.003,
+      "loss": 3.9719,
+      "step": 17431
+    },
+    {
+      "epoch": 0.17432,
+      "grad_norm": 1.0236916542053223,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 17432
+    },
+    {
+      "epoch": 0.17433,
+      "grad_norm": 0.9324219822883606,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 17433
+    },
+    {
+      "epoch": 0.17434,
+      "grad_norm": 0.8530586957931519,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 17434
+    },
+    {
+      "epoch": 0.17435,
+      "grad_norm": 0.8744862079620361,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 17435
+    },
+    {
+      "epoch": 0.17436,
+      "grad_norm": 0.999214768409729,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 17436
+    },
+    {
+      "epoch": 0.17437,
+      "grad_norm": 1.099568247795105,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 17437
+    },
+    {
+      "epoch": 0.17438,
+      "grad_norm": 0.8788972496986389,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 17438
+    },
+    {
+      "epoch": 0.17439,
+      "grad_norm": 0.7022820711135864,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 17439
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.6724662780761719,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 17440
+    },
+    {
+      "epoch": 0.17441,
+      "grad_norm": 0.7439224123954773,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 17441
+    },
+    {
+      "epoch": 0.17442,
+      "grad_norm": 0.7784821391105652,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 17442
+    },
+    {
+      "epoch": 0.17443,
+      "grad_norm": 0.7862952947616577,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 17443
+    },
+    {
+      "epoch": 0.17444,
+      "grad_norm": 0.9709135293960571,
+      "learning_rate": 0.003,
+      "loss": 3.9712,
+      "step": 17444
+    },
+    {
+      "epoch": 0.17445,
+      "grad_norm": 1.1998265981674194,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 17445
+    },
+    {
+      "epoch": 0.17446,
+      "grad_norm": 0.9091138243675232,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 17446
+    },
+    {
+      "epoch": 0.17447,
+      "grad_norm": 0.9684601426124573,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 17447
+    },
+    {
+      "epoch": 0.17448,
+      "grad_norm": 1.0080211162567139,
+      "learning_rate": 0.003,
+      "loss": 4.0333,
+      "step": 17448
+    },
+    {
+      "epoch": 0.17449,
+      "grad_norm": 0.9079150557518005,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 17449
+    },
+    {
+      "epoch": 0.1745,
+      "grad_norm": 0.6905058026313782,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 17450
+    },
+    {
+      "epoch": 0.17451,
+      "grad_norm": 0.632532000541687,
+      "learning_rate": 0.003,
+      "loss": 3.9696,
+      "step": 17451
+    },
+    {
+      "epoch": 0.17452,
+      "grad_norm": 0.7154324054718018,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 17452
+    },
+    {
+      "epoch": 0.17453,
+      "grad_norm": 0.7559896111488342,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 17453
+    },
+    {
+      "epoch": 0.17454,
+      "grad_norm": 0.7690410017967224,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 17454
+    },
+    {
+      "epoch": 0.17455,
+      "grad_norm": 0.7744832038879395,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 17455
+    },
+    {
+      "epoch": 0.17456,
+      "grad_norm": 0.787226676940918,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 17456
+    },
+    {
+      "epoch": 0.17457,
+      "grad_norm": 0.8192123770713806,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 17457
+    },
+    {
+      "epoch": 0.17458,
+      "grad_norm": 0.8754841089248657,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 17458
+    },
+    {
+      "epoch": 0.17459,
+      "grad_norm": 0.8357239365577698,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 17459
+    },
+    {
+      "epoch": 0.1746,
+      "grad_norm": 0.9402945637702942,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 17460
+    },
+    {
+      "epoch": 0.17461,
+      "grad_norm": 1.1382694244384766,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 17461
+    },
+    {
+      "epoch": 0.17462,
+      "grad_norm": 0.9957150816917419,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 17462
+    },
+    {
+      "epoch": 0.17463,
+      "grad_norm": 0.9740569591522217,
+      "learning_rate": 0.003,
+      "loss": 3.9782,
+      "step": 17463
+    },
+    {
+      "epoch": 0.17464,
+      "grad_norm": 0.8861000537872314,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 17464
+    },
+    {
+      "epoch": 0.17465,
+      "grad_norm": 0.7433106303215027,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 17465
+    },
+    {
+      "epoch": 0.17466,
+      "grad_norm": 0.6907755732536316,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 17466
+    },
+    {
+      "epoch": 0.17467,
+      "grad_norm": 0.7396727204322815,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 17467
+    },
+    {
+      "epoch": 0.17468,
+      "grad_norm": 0.7633123993873596,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 17468
+    },
+    {
+      "epoch": 0.17469,
+      "grad_norm": 0.6752055883407593,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 17469
+    },
+    {
+      "epoch": 0.1747,
+      "grad_norm": 0.6542142033576965,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 17470
+    },
+    {
+      "epoch": 0.17471,
+      "grad_norm": 0.724885106086731,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 17471
+    },
+    {
+      "epoch": 0.17472,
+      "grad_norm": 0.7976656556129456,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 17472
+    },
+    {
+      "epoch": 0.17473,
+      "grad_norm": 0.8885031342506409,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 17473
+    },
+    {
+      "epoch": 0.17474,
+      "grad_norm": 1.2055965662002563,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 17474
+    },
+    {
+      "epoch": 0.17475,
+      "grad_norm": 1.0816218852996826,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 17475
+    },
+    {
+      "epoch": 0.17476,
+      "grad_norm": 0.8149133920669556,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 17476
+    },
+    {
+      "epoch": 0.17477,
+      "grad_norm": 0.7656036615371704,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 17477
+    },
+    {
+      "epoch": 0.17478,
+      "grad_norm": 0.7564748525619507,
+      "learning_rate": 0.003,
+      "loss": 3.9814,
+      "step": 17478
+    },
+    {
+      "epoch": 0.17479,
+      "grad_norm": 0.7418794631958008,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 17479
+    },
+    {
+      "epoch": 0.1748,
+      "grad_norm": 0.6966302394866943,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 17480
+    },
+    {
+      "epoch": 0.17481,
+      "grad_norm": 0.6780658960342407,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 17481
+    },
+    {
+      "epoch": 0.17482,
+      "grad_norm": 0.7791665196418762,
+      "learning_rate": 0.003,
+      "loss": 3.9741,
+      "step": 17482
+    },
+    {
+      "epoch": 0.17483,
+      "grad_norm": 0.7765859365463257,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 17483
+    },
+    {
+      "epoch": 0.17484,
+      "grad_norm": 0.9068479537963867,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 17484
+    },
+    {
+      "epoch": 0.17485,
+      "grad_norm": 1.1002610921859741,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 17485
+    },
+    {
+      "epoch": 0.17486,
+      "grad_norm": 0.99592524766922,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 17486
+    },
+    {
+      "epoch": 0.17487,
+      "grad_norm": 1.0522546768188477,
+      "learning_rate": 0.003,
+      "loss": 3.9806,
+      "step": 17487
+    },
+    {
+      "epoch": 0.17488,
+      "grad_norm": 0.7485376000404358,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 17488
+    },
+    {
+      "epoch": 0.17489,
+      "grad_norm": 0.6797248125076294,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 17489
+    },
+    {
+      "epoch": 0.1749,
+      "grad_norm": 0.7047320604324341,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 17490
+    },
+    {
+      "epoch": 0.17491,
+      "grad_norm": 0.8675941824913025,
+      "learning_rate": 0.003,
+      "loss": 3.9782,
+      "step": 17491
+    },
+    {
+      "epoch": 0.17492,
+      "grad_norm": 0.8862576484680176,
+      "learning_rate": 0.003,
+      "loss": 3.9765,
+      "step": 17492
+    },
+    {
+      "epoch": 0.17493,
+      "grad_norm": 0.9569544792175293,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 17493
+    },
+    {
+      "epoch": 0.17494,
+      "grad_norm": 0.8425646424293518,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 17494
+    },
+    {
+      "epoch": 0.17495,
+      "grad_norm": 0.785433828830719,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 17495
+    },
+    {
+      "epoch": 0.17496,
+      "grad_norm": 0.7538527846336365,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 17496
+    },
+    {
+      "epoch": 0.17497,
+      "grad_norm": 0.7240882515907288,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 17497
+    },
+    {
+      "epoch": 0.17498,
+      "grad_norm": 0.7209701538085938,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 17498
+    },
+    {
+      "epoch": 0.17499,
+      "grad_norm": 0.7353964447975159,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 17499
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.8131001591682434,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 17500
+    },
+    {
+      "epoch": 0.17501,
+      "grad_norm": 1.0257822275161743,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 17501
+    },
+    {
+      "epoch": 0.17502,
+      "grad_norm": 1.1355948448181152,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 17502
+    },
+    {
+      "epoch": 0.17503,
+      "grad_norm": 0.8676596879959106,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 17503
+    },
+    {
+      "epoch": 0.17504,
+      "grad_norm": 0.7113892436027527,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 17504
+    },
+    {
+      "epoch": 0.17505,
+      "grad_norm": 0.7276456952095032,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 17505
+    },
+    {
+      "epoch": 0.17506,
+      "grad_norm": 0.8023135662078857,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 17506
+    },
+    {
+      "epoch": 0.17507,
+      "grad_norm": 0.8918072581291199,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 17507
+    },
+    {
+      "epoch": 0.17508,
+      "grad_norm": 0.9726418256759644,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 17508
+    },
+    {
+      "epoch": 0.17509,
+      "grad_norm": 1.2349975109100342,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 17509
+    },
+    {
+      "epoch": 0.1751,
+      "grad_norm": 0.7286527156829834,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 17510
+    },
+    {
+      "epoch": 0.17511,
+      "grad_norm": 0.7270987033843994,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 17511
+    },
+    {
+      "epoch": 0.17512,
+      "grad_norm": 0.9057313203811646,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 17512
+    },
+    {
+      "epoch": 0.17513,
+      "grad_norm": 1.2127231359481812,
+      "learning_rate": 0.003,
+      "loss": 4.0523,
+      "step": 17513
+    },
+    {
+      "epoch": 0.17514,
+      "grad_norm": 1.0335619449615479,
+      "learning_rate": 0.003,
+      "loss": 4.0343,
+      "step": 17514
+    },
+    {
+      "epoch": 0.17515,
+      "grad_norm": 1.0899924039840698,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 17515
+    },
+    {
+      "epoch": 0.17516,
+      "grad_norm": 0.788173496723175,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 17516
+    },
+    {
+      "epoch": 0.17517,
+      "grad_norm": 0.7215850949287415,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 17517
+    },
+    {
+      "epoch": 0.17518,
+      "grad_norm": 0.6537896394729614,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 17518
+    },
+    {
+      "epoch": 0.17519,
+      "grad_norm": 0.7272935509681702,
+      "learning_rate": 0.003,
+      "loss": 3.9806,
+      "step": 17519
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.8016128540039062,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 17520
+    },
+    {
+      "epoch": 0.17521,
+      "grad_norm": 0.9136723279953003,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 17521
+    },
+    {
+      "epoch": 0.17522,
+      "grad_norm": 1.1162629127502441,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 17522
+    },
+    {
+      "epoch": 0.17523,
+      "grad_norm": 1.1937209367752075,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 17523
+    },
+    {
+      "epoch": 0.17524,
+      "grad_norm": 0.7823119163513184,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 17524
+    },
+    {
+      "epoch": 0.17525,
+      "grad_norm": 0.5235046148300171,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 17525
+    },
+    {
+      "epoch": 0.17526,
+      "grad_norm": 0.6116799712181091,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 17526
+    },
+    {
+      "epoch": 0.17527,
+      "grad_norm": 0.5505747199058533,
+      "learning_rate": 0.003,
+      "loss": 3.969,
+      "step": 17527
+    },
+    {
+      "epoch": 0.17528,
+      "grad_norm": 0.6316537857055664,
+      "learning_rate": 0.003,
+      "loss": 3.9787,
+      "step": 17528
+    },
+    {
+      "epoch": 0.17529,
+      "grad_norm": 0.7600823044776917,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 17529
+    },
+    {
+      "epoch": 0.1753,
+      "grad_norm": 0.9958966374397278,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 17530
+    },
+    {
+      "epoch": 0.17531,
+      "grad_norm": 1.2995574474334717,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 17531
+    },
+    {
+      "epoch": 0.17532,
+      "grad_norm": 0.7968376874923706,
+      "learning_rate": 0.003,
+      "loss": 3.9529,
+      "step": 17532
+    },
+    {
+      "epoch": 0.17533,
+      "grad_norm": 0.7426732778549194,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 17533
+    },
+    {
+      "epoch": 0.17534,
+      "grad_norm": 0.6420416831970215,
+      "learning_rate": 0.003,
+      "loss": 3.9794,
+      "step": 17534
+    },
+    {
+      "epoch": 0.17535,
+      "grad_norm": 0.6831434369087219,
+      "learning_rate": 0.003,
+      "loss": 3.9599,
+      "step": 17535
+    },
+    {
+      "epoch": 0.17536,
+      "grad_norm": 0.7888033986091614,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 17536
+    },
+    {
+      "epoch": 0.17537,
+      "grad_norm": 0.7804220914840698,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 17537
+    },
+    {
+      "epoch": 0.17538,
+      "grad_norm": 0.7175537347793579,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 17538
+    },
+    {
+      "epoch": 0.17539,
+      "grad_norm": 0.867682933807373,
+      "learning_rate": 0.003,
+      "loss": 3.9798,
+      "step": 17539
+    },
+    {
+      "epoch": 0.1754,
+      "grad_norm": 0.8793272376060486,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 17540
+    },
+    {
+      "epoch": 0.17541,
+      "grad_norm": 0.8773455023765564,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 17541
+    },
+    {
+      "epoch": 0.17542,
+      "grad_norm": 0.8339686393737793,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 17542
+    },
+    {
+      "epoch": 0.17543,
+      "grad_norm": 0.8761228919029236,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 17543
+    },
+    {
+      "epoch": 0.17544,
+      "grad_norm": 0.9914465546607971,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 17544
+    },
+    {
+      "epoch": 0.17545,
+      "grad_norm": 1.115749478340149,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 17545
+    },
+    {
+      "epoch": 0.17546,
+      "grad_norm": 0.8497521877288818,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 17546
+    },
+    {
+      "epoch": 0.17547,
+      "grad_norm": 0.8092749118804932,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 17547
+    },
+    {
+      "epoch": 0.17548,
+      "grad_norm": 0.732430636882782,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 17548
+    },
+    {
+      "epoch": 0.17549,
+      "grad_norm": 0.6795975565910339,
+      "learning_rate": 0.003,
+      "loss": 3.985,
+      "step": 17549
+    },
+    {
+      "epoch": 0.1755,
+      "grad_norm": 0.5819090008735657,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 17550
+    },
+    {
+      "epoch": 0.17551,
+      "grad_norm": 0.6748693585395813,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 17551
+    },
+    {
+      "epoch": 0.17552,
+      "grad_norm": 0.9079766273498535,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 17552
+    },
+    {
+      "epoch": 0.17553,
+      "grad_norm": 1.239787220954895,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 17553
+    },
+    {
+      "epoch": 0.17554,
+      "grad_norm": 0.8258132934570312,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 17554
+    },
+    {
+      "epoch": 0.17555,
+      "grad_norm": 0.7910948395729065,
+      "learning_rate": 0.003,
+      "loss": 3.9695,
+      "step": 17555
+    },
+    {
+      "epoch": 0.17556,
+      "grad_norm": 1.1124839782714844,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 17556
+    },
+    {
+      "epoch": 0.17557,
+      "grad_norm": 1.1162309646606445,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 17557
+    },
+    {
+      "epoch": 0.17558,
+      "grad_norm": 0.9350470304489136,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 17558
+    },
+    {
+      "epoch": 0.17559,
+      "grad_norm": 0.8715490102767944,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 17559
+    },
+    {
+      "epoch": 0.1756,
+      "grad_norm": 0.7804958820343018,
+      "learning_rate": 0.003,
+      "loss": 3.9719,
+      "step": 17560
+    },
+    {
+      "epoch": 0.17561,
+      "grad_norm": 0.8452410697937012,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 17561
+    },
+    {
+      "epoch": 0.17562,
+      "grad_norm": 0.81874680519104,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 17562
+    },
+    {
+      "epoch": 0.17563,
+      "grad_norm": 0.7408578991889954,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 17563
+    },
+    {
+      "epoch": 0.17564,
+      "grad_norm": 0.6901379227638245,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 17564
+    },
+    {
+      "epoch": 0.17565,
+      "grad_norm": 0.6317659616470337,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 17565
+    },
+    {
+      "epoch": 0.17566,
+      "grad_norm": 0.7332043647766113,
+      "learning_rate": 0.003,
+      "loss": 3.977,
+      "step": 17566
+    },
+    {
+      "epoch": 0.17567,
+      "grad_norm": 0.9280351400375366,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 17567
+    },
+    {
+      "epoch": 0.17568,
+      "grad_norm": 0.961087703704834,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 17568
+    },
+    {
+      "epoch": 0.17569,
+      "grad_norm": 1.0218905210494995,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 17569
+    },
+    {
+      "epoch": 0.1757,
+      "grad_norm": 1.013644814491272,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 17570
+    },
+    {
+      "epoch": 0.17571,
+      "grad_norm": 0.866844654083252,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 17571
+    },
+    {
+      "epoch": 0.17572,
+      "grad_norm": 0.9111921787261963,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 17572
+    },
+    {
+      "epoch": 0.17573,
+      "grad_norm": 0.7767279148101807,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 17573
+    },
+    {
+      "epoch": 0.17574,
+      "grad_norm": 0.6741935014724731,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 17574
+    },
+    {
+      "epoch": 0.17575,
+      "grad_norm": 0.7317989468574524,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 17575
+    },
+    {
+      "epoch": 0.17576,
+      "grad_norm": 0.9040318727493286,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 17576
+    },
+    {
+      "epoch": 0.17577,
+      "grad_norm": 1.078898549079895,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 17577
+    },
+    {
+      "epoch": 0.17578,
+      "grad_norm": 0.9824024438858032,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 17578
+    },
+    {
+      "epoch": 0.17579,
+      "grad_norm": 1.0256359577178955,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 17579
+    },
+    {
+      "epoch": 0.1758,
+      "grad_norm": 0.9958242177963257,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 17580
+    },
+    {
+      "epoch": 0.17581,
+      "grad_norm": 0.9190477728843689,
+      "learning_rate": 0.003,
+      "loss": 4.0375,
+      "step": 17581
+    },
+    {
+      "epoch": 0.17582,
+      "grad_norm": 0.7948879599571228,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 17582
+    },
+    {
+      "epoch": 0.17583,
+      "grad_norm": 0.7672145366668701,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 17583
+    },
+    {
+      "epoch": 0.17584,
+      "grad_norm": 0.7990474700927734,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 17584
+    },
+    {
+      "epoch": 0.17585,
+      "grad_norm": 0.8452971577644348,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 17585
+    },
+    {
+      "epoch": 0.17586,
+      "grad_norm": 0.8646570444107056,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 17586
+    },
+    {
+      "epoch": 0.17587,
+      "grad_norm": 0.8637148141860962,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 17587
+    },
+    {
+      "epoch": 0.17588,
+      "grad_norm": 0.8240643739700317,
+      "learning_rate": 0.003,
+      "loss": 4.0463,
+      "step": 17588
+    },
+    {
+      "epoch": 0.17589,
+      "grad_norm": 0.9793906211853027,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 17589
+    },
+    {
+      "epoch": 0.1759,
+      "grad_norm": 1.1525877714157104,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 17590
+    },
+    {
+      "epoch": 0.17591,
+      "grad_norm": 0.8227669596672058,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 17591
+    },
+    {
+      "epoch": 0.17592,
+      "grad_norm": 0.6809818744659424,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 17592
+    },
+    {
+      "epoch": 0.17593,
+      "grad_norm": 0.559374213218689,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 17593
+    },
+    {
+      "epoch": 0.17594,
+      "grad_norm": 0.5239890217781067,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 17594
+    },
+    {
+      "epoch": 0.17595,
+      "grad_norm": 0.5396626591682434,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 17595
+    },
+    {
+      "epoch": 0.17596,
+      "grad_norm": 0.6020597219467163,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 17596
+    },
+    {
+      "epoch": 0.17597,
+      "grad_norm": 0.6247676610946655,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 17597
+    },
+    {
+      "epoch": 0.17598,
+      "grad_norm": 0.7158964276313782,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 17598
+    },
+    {
+      "epoch": 0.17599,
+      "grad_norm": 0.8355786800384521,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 17599
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 1.1567732095718384,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 17600
+    },
+    {
+      "epoch": 0.17601,
+      "grad_norm": 1.0251920223236084,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 17601
+    },
+    {
+      "epoch": 0.17602,
+      "grad_norm": 0.8615152835845947,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 17602
+    },
+    {
+      "epoch": 0.17603,
+      "grad_norm": 0.7872258424758911,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 17603
+    },
+    {
+      "epoch": 0.17604,
+      "grad_norm": 0.7728003859519958,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 17604
+    },
+    {
+      "epoch": 0.17605,
+      "grad_norm": 0.9679343104362488,
+      "learning_rate": 0.003,
+      "loss": 4.0289,
+      "step": 17605
+    },
+    {
+      "epoch": 0.17606,
+      "grad_norm": 0.9719768762588501,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 17606
+    },
+    {
+      "epoch": 0.17607,
+      "grad_norm": 1.1245415210723877,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 17607
+    },
+    {
+      "epoch": 0.17608,
+      "grad_norm": 0.9123148322105408,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 17608
+    },
+    {
+      "epoch": 0.17609,
+      "grad_norm": 0.8522974252700806,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 17609
+    },
+    {
+      "epoch": 0.1761,
+      "grad_norm": 0.9533160328865051,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 17610
+    },
+    {
+      "epoch": 0.17611,
+      "grad_norm": 1.060482382774353,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 17611
+    },
+    {
+      "epoch": 0.17612,
+      "grad_norm": 1.0258617401123047,
+      "learning_rate": 0.003,
+      "loss": 3.951,
+      "step": 17612
+    },
+    {
+      "epoch": 0.17613,
+      "grad_norm": 0.9553532600402832,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 17613
+    },
+    {
+      "epoch": 0.17614,
+      "grad_norm": 0.9101959466934204,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 17614
+    },
+    {
+      "epoch": 0.17615,
+      "grad_norm": 0.8651487827301025,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 17615
+    },
+    {
+      "epoch": 0.17616,
+      "grad_norm": 0.8503139019012451,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 17616
+    },
+    {
+      "epoch": 0.17617,
+      "grad_norm": 0.8145707845687866,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 17617
+    },
+    {
+      "epoch": 0.17618,
+      "grad_norm": 0.8305964469909668,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 17618
+    },
+    {
+      "epoch": 0.17619,
+      "grad_norm": 0.784797728061676,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 17619
+    },
+    {
+      "epoch": 0.1762,
+      "grad_norm": 0.7057626247406006,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 17620
+    },
+    {
+      "epoch": 0.17621,
+      "grad_norm": 0.722081184387207,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 17621
+    },
+    {
+      "epoch": 0.17622,
+      "grad_norm": 0.7251399755477905,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 17622
+    },
+    {
+      "epoch": 0.17623,
+      "grad_norm": 0.8381239175796509,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 17623
+    },
+    {
+      "epoch": 0.17624,
+      "grad_norm": 0.9010316729545593,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 17624
+    },
+    {
+      "epoch": 0.17625,
+      "grad_norm": 0.8895271420478821,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 17625
+    },
+    {
+      "epoch": 0.17626,
+      "grad_norm": 1.0438815355300903,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 17626
+    },
+    {
+      "epoch": 0.17627,
+      "grad_norm": 1.0572185516357422,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 17627
+    },
+    {
+      "epoch": 0.17628,
+      "grad_norm": 0.9122909307479858,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 17628
+    },
+    {
+      "epoch": 0.17629,
+      "grad_norm": 0.8782704472541809,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 17629
+    },
+    {
+      "epoch": 0.1763,
+      "grad_norm": 0.7416739463806152,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 17630
+    },
+    {
+      "epoch": 0.17631,
+      "grad_norm": 0.6938516497612,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 17631
+    },
+    {
+      "epoch": 0.17632,
+      "grad_norm": 0.732664167881012,
+      "learning_rate": 0.003,
+      "loss": 4.0538,
+      "step": 17632
+    },
+    {
+      "epoch": 0.17633,
+      "grad_norm": 0.7271563410758972,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 17633
+    },
+    {
+      "epoch": 0.17634,
+      "grad_norm": 0.6744954586029053,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 17634
+    },
+    {
+      "epoch": 0.17635,
+      "grad_norm": 0.6522406339645386,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 17635
+    },
+    {
+      "epoch": 0.17636,
+      "grad_norm": 0.7126379013061523,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 17636
+    },
+    {
+      "epoch": 0.17637,
+      "grad_norm": 0.8472570180892944,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 17637
+    },
+    {
+      "epoch": 0.17638,
+      "grad_norm": 1.177600383758545,
+      "learning_rate": 0.003,
+      "loss": 4.0256,
+      "step": 17638
+    },
+    {
+      "epoch": 0.17639,
+      "grad_norm": 0.9043630361557007,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 17639
+    },
+    {
+      "epoch": 0.1764,
+      "grad_norm": 0.6743602752685547,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 17640
+    },
+    {
+      "epoch": 0.17641,
+      "grad_norm": 0.540255606174469,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 17641
+    },
+    {
+      "epoch": 0.17642,
+      "grad_norm": 0.5593344569206238,
+      "learning_rate": 0.003,
+      "loss": 3.9563,
+      "step": 17642
+    },
+    {
+      "epoch": 0.17643,
+      "grad_norm": 0.6615741848945618,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 17643
+    },
+    {
+      "epoch": 0.17644,
+      "grad_norm": 0.8354815244674683,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 17644
+    },
+    {
+      "epoch": 0.17645,
+      "grad_norm": 0.9365418553352356,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 17645
+    },
+    {
+      "epoch": 0.17646,
+      "grad_norm": 0.91257643699646,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 17646
+    },
+    {
+      "epoch": 0.17647,
+      "grad_norm": 0.8841412663459778,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 17647
+    },
+    {
+      "epoch": 0.17648,
+      "grad_norm": 0.8749768733978271,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 17648
+    },
+    {
+      "epoch": 0.17649,
+      "grad_norm": 1.000697135925293,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 17649
+    },
+    {
+      "epoch": 0.1765,
+      "grad_norm": 1.0813151597976685,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 17650
+    },
+    {
+      "epoch": 0.17651,
+      "grad_norm": 0.8042722344398499,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 17651
+    },
+    {
+      "epoch": 0.17652,
+      "grad_norm": 0.7402278780937195,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 17652
+    },
+    {
+      "epoch": 0.17653,
+      "grad_norm": 0.7824121117591858,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 17653
+    },
+    {
+      "epoch": 0.17654,
+      "grad_norm": 0.7480524778366089,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 17654
+    },
+    {
+      "epoch": 0.17655,
+      "grad_norm": 0.6704444289207458,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 17655
+    },
+    {
+      "epoch": 0.17656,
+      "grad_norm": 0.8114060163497925,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 17656
+    },
+    {
+      "epoch": 0.17657,
+      "grad_norm": 0.8143280744552612,
+      "learning_rate": 0.003,
+      "loss": 3.9639,
+      "step": 17657
+    },
+    {
+      "epoch": 0.17658,
+      "grad_norm": 0.8782177567481995,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 17658
+    },
+    {
+      "epoch": 0.17659,
+      "grad_norm": 0.9757961630821228,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 17659
+    },
+    {
+      "epoch": 0.1766,
+      "grad_norm": 0.8597797751426697,
+      "learning_rate": 0.003,
+      "loss": 4.0353,
+      "step": 17660
+    },
+    {
+      "epoch": 0.17661,
+      "grad_norm": 0.8613868355751038,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 17661
+    },
+    {
+      "epoch": 0.17662,
+      "grad_norm": 1.0099040269851685,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 17662
+    },
+    {
+      "epoch": 0.17663,
+      "grad_norm": 0.9791454076766968,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 17663
+    },
+    {
+      "epoch": 0.17664,
+      "grad_norm": 0.9757301807403564,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 17664
+    },
+    {
+      "epoch": 0.17665,
+      "grad_norm": 1.0718437433242798,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 17665
+    },
+    {
+      "epoch": 0.17666,
+      "grad_norm": 1.065012812614441,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 17666
+    },
+    {
+      "epoch": 0.17667,
+      "grad_norm": 1.106485366821289,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 17667
+    },
+    {
+      "epoch": 0.17668,
+      "grad_norm": 0.8106964826583862,
+      "learning_rate": 0.003,
+      "loss": 4.0373,
+      "step": 17668
+    },
+    {
+      "epoch": 0.17669,
+      "grad_norm": 0.6266219019889832,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 17669
+    },
+    {
+      "epoch": 0.1767,
+      "grad_norm": 0.6873462796211243,
+      "learning_rate": 0.003,
+      "loss": 3.9904,
+      "step": 17670
+    },
+    {
+      "epoch": 0.17671,
+      "grad_norm": 0.9363231062889099,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 17671
+    },
+    {
+      "epoch": 0.17672,
+      "grad_norm": 1.082198977470398,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17672
+    },
+    {
+      "epoch": 0.17673,
+      "grad_norm": 0.9996940493583679,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 17673
+    },
+    {
+      "epoch": 0.17674,
+      "grad_norm": 0.9465346932411194,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 17674
+    },
+    {
+      "epoch": 0.17675,
+      "grad_norm": 0.9348333477973938,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 17675
+    },
+    {
+      "epoch": 0.17676,
+      "grad_norm": 0.8782925009727478,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 17676
+    },
+    {
+      "epoch": 0.17677,
+      "grad_norm": 0.7545113563537598,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 17677
+    },
+    {
+      "epoch": 0.17678,
+      "grad_norm": 0.7324469089508057,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 17678
+    },
+    {
+      "epoch": 0.17679,
+      "grad_norm": 0.6657153964042664,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 17679
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.6143937706947327,
+      "learning_rate": 0.003,
+      "loss": 3.972,
+      "step": 17680
+    },
+    {
+      "epoch": 0.17681,
+      "grad_norm": 0.5606423020362854,
+      "learning_rate": 0.003,
+      "loss": 4.0305,
+      "step": 17681
+    },
+    {
+      "epoch": 0.17682,
+      "grad_norm": 0.521074652671814,
+      "learning_rate": 0.003,
+      "loss": 3.9676,
+      "step": 17682
+    },
+    {
+      "epoch": 0.17683,
+      "grad_norm": 0.5197908878326416,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 17683
+    },
+    {
+      "epoch": 0.17684,
+      "grad_norm": 0.5607931017875671,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 17684
+    },
+    {
+      "epoch": 0.17685,
+      "grad_norm": 0.6494010090827942,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 17685
+    },
+    {
+      "epoch": 0.17686,
+      "grad_norm": 0.9178204536437988,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 17686
+    },
+    {
+      "epoch": 0.17687,
+      "grad_norm": 1.1997172832489014,
+      "learning_rate": 0.003,
+      "loss": 3.9823,
+      "step": 17687
+    },
+    {
+      "epoch": 0.17688,
+      "grad_norm": 0.9303160309791565,
+      "learning_rate": 0.003,
+      "loss": 3.983,
+      "step": 17688
+    },
+    {
+      "epoch": 0.17689,
+      "grad_norm": 0.7332910299301147,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 17689
+    },
+    {
+      "epoch": 0.1769,
+      "grad_norm": 0.6390018463134766,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 17690
+    },
+    {
+      "epoch": 0.17691,
+      "grad_norm": 0.6789406538009644,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 17691
+    },
+    {
+      "epoch": 0.17692,
+      "grad_norm": 0.76705402135849,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 17692
+    },
+    {
+      "epoch": 0.17693,
+      "grad_norm": 0.8636718392372131,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 17693
+    },
+    {
+      "epoch": 0.17694,
+      "grad_norm": 0.8098905682563782,
+      "learning_rate": 0.003,
+      "loss": 3.983,
+      "step": 17694
+    },
+    {
+      "epoch": 0.17695,
+      "grad_norm": 0.8161396384239197,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 17695
+    },
+    {
+      "epoch": 0.17696,
+      "grad_norm": 0.9196518063545227,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 17696
+    },
+    {
+      "epoch": 0.17697,
+      "grad_norm": 1.0340486764907837,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 17697
+    },
+    {
+      "epoch": 0.17698,
+      "grad_norm": 0.9755218029022217,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 17698
+    },
+    {
+      "epoch": 0.17699,
+      "grad_norm": 1.0434486865997314,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17699
+    },
+    {
+      "epoch": 0.177,
+      "grad_norm": 0.9362537264823914,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 17700
+    },
+    {
+      "epoch": 0.17701,
+      "grad_norm": 0.9179715514183044,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 17701
+    },
+    {
+      "epoch": 0.17702,
+      "grad_norm": 0.8468559384346008,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 17702
+    },
+    {
+      "epoch": 0.17703,
+      "grad_norm": 0.9051650762557983,
+      "learning_rate": 0.003,
+      "loss": 3.9721,
+      "step": 17703
+    },
+    {
+      "epoch": 0.17704,
+      "grad_norm": 0.9996410608291626,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 17704
+    },
+    {
+      "epoch": 0.17705,
+      "grad_norm": 0.9762026071548462,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 17705
+    },
+    {
+      "epoch": 0.17706,
+      "grad_norm": 0.8613827228546143,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 17706
+    },
+    {
+      "epoch": 0.17707,
+      "grad_norm": 0.8776628971099854,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 17707
+    },
+    {
+      "epoch": 0.17708,
+      "grad_norm": 1.010719895362854,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 17708
+    },
+    {
+      "epoch": 0.17709,
+      "grad_norm": 1.096747875213623,
+      "learning_rate": 0.003,
+      "loss": 4.0453,
+      "step": 17709
+    },
+    {
+      "epoch": 0.1771,
+      "grad_norm": 1.0291926860809326,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 17710
+    },
+    {
+      "epoch": 0.17711,
+      "grad_norm": 1.0713984966278076,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 17711
+    },
+    {
+      "epoch": 0.17712,
+      "grad_norm": 0.9482304453849792,
+      "learning_rate": 0.003,
+      "loss": 3.978,
+      "step": 17712
+    },
+    {
+      "epoch": 0.17713,
+      "grad_norm": 0.9021075963973999,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 17713
+    },
+    {
+      "epoch": 0.17714,
+      "grad_norm": 0.9854664206504822,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 17714
+    },
+    {
+      "epoch": 0.17715,
+      "grad_norm": 1.0893782377243042,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 17715
+    },
+    {
+      "epoch": 0.17716,
+      "grad_norm": 0.9051546454429626,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 17716
+    },
+    {
+      "epoch": 0.17717,
+      "grad_norm": 0.9802340865135193,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 17717
+    },
+    {
+      "epoch": 0.17718,
+      "grad_norm": 0.7992175817489624,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 17718
+    },
+    {
+      "epoch": 0.17719,
+      "grad_norm": 0.6974555253982544,
+      "learning_rate": 0.003,
+      "loss": 3.9825,
+      "step": 17719
+    },
+    {
+      "epoch": 0.1772,
+      "grad_norm": 0.6435659527778625,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 17720
+    },
+    {
+      "epoch": 0.17721,
+      "grad_norm": 0.6219088435173035,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 17721
+    },
+    {
+      "epoch": 0.17722,
+      "grad_norm": 0.5757900476455688,
+      "learning_rate": 0.003,
+      "loss": 3.9754,
+      "step": 17722
+    },
+    {
+      "epoch": 0.17723,
+      "grad_norm": 0.5606616735458374,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 17723
+    },
+    {
+      "epoch": 0.17724,
+      "grad_norm": 0.6454625725746155,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 17724
+    },
+    {
+      "epoch": 0.17725,
+      "grad_norm": 0.8007955551147461,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 17725
+    },
+    {
+      "epoch": 0.17726,
+      "grad_norm": 1.0168530941009521,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 17726
+    },
+    {
+      "epoch": 0.17727,
+      "grad_norm": 1.1221123933792114,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 17727
+    },
+    {
+      "epoch": 0.17728,
+      "grad_norm": 0.8694456219673157,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 17728
+    },
+    {
+      "epoch": 0.17729,
+      "grad_norm": 0.6848198771476746,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 17729
+    },
+    {
+      "epoch": 0.1773,
+      "grad_norm": 0.6626553535461426,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 17730
+    },
+    {
+      "epoch": 0.17731,
+      "grad_norm": 0.9850336909294128,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 17731
+    },
+    {
+      "epoch": 0.17732,
+      "grad_norm": 1.075743317604065,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 17732
+    },
+    {
+      "epoch": 0.17733,
+      "grad_norm": 0.8826678991317749,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 17733
+    },
+    {
+      "epoch": 0.17734,
+      "grad_norm": 0.829764187335968,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 17734
+    },
+    {
+      "epoch": 0.17735,
+      "grad_norm": 0.62376469373703,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 17735
+    },
+    {
+      "epoch": 0.17736,
+      "grad_norm": 0.6764979362487793,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 17736
+    },
+    {
+      "epoch": 0.17737,
+      "grad_norm": 0.594389021396637,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 17737
+    },
+    {
+      "epoch": 0.17738,
+      "grad_norm": 0.6828976273536682,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 17738
+    },
+    {
+      "epoch": 0.17739,
+      "grad_norm": 0.7793011665344238,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 17739
+    },
+    {
+      "epoch": 0.1774,
+      "grad_norm": 0.8825653791427612,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 17740
+    },
+    {
+      "epoch": 0.17741,
+      "grad_norm": 0.895940363407135,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 17741
+    },
+    {
+      "epoch": 0.17742,
+      "grad_norm": 0.8027079105377197,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 17742
+    },
+    {
+      "epoch": 0.17743,
+      "grad_norm": 0.7605303525924683,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 17743
+    },
+    {
+      "epoch": 0.17744,
+      "grad_norm": 0.736718475818634,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17744
+    },
+    {
+      "epoch": 0.17745,
+      "grad_norm": 0.7389856576919556,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17745
+    },
+    {
+      "epoch": 0.17746,
+      "grad_norm": 0.8453066945075989,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 17746
+    },
+    {
+      "epoch": 0.17747,
+      "grad_norm": 1.043473482131958,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 17747
+    },
+    {
+      "epoch": 0.17748,
+      "grad_norm": 1.101148247718811,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 17748
+    },
+    {
+      "epoch": 0.17749,
+      "grad_norm": 0.9800677299499512,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 17749
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 1.0185678005218506,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 17750
+    },
+    {
+      "epoch": 0.17751,
+      "grad_norm": 1.0174145698547363,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 17751
+    },
+    {
+      "epoch": 0.17752,
+      "grad_norm": 0.8327456116676331,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 17752
+    },
+    {
+      "epoch": 0.17753,
+      "grad_norm": 0.7818776369094849,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 17753
+    },
+    {
+      "epoch": 0.17754,
+      "grad_norm": 0.8434064388275146,
+      "learning_rate": 0.003,
+      "loss": 4.0355,
+      "step": 17754
+    },
+    {
+      "epoch": 0.17755,
+      "grad_norm": 0.8647765517234802,
+      "learning_rate": 0.003,
+      "loss": 4.0245,
+      "step": 17755
+    },
+    {
+      "epoch": 0.17756,
+      "grad_norm": 0.831998884677887,
+      "learning_rate": 0.003,
+      "loss": 3.9843,
+      "step": 17756
+    },
+    {
+      "epoch": 0.17757,
+      "grad_norm": 0.7533503174781799,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 17757
+    },
+    {
+      "epoch": 0.17758,
+      "grad_norm": 0.6916507482528687,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 17758
+    },
+    {
+      "epoch": 0.17759,
+      "grad_norm": 0.5789796710014343,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 17759
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.5916829705238342,
+      "learning_rate": 0.003,
+      "loss": 3.9806,
+      "step": 17760
+    },
+    {
+      "epoch": 0.17761,
+      "grad_norm": 0.5936658978462219,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 17761
+    },
+    {
+      "epoch": 0.17762,
+      "grad_norm": 0.6573385000228882,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 17762
+    },
+    {
+      "epoch": 0.17763,
+      "grad_norm": 0.7643323540687561,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 17763
+    },
+    {
+      "epoch": 0.17764,
+      "grad_norm": 1.0655878782272339,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17764
+    },
+    {
+      "epoch": 0.17765,
+      "grad_norm": 1.1750019788742065,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 17765
+    },
+    {
+      "epoch": 0.17766,
+      "grad_norm": 0.6205437779426575,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 17766
+    },
+    {
+      "epoch": 0.17767,
+      "grad_norm": 0.6816059947013855,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 17767
+    },
+    {
+      "epoch": 0.17768,
+      "grad_norm": 0.936151385307312,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 17768
+    },
+    {
+      "epoch": 0.17769,
+      "grad_norm": 0.905048131942749,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 17769
+    },
+    {
+      "epoch": 0.1777,
+      "grad_norm": 0.9736447334289551,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 17770
+    },
+    {
+      "epoch": 0.17771,
+      "grad_norm": 0.9936445951461792,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 17771
+    },
+    {
+      "epoch": 0.17772,
+      "grad_norm": 0.8724110722541809,
+      "learning_rate": 0.003,
+      "loss": 4.0052,
+      "step": 17772
+    },
+    {
+      "epoch": 0.17773,
+      "grad_norm": 0.8918551802635193,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 17773
+    },
+    {
+      "epoch": 0.17774,
+      "grad_norm": 1.0295112133026123,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 17774
+    },
+    {
+      "epoch": 0.17775,
+      "grad_norm": 1.0086634159088135,
+      "learning_rate": 0.003,
+      "loss": 3.976,
+      "step": 17775
+    },
+    {
+      "epoch": 0.17776,
+      "grad_norm": 0.9048023819923401,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 17776
+    },
+    {
+      "epoch": 0.17777,
+      "grad_norm": 0.7482655048370361,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 17777
+    },
+    {
+      "epoch": 0.17778,
+      "grad_norm": 0.7453811168670654,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 17778
+    },
+    {
+      "epoch": 0.17779,
+      "grad_norm": 0.7985710501670837,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 17779
+    },
+    {
+      "epoch": 0.1778,
+      "grad_norm": 0.8785742521286011,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 17780
+    },
+    {
+      "epoch": 0.17781,
+      "grad_norm": 0.9537194967269897,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 17781
+    },
+    {
+      "epoch": 0.17782,
+      "grad_norm": 0.8183233141899109,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 17782
+    },
+    {
+      "epoch": 0.17783,
+      "grad_norm": 0.7182443737983704,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 17783
+    },
+    {
+      "epoch": 0.17784,
+      "grad_norm": 0.7940345406532288,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 17784
+    },
+    {
+      "epoch": 0.17785,
+      "grad_norm": 0.6857940554618835,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 17785
+    },
+    {
+      "epoch": 0.17786,
+      "grad_norm": 0.7146942019462585,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 17786
+    },
+    {
+      "epoch": 0.17787,
+      "grad_norm": 0.8008255362510681,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 17787
+    },
+    {
+      "epoch": 0.17788,
+      "grad_norm": 0.9737480282783508,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17788
+    },
+    {
+      "epoch": 0.17789,
+      "grad_norm": 1.1552413702011108,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 17789
+    },
+    {
+      "epoch": 0.1779,
+      "grad_norm": 0.997272253036499,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 17790
+    },
+    {
+      "epoch": 0.17791,
+      "grad_norm": 0.931243896484375,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 17791
+    },
+    {
+      "epoch": 0.17792,
+      "grad_norm": 0.8716450929641724,
+      "learning_rate": 0.003,
+      "loss": 3.9765,
+      "step": 17792
+    },
+    {
+      "epoch": 0.17793,
+      "grad_norm": 0.8943480253219604,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 17793
+    },
+    {
+      "epoch": 0.17794,
+      "grad_norm": 0.9397181868553162,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 17794
+    },
+    {
+      "epoch": 0.17795,
+      "grad_norm": 0.7583839893341064,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 17795
+    },
+    {
+      "epoch": 0.17796,
+      "grad_norm": 0.6900312304496765,
+      "learning_rate": 0.003,
+      "loss": 3.983,
+      "step": 17796
+    },
+    {
+      "epoch": 0.17797,
+      "grad_norm": 0.8090738654136658,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 17797
+    },
+    {
+      "epoch": 0.17798,
+      "grad_norm": 0.8548142910003662,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 17798
+    },
+    {
+      "epoch": 0.17799,
+      "grad_norm": 0.8663538694381714,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 17799
+    },
+    {
+      "epoch": 0.178,
+      "grad_norm": 0.9889453649520874,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 17800
+    },
+    {
+      "epoch": 0.17801,
+      "grad_norm": 1.1211903095245361,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 17801
+    },
+    {
+      "epoch": 0.17802,
+      "grad_norm": 0.9584589004516602,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 17802
+    },
+    {
+      "epoch": 0.17803,
+      "grad_norm": 1.0522129535675049,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 17803
+    },
+    {
+      "epoch": 0.17804,
+      "grad_norm": 0.9472935795783997,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 17804
+    },
+    {
+      "epoch": 0.17805,
+      "grad_norm": 0.9455471038818359,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 17805
+    },
+    {
+      "epoch": 0.17806,
+      "grad_norm": 0.8628377914428711,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 17806
+    },
+    {
+      "epoch": 0.17807,
+      "grad_norm": 0.7941399812698364,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 17807
+    },
+    {
+      "epoch": 0.17808,
+      "grad_norm": 0.8009958863258362,
+      "learning_rate": 0.003,
+      "loss": 3.978,
+      "step": 17808
+    },
+    {
+      "epoch": 0.17809,
+      "grad_norm": 0.8584080338478088,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 17809
+    },
+    {
+      "epoch": 0.1781,
+      "grad_norm": 0.7621896862983704,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 17810
+    },
+    {
+      "epoch": 0.17811,
+      "grad_norm": 0.7111849188804626,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 17811
+    },
+    {
+      "epoch": 0.17812,
+      "grad_norm": 0.7301157712936401,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 17812
+    },
+    {
+      "epoch": 0.17813,
+      "grad_norm": 0.7553697228431702,
+      "learning_rate": 0.003,
+      "loss": 4.0394,
+      "step": 17813
+    },
+    {
+      "epoch": 0.17814,
+      "grad_norm": 0.9111964106559753,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 17814
+    },
+    {
+      "epoch": 0.17815,
+      "grad_norm": 1.1202391386032104,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 17815
+    },
+    {
+      "epoch": 0.17816,
+      "grad_norm": 1.1454354524612427,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 17816
+    },
+    {
+      "epoch": 0.17817,
+      "grad_norm": 0.9268150329589844,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 17817
+    },
+    {
+      "epoch": 0.17818,
+      "grad_norm": 0.9023136496543884,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 17818
+    },
+    {
+      "epoch": 0.17819,
+      "grad_norm": 0.8537625670433044,
+      "learning_rate": 0.003,
+      "loss": 4.0274,
+      "step": 17819
+    },
+    {
+      "epoch": 0.1782,
+      "grad_norm": 0.8082426190376282,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 17820
+    },
+    {
+      "epoch": 0.17821,
+      "grad_norm": 0.753790020942688,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 17821
+    },
+    {
+      "epoch": 0.17822,
+      "grad_norm": 0.6044871807098389,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 17822
+    },
+    {
+      "epoch": 0.17823,
+      "grad_norm": 0.5598384737968445,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 17823
+    },
+    {
+      "epoch": 0.17824,
+      "grad_norm": 0.5412915945053101,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 17824
+    },
+    {
+      "epoch": 0.17825,
+      "grad_norm": 0.4867744445800781,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 17825
+    },
+    {
+      "epoch": 0.17826,
+      "grad_norm": 0.5870778560638428,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 17826
+    },
+    {
+      "epoch": 0.17827,
+      "grad_norm": 0.8643435835838318,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 17827
+    },
+    {
+      "epoch": 0.17828,
+      "grad_norm": 1.3006690740585327,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 17828
+    },
+    {
+      "epoch": 0.17829,
+      "grad_norm": 0.7963849902153015,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 17829
+    },
+    {
+      "epoch": 0.1783,
+      "grad_norm": 0.6619006991386414,
+      "learning_rate": 0.003,
+      "loss": 3.958,
+      "step": 17830
+    },
+    {
+      "epoch": 0.17831,
+      "grad_norm": 0.6422700881958008,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 17831
+    },
+    {
+      "epoch": 0.17832,
+      "grad_norm": 0.5084357261657715,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 17832
+    },
+    {
+      "epoch": 0.17833,
+      "grad_norm": 0.5677739977836609,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17833
+    },
+    {
+      "epoch": 0.17834,
+      "grad_norm": 0.5317157506942749,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17834
+    },
+    {
+      "epoch": 0.17835,
+      "grad_norm": 0.5522458553314209,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 17835
+    },
+    {
+      "epoch": 0.17836,
+      "grad_norm": 0.6171836853027344,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 17836
+    },
+    {
+      "epoch": 0.17837,
+      "grad_norm": 0.7240206003189087,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 17837
+    },
+    {
+      "epoch": 0.17838,
+      "grad_norm": 0.9882445335388184,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 17838
+    },
+    {
+      "epoch": 0.17839,
+      "grad_norm": 1.2277603149414062,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 17839
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.5247293710708618,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 17840
+    },
+    {
+      "epoch": 0.17841,
+      "grad_norm": 0.6750937104225159,
+      "learning_rate": 0.003,
+      "loss": 3.9708,
+      "step": 17841
+    },
+    {
+      "epoch": 0.17842,
+      "grad_norm": 0.897888720035553,
+      "learning_rate": 0.003,
+      "loss": 3.9719,
+      "step": 17842
+    },
+    {
+      "epoch": 0.17843,
+      "grad_norm": 0.9377698302268982,
+      "learning_rate": 0.003,
+      "loss": 3.9627,
+      "step": 17843
+    },
+    {
+      "epoch": 0.17844,
+      "grad_norm": 0.974871039390564,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 17844
+    },
+    {
+      "epoch": 0.17845,
+      "grad_norm": 0.8839754462242126,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 17845
+    },
+    {
+      "epoch": 0.17846,
+      "grad_norm": 0.772807240486145,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 17846
+    },
+    {
+      "epoch": 0.17847,
+      "grad_norm": 0.715358316898346,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 17847
+    },
+    {
+      "epoch": 0.17848,
+      "grad_norm": 0.8007568120956421,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 17848
+    },
+    {
+      "epoch": 0.17849,
+      "grad_norm": 0.9028872847557068,
+      "learning_rate": 0.003,
+      "loss": 4.0283,
+      "step": 17849
+    },
+    {
+      "epoch": 0.1785,
+      "grad_norm": 0.976070761680603,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 17850
+    },
+    {
+      "epoch": 0.17851,
+      "grad_norm": 1.1514437198638916,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 17851
+    },
+    {
+      "epoch": 0.17852,
+      "grad_norm": 0.7972047328948975,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 17852
+    },
+    {
+      "epoch": 0.17853,
+      "grad_norm": 0.6677350401878357,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 17853
+    },
+    {
+      "epoch": 0.17854,
+      "grad_norm": 0.8710585832595825,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 17854
+    },
+    {
+      "epoch": 0.17855,
+      "grad_norm": 1.0941059589385986,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 17855
+    },
+    {
+      "epoch": 0.17856,
+      "grad_norm": 0.9320690631866455,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 17856
+    },
+    {
+      "epoch": 0.17857,
+      "grad_norm": 1.0536068677902222,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 17857
+    },
+    {
+      "epoch": 0.17858,
+      "grad_norm": 0.9712250828742981,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 17858
+    },
+    {
+      "epoch": 0.17859,
+      "grad_norm": 0.8574034571647644,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 17859
+    },
+    {
+      "epoch": 0.1786,
+      "grad_norm": 0.8647499084472656,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 17860
+    },
+    {
+      "epoch": 0.17861,
+      "grad_norm": 0.7013272047042847,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 17861
+    },
+    {
+      "epoch": 0.17862,
+      "grad_norm": 0.746783971786499,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 17862
+    },
+    {
+      "epoch": 0.17863,
+      "grad_norm": 0.8463944792747498,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 17863
+    },
+    {
+      "epoch": 0.17864,
+      "grad_norm": 0.998805582523346,
+      "learning_rate": 0.003,
+      "loss": 4.0369,
+      "step": 17864
+    },
+    {
+      "epoch": 0.17865,
+      "grad_norm": 0.8989658951759338,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 17865
+    },
+    {
+      "epoch": 0.17866,
+      "grad_norm": 0.8039606213569641,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 17866
+    },
+    {
+      "epoch": 0.17867,
+      "grad_norm": 0.8249936699867249,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 17867
+    },
+    {
+      "epoch": 0.17868,
+      "grad_norm": 0.9631441235542297,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 17868
+    },
+    {
+      "epoch": 0.17869,
+      "grad_norm": 1.0489217042922974,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 17869
+    },
+    {
+      "epoch": 0.1787,
+      "grad_norm": 1.0523375272750854,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 17870
+    },
+    {
+      "epoch": 0.17871,
+      "grad_norm": 1.0117212533950806,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 17871
+    },
+    {
+      "epoch": 0.17872,
+      "grad_norm": 0.9836920499801636,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 17872
+    },
+    {
+      "epoch": 0.17873,
+      "grad_norm": 0.9232009649276733,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17873
+    },
+    {
+      "epoch": 0.17874,
+      "grad_norm": 0.9044313430786133,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 17874
+    },
+    {
+      "epoch": 0.17875,
+      "grad_norm": 0.9934422373771667,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 17875
+    },
+    {
+      "epoch": 0.17876,
+      "grad_norm": 0.9941718578338623,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 17876
+    },
+    {
+      "epoch": 0.17877,
+      "grad_norm": 0.8592278957366943,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 17877
+    },
+    {
+      "epoch": 0.17878,
+      "grad_norm": 0.774692714214325,
+      "learning_rate": 0.003,
+      "loss": 3.9781,
+      "step": 17878
+    },
+    {
+      "epoch": 0.17879,
+      "grad_norm": 0.8519881367683411,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 17879
+    },
+    {
+      "epoch": 0.1788,
+      "grad_norm": 0.853733479976654,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 17880
+    },
+    {
+      "epoch": 0.17881,
+      "grad_norm": 0.8926007151603699,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17881
+    },
+    {
+      "epoch": 0.17882,
+      "grad_norm": 1.009367823600769,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 17882
+    },
+    {
+      "epoch": 0.17883,
+      "grad_norm": 1.2331897020339966,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 17883
+    },
+    {
+      "epoch": 0.17884,
+      "grad_norm": 0.7124277949333191,
+      "learning_rate": 0.003,
+      "loss": 3.9792,
+      "step": 17884
+    },
+    {
+      "epoch": 0.17885,
+      "grad_norm": 0.6617794632911682,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 17885
+    },
+    {
+      "epoch": 0.17886,
+      "grad_norm": 0.6539735794067383,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 17886
+    },
+    {
+      "epoch": 0.17887,
+      "grad_norm": 0.658829391002655,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 17887
+    },
+    {
+      "epoch": 0.17888,
+      "grad_norm": 0.621078610420227,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 17888
+    },
+    {
+      "epoch": 0.17889,
+      "grad_norm": 0.6504934430122375,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 17889
+    },
+    {
+      "epoch": 0.1789,
+      "grad_norm": 0.7506583333015442,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 17890
+    },
+    {
+      "epoch": 0.17891,
+      "grad_norm": 0.850108802318573,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 17891
+    },
+    {
+      "epoch": 0.17892,
+      "grad_norm": 0.8623435497283936,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 17892
+    },
+    {
+      "epoch": 0.17893,
+      "grad_norm": 0.9529052376747131,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 17893
+    },
+    {
+      "epoch": 0.17894,
+      "grad_norm": 1.041895866394043,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 17894
+    },
+    {
+      "epoch": 0.17895,
+      "grad_norm": 0.9011391401290894,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 17895
+    },
+    {
+      "epoch": 0.17896,
+      "grad_norm": 0.7943750619888306,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 17896
+    },
+    {
+      "epoch": 0.17897,
+      "grad_norm": 0.7554261088371277,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 17897
+    },
+    {
+      "epoch": 0.17898,
+      "grad_norm": 0.7515415549278259,
+      "learning_rate": 0.003,
+      "loss": 3.9764,
+      "step": 17898
+    },
+    {
+      "epoch": 0.17899,
+      "grad_norm": 0.7488419413566589,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 17899
+    },
+    {
+      "epoch": 0.179,
+      "grad_norm": 0.6361393928527832,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 17900
+    },
+    {
+      "epoch": 0.17901,
+      "grad_norm": 0.6696352958679199,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 17901
+    },
+    {
+      "epoch": 0.17902,
+      "grad_norm": 0.6741111874580383,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 17902
+    },
+    {
+      "epoch": 0.17903,
+      "grad_norm": 0.7678008079528809,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 17903
+    },
+    {
+      "epoch": 0.17904,
+      "grad_norm": 0.9445978403091431,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 17904
+    },
+    {
+      "epoch": 0.17905,
+      "grad_norm": 1.1481324434280396,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 17905
+    },
+    {
+      "epoch": 0.17906,
+      "grad_norm": 0.8214532136917114,
+      "learning_rate": 0.003,
+      "loss": 3.9816,
+      "step": 17906
+    },
+    {
+      "epoch": 0.17907,
+      "grad_norm": 0.7733437418937683,
+      "learning_rate": 0.003,
+      "loss": 3.9644,
+      "step": 17907
+    },
+    {
+      "epoch": 0.17908,
+      "grad_norm": 0.6816405653953552,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 17908
+    },
+    {
+      "epoch": 0.17909,
+      "grad_norm": 0.6628265976905823,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 17909
+    },
+    {
+      "epoch": 0.1791,
+      "grad_norm": 0.69963538646698,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 17910
+    },
+    {
+      "epoch": 0.17911,
+      "grad_norm": 0.7206888794898987,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 17911
+    },
+    {
+      "epoch": 0.17912,
+      "grad_norm": 0.6892799139022827,
+      "learning_rate": 0.003,
+      "loss": 3.9816,
+      "step": 17912
+    },
+    {
+      "epoch": 0.17913,
+      "grad_norm": 0.6421831250190735,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 17913
+    },
+    {
+      "epoch": 0.17914,
+      "grad_norm": 0.793516993522644,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 17914
+    },
+    {
+      "epoch": 0.17915,
+      "grad_norm": 0.6671417355537415,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 17915
+    },
+    {
+      "epoch": 0.17916,
+      "grad_norm": 0.6116956472396851,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 17916
+    },
+    {
+      "epoch": 0.17917,
+      "grad_norm": 0.7654872536659241,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 17917
+    },
+    {
+      "epoch": 0.17918,
+      "grad_norm": 1.0274306535720825,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 17918
+    },
+    {
+      "epoch": 0.17919,
+      "grad_norm": 1.1414265632629395,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 17919
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.8894082903862,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 17920
+    },
+    {
+      "epoch": 0.17921,
+      "grad_norm": 0.8943668007850647,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 17921
+    },
+    {
+      "epoch": 0.17922,
+      "grad_norm": 0.8579666018486023,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 17922
+    },
+    {
+      "epoch": 0.17923,
+      "grad_norm": 0.9284310936927795,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 17923
+    },
+    {
+      "epoch": 0.17924,
+      "grad_norm": 1.0336041450500488,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 17924
+    },
+    {
+      "epoch": 0.17925,
+      "grad_norm": 1.0972650051116943,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 17925
+    },
+    {
+      "epoch": 0.17926,
+      "grad_norm": 0.9823690056800842,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17926
+    },
+    {
+      "epoch": 0.17927,
+      "grad_norm": 1.0724045038223267,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 17927
+    },
+    {
+      "epoch": 0.17928,
+      "grad_norm": 0.9181076288223267,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 17928
+    },
+    {
+      "epoch": 0.17929,
+      "grad_norm": 0.8142296075820923,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 17929
+    },
+    {
+      "epoch": 0.1793,
+      "grad_norm": 0.7954555749893188,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 17930
+    },
+    {
+      "epoch": 0.17931,
+      "grad_norm": 0.8270518779754639,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 17931
+    },
+    {
+      "epoch": 0.17932,
+      "grad_norm": 0.8402372002601624,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 17932
+    },
+    {
+      "epoch": 0.17933,
+      "grad_norm": 1.0668981075286865,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 17933
+    },
+    {
+      "epoch": 0.17934,
+      "grad_norm": 0.9972578287124634,
+      "learning_rate": 0.003,
+      "loss": 4.0421,
+      "step": 17934
+    },
+    {
+      "epoch": 0.17935,
+      "grad_norm": 0.9289863705635071,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 17935
+    },
+    {
+      "epoch": 0.17936,
+      "grad_norm": 0.8743131160736084,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 17936
+    },
+    {
+      "epoch": 0.17937,
+      "grad_norm": 0.848335325717926,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 17937
+    },
+    {
+      "epoch": 0.17938,
+      "grad_norm": 0.9841867089271545,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 17938
+    },
+    {
+      "epoch": 0.17939,
+      "grad_norm": 1.2921292781829834,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 17939
+    },
+    {
+      "epoch": 0.1794,
+      "grad_norm": 0.7182594537734985,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 17940
+    },
+    {
+      "epoch": 0.17941,
+      "grad_norm": 0.5401883721351624,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 17941
+    },
+    {
+      "epoch": 0.17942,
+      "grad_norm": 0.6777501106262207,
+      "learning_rate": 0.003,
+      "loss": 3.9779,
+      "step": 17942
+    },
+    {
+      "epoch": 0.17943,
+      "grad_norm": 0.7719506621360779,
+      "learning_rate": 0.003,
+      "loss": 3.9799,
+      "step": 17943
+    },
+    {
+      "epoch": 0.17944,
+      "grad_norm": 0.9649561643600464,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 17944
+    },
+    {
+      "epoch": 0.17945,
+      "grad_norm": 1.1592811346054077,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 17945
+    },
+    {
+      "epoch": 0.17946,
+      "grad_norm": 0.7516922950744629,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 17946
+    },
+    {
+      "epoch": 0.17947,
+      "grad_norm": 0.6371436715126038,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 17947
+    },
+    {
+      "epoch": 0.17948,
+      "grad_norm": 0.6841728091239929,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 17948
+    },
+    {
+      "epoch": 0.17949,
+      "grad_norm": 0.7224218249320984,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 17949
+    },
+    {
+      "epoch": 0.1795,
+      "grad_norm": 0.7025465965270996,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 17950
+    },
+    {
+      "epoch": 0.17951,
+      "grad_norm": 0.7663628458976746,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 17951
+    },
+    {
+      "epoch": 0.17952,
+      "grad_norm": 0.7727324366569519,
+      "learning_rate": 0.003,
+      "loss": 3.9693,
+      "step": 17952
+    },
+    {
+      "epoch": 0.17953,
+      "grad_norm": 0.750734269618988,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 17953
+    },
+    {
+      "epoch": 0.17954,
+      "grad_norm": 0.7506475448608398,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 17954
+    },
+    {
+      "epoch": 0.17955,
+      "grad_norm": 0.7635833024978638,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 17955
+    },
+    {
+      "epoch": 0.17956,
+      "grad_norm": 0.7833163738250732,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 17956
+    },
+    {
+      "epoch": 0.17957,
+      "grad_norm": 0.7768687605857849,
+      "learning_rate": 0.003,
+      "loss": 3.9738,
+      "step": 17957
+    },
+    {
+      "epoch": 0.17958,
+      "grad_norm": 0.8088453412055969,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 17958
+    },
+    {
+      "epoch": 0.17959,
+      "grad_norm": 0.8046279549598694,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 17959
+    },
+    {
+      "epoch": 0.1796,
+      "grad_norm": 0.886318564414978,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 17960
+    },
+    {
+      "epoch": 0.17961,
+      "grad_norm": 1.0145245790481567,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 17961
+    },
+    {
+      "epoch": 0.17962,
+      "grad_norm": 1.001228928565979,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 17962
+    },
+    {
+      "epoch": 0.17963,
+      "grad_norm": 0.8505359292030334,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 17963
+    },
+    {
+      "epoch": 0.17964,
+      "grad_norm": 0.8970061540603638,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 17964
+    },
+    {
+      "epoch": 0.17965,
+      "grad_norm": 0.9306725263595581,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 17965
+    },
+    {
+      "epoch": 0.17966,
+      "grad_norm": 1.0478090047836304,
+      "learning_rate": 0.003,
+      "loss": 3.9799,
+      "step": 17966
+    },
+    {
+      "epoch": 0.17967,
+      "grad_norm": 1.050741195678711,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 17967
+    },
+    {
+      "epoch": 0.17968,
+      "grad_norm": 0.9878668785095215,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 17968
+    },
+    {
+      "epoch": 0.17969,
+      "grad_norm": 1.103552222251892,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 17969
+    },
+    {
+      "epoch": 0.1797,
+      "grad_norm": 1.0772671699523926,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 17970
+    },
+    {
+      "epoch": 0.17971,
+      "grad_norm": 0.7761634588241577,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 17971
+    },
+    {
+      "epoch": 0.17972,
+      "grad_norm": 0.9355222582817078,
+      "learning_rate": 0.003,
+      "loss": 4.0451,
+      "step": 17972
+    },
+    {
+      "epoch": 0.17973,
+      "grad_norm": 1.0587787628173828,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 17973
+    },
+    {
+      "epoch": 0.17974,
+      "grad_norm": 1.0033667087554932,
+      "learning_rate": 0.003,
+      "loss": 3.9817,
+      "step": 17974
+    },
+    {
+      "epoch": 0.17975,
+      "grad_norm": 0.9196949601173401,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 17975
+    },
+    {
+      "epoch": 0.17976,
+      "grad_norm": 0.8534526824951172,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 17976
+    },
+    {
+      "epoch": 0.17977,
+      "grad_norm": 0.9210339188575745,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 17977
+    },
+    {
+      "epoch": 0.17978,
+      "grad_norm": 0.9203281402587891,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 17978
+    },
+    {
+      "epoch": 0.17979,
+      "grad_norm": 1.0668666362762451,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 17979
+    },
+    {
+      "epoch": 0.1798,
+      "grad_norm": 1.1201149225234985,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 17980
+    },
+    {
+      "epoch": 0.17981,
+      "grad_norm": 0.7923021912574768,
+      "learning_rate": 0.003,
+      "loss": 3.939,
+      "step": 17981
+    },
+    {
+      "epoch": 0.17982,
+      "grad_norm": 0.5935437679290771,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 17982
+    },
+    {
+      "epoch": 0.17983,
+      "grad_norm": 0.5846838355064392,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 17983
+    },
+    {
+      "epoch": 0.17984,
+      "grad_norm": 0.43343862891197205,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 17984
+    },
+    {
+      "epoch": 0.17985,
+      "grad_norm": 0.5557079315185547,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 17985
+    },
+    {
+      "epoch": 0.17986,
+      "grad_norm": 0.5190477967262268,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 17986
+    },
+    {
+      "epoch": 0.17987,
+      "grad_norm": 0.5365965962409973,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 17987
+    },
+    {
+      "epoch": 0.17988,
+      "grad_norm": 0.5814077258110046,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 17988
+    },
+    {
+      "epoch": 0.17989,
+      "grad_norm": 0.6688368320465088,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 17989
+    },
+    {
+      "epoch": 0.1799,
+      "grad_norm": 0.8527092933654785,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 17990
+    },
+    {
+      "epoch": 0.17991,
+      "grad_norm": 1.0361605882644653,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 17991
+    },
+    {
+      "epoch": 0.17992,
+      "grad_norm": 1.0612369775772095,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 17992
+    },
+    {
+      "epoch": 0.17993,
+      "grad_norm": 0.8605156540870667,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 17993
+    },
+    {
+      "epoch": 0.17994,
+      "grad_norm": 0.6299520134925842,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 17994
+    },
+    {
+      "epoch": 0.17995,
+      "grad_norm": 0.6558441519737244,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 17995
+    },
+    {
+      "epoch": 0.17996,
+      "grad_norm": 0.646214485168457,
+      "learning_rate": 0.003,
+      "loss": 3.9756,
+      "step": 17996
+    },
+    {
+      "epoch": 0.17997,
+      "grad_norm": 0.685188353061676,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 17997
+    },
+    {
+      "epoch": 0.17998,
+      "grad_norm": 0.7944631576538086,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 17998
+    },
+    {
+      "epoch": 0.17999,
+      "grad_norm": 0.9150182604789734,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 17999
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.0325008630752563,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 18000
+    },
+    {
+      "epoch": 0.18001,
+      "grad_norm": 0.9318733215332031,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 18001
+    },
+    {
+      "epoch": 0.18002,
+      "grad_norm": 0.8521453738212585,
+      "learning_rate": 0.003,
+      "loss": 3.9525,
+      "step": 18002
+    },
+    {
+      "epoch": 0.18003,
+      "grad_norm": 0.7938467860221863,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 18003
+    },
+    {
+      "epoch": 0.18004,
+      "grad_norm": 0.9001320004463196,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 18004
+    },
+    {
+      "epoch": 0.18005,
+      "grad_norm": 0.9790751338005066,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 18005
+    },
+    {
+      "epoch": 0.18006,
+      "grad_norm": 1.0301158428192139,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 18006
+    },
+    {
+      "epoch": 0.18007,
+      "grad_norm": 1.0088856220245361,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 18007
+    },
+    {
+      "epoch": 0.18008,
+      "grad_norm": 0.986687958240509,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 18008
+    },
+    {
+      "epoch": 0.18009,
+      "grad_norm": 0.918765127658844,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 18009
+    },
+    {
+      "epoch": 0.1801,
+      "grad_norm": 0.8073814511299133,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 18010
+    },
+    {
+      "epoch": 0.18011,
+      "grad_norm": 1.0299934148788452,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 18011
+    },
+    {
+      "epoch": 0.18012,
+      "grad_norm": 1.068831205368042,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 18012
+    },
+    {
+      "epoch": 0.18013,
+      "grad_norm": 1.015882968902588,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 18013
+    },
+    {
+      "epoch": 0.18014,
+      "grad_norm": 0.8612130880355835,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 18014
+    },
+    {
+      "epoch": 0.18015,
+      "grad_norm": 0.8178862929344177,
+      "learning_rate": 0.003,
+      "loss": 4.0571,
+      "step": 18015
+    },
+    {
+      "epoch": 0.18016,
+      "grad_norm": 0.9154928922653198,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 18016
+    },
+    {
+      "epoch": 0.18017,
+      "grad_norm": 1.0131648778915405,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 18017
+    },
+    {
+      "epoch": 0.18018,
+      "grad_norm": 0.968364417552948,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 18018
+    },
+    {
+      "epoch": 0.18019,
+      "grad_norm": 1.172088623046875,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 18019
+    },
+    {
+      "epoch": 0.1802,
+      "grad_norm": 0.922442615032196,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 18020
+    },
+    {
+      "epoch": 0.18021,
+      "grad_norm": 0.8483938574790955,
+      "learning_rate": 0.003,
+      "loss": 4.0381,
+      "step": 18021
+    },
+    {
+      "epoch": 0.18022,
+      "grad_norm": 0.8102627992630005,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 18022
+    },
+    {
+      "epoch": 0.18023,
+      "grad_norm": 0.5910297632217407,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 18023
+    },
+    {
+      "epoch": 0.18024,
+      "grad_norm": 0.6116005778312683,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 18024
+    },
+    {
+      "epoch": 0.18025,
+      "grad_norm": 0.5381456017494202,
+      "learning_rate": 0.003,
+      "loss": 3.9773,
+      "step": 18025
+    },
+    {
+      "epoch": 0.18026,
+      "grad_norm": 0.5271363854408264,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 18026
+    },
+    {
+      "epoch": 0.18027,
+      "grad_norm": 0.49178123474121094,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 18027
+    },
+    {
+      "epoch": 0.18028,
+      "grad_norm": 0.5047873258590698,
+      "learning_rate": 0.003,
+      "loss": 3.9724,
+      "step": 18028
+    },
+    {
+      "epoch": 0.18029,
+      "grad_norm": 0.5563161373138428,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 18029
+    },
+    {
+      "epoch": 0.1803,
+      "grad_norm": 0.6534048318862915,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 18030
+    },
+    {
+      "epoch": 0.18031,
+      "grad_norm": 0.7242395877838135,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 18031
+    },
+    {
+      "epoch": 0.18032,
+      "grad_norm": 0.7478570938110352,
+      "learning_rate": 0.003,
+      "loss": 4.0645,
+      "step": 18032
+    },
+    {
+      "epoch": 0.18033,
+      "grad_norm": 0.8335338830947876,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 18033
+    },
+    {
+      "epoch": 0.18034,
+      "grad_norm": 0.7908081412315369,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 18034
+    },
+    {
+      "epoch": 0.18035,
+      "grad_norm": 0.7753511071205139,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 18035
+    },
+    {
+      "epoch": 0.18036,
+      "grad_norm": 0.9649012088775635,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 18036
+    },
+    {
+      "epoch": 0.18037,
+      "grad_norm": 1.1775555610656738,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 18037
+    },
+    {
+      "epoch": 0.18038,
+      "grad_norm": 0.8960131406784058,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 18038
+    },
+    {
+      "epoch": 0.18039,
+      "grad_norm": 0.8114643692970276,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 18039
+    },
+    {
+      "epoch": 0.1804,
+      "grad_norm": 0.8493918776512146,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 18040
+    },
+    {
+      "epoch": 0.18041,
+      "grad_norm": 0.8604472279548645,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 18041
+    },
+    {
+      "epoch": 0.18042,
+      "grad_norm": 0.9029722213745117,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 18042
+    },
+    {
+      "epoch": 0.18043,
+      "grad_norm": 0.9265522360801697,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 18043
+    },
+    {
+      "epoch": 0.18044,
+      "grad_norm": 0.9179112911224365,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 18044
+    },
+    {
+      "epoch": 0.18045,
+      "grad_norm": 1.0061774253845215,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 18045
+    },
+    {
+      "epoch": 0.18046,
+      "grad_norm": 0.8425312638282776,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 18046
+    },
+    {
+      "epoch": 0.18047,
+      "grad_norm": 0.7756473422050476,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 18047
+    },
+    {
+      "epoch": 0.18048,
+      "grad_norm": 0.814687192440033,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 18048
+    },
+    {
+      "epoch": 0.18049,
+      "grad_norm": 0.8933665752410889,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 18049
+    },
+    {
+      "epoch": 0.1805,
+      "grad_norm": 0.9201660752296448,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 18050
+    },
+    {
+      "epoch": 0.18051,
+      "grad_norm": 0.955101728439331,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18051
+    },
+    {
+      "epoch": 0.18052,
+      "grad_norm": 0.8565345406532288,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 18052
+    },
+    {
+      "epoch": 0.18053,
+      "grad_norm": 0.8543877601623535,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 18053
+    },
+    {
+      "epoch": 0.18054,
+      "grad_norm": 0.8501734733581543,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 18054
+    },
+    {
+      "epoch": 0.18055,
+      "grad_norm": 0.8169307708740234,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 18055
+    },
+    {
+      "epoch": 0.18056,
+      "grad_norm": 0.7411114573478699,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 18056
+    },
+    {
+      "epoch": 0.18057,
+      "grad_norm": 0.7389774918556213,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 18057
+    },
+    {
+      "epoch": 0.18058,
+      "grad_norm": 0.8780634999275208,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 18058
+    },
+    {
+      "epoch": 0.18059,
+      "grad_norm": 0.9774113893508911,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 18059
+    },
+    {
+      "epoch": 0.1806,
+      "grad_norm": 1.1417102813720703,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 18060
+    },
+    {
+      "epoch": 0.18061,
+      "grad_norm": 0.9026036262512207,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 18061
+    },
+    {
+      "epoch": 0.18062,
+      "grad_norm": 1.0317175388336182,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 18062
+    },
+    {
+      "epoch": 0.18063,
+      "grad_norm": 0.9787594676017761,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 18063
+    },
+    {
+      "epoch": 0.18064,
+      "grad_norm": 0.8721433877944946,
+      "learning_rate": 0.003,
+      "loss": 3.9785,
+      "step": 18064
+    },
+    {
+      "epoch": 0.18065,
+      "grad_norm": 0.8833760023117065,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 18065
+    },
+    {
+      "epoch": 0.18066,
+      "grad_norm": 0.8670377731323242,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 18066
+    },
+    {
+      "epoch": 0.18067,
+      "grad_norm": 0.8080344200134277,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 18067
+    },
+    {
+      "epoch": 0.18068,
+      "grad_norm": 0.9190772175788879,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 18068
+    },
+    {
+      "epoch": 0.18069,
+      "grad_norm": 0.9776795506477356,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 18069
+    },
+    {
+      "epoch": 0.1807,
+      "grad_norm": 0.9771139621734619,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 18070
+    },
+    {
+      "epoch": 0.18071,
+      "grad_norm": 1.0068480968475342,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 18071
+    },
+    {
+      "epoch": 0.18072,
+      "grad_norm": 0.9160780310630798,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 18072
+    },
+    {
+      "epoch": 0.18073,
+      "grad_norm": 0.7548110485076904,
+      "learning_rate": 0.003,
+      "loss": 4.0412,
+      "step": 18073
+    },
+    {
+      "epoch": 0.18074,
+      "grad_norm": 0.6938346028327942,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 18074
+    },
+    {
+      "epoch": 0.18075,
+      "grad_norm": 0.6407344937324524,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 18075
+    },
+    {
+      "epoch": 0.18076,
+      "grad_norm": 0.5780854225158691,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 18076
+    },
+    {
+      "epoch": 0.18077,
+      "grad_norm": 0.6665111184120178,
+      "learning_rate": 0.003,
+      "loss": 4.0229,
+      "step": 18077
+    },
+    {
+      "epoch": 0.18078,
+      "grad_norm": 0.7108888030052185,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 18078
+    },
+    {
+      "epoch": 0.18079,
+      "grad_norm": 0.7258763313293457,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 18079
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.7347949147224426,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 18080
+    },
+    {
+      "epoch": 0.18081,
+      "grad_norm": 0.6717410087585449,
+      "learning_rate": 0.003,
+      "loss": 4.0314,
+      "step": 18081
+    },
+    {
+      "epoch": 0.18082,
+      "grad_norm": 0.5788361430168152,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 18082
+    },
+    {
+      "epoch": 0.18083,
+      "grad_norm": 0.5914314389228821,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 18083
+    },
+    {
+      "epoch": 0.18084,
+      "grad_norm": 0.6312091946601868,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 18084
+    },
+    {
+      "epoch": 0.18085,
+      "grad_norm": 0.6804765462875366,
+      "learning_rate": 0.003,
+      "loss": 3.9733,
+      "step": 18085
+    },
+    {
+      "epoch": 0.18086,
+      "grad_norm": 0.6745128035545349,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 18086
+    },
+    {
+      "epoch": 0.18087,
+      "grad_norm": 0.6256589293479919,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 18087
+    },
+    {
+      "epoch": 0.18088,
+      "grad_norm": 0.5829675793647766,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 18088
+    },
+    {
+      "epoch": 0.18089,
+      "grad_norm": 0.7264362573623657,
+      "learning_rate": 0.003,
+      "loss": 3.9741,
+      "step": 18089
+    },
+    {
+      "epoch": 0.1809,
+      "grad_norm": 1.0110256671905518,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 18090
+    },
+    {
+      "epoch": 0.18091,
+      "grad_norm": 1.305204153060913,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 18091
+    },
+    {
+      "epoch": 0.18092,
+      "grad_norm": 0.9151637554168701,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 18092
+    },
+    {
+      "epoch": 0.18093,
+      "grad_norm": 0.8778868913650513,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 18093
+    },
+    {
+      "epoch": 0.18094,
+      "grad_norm": 0.7514395713806152,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 18094
+    },
+    {
+      "epoch": 0.18095,
+      "grad_norm": 0.9953643679618835,
+      "learning_rate": 0.003,
+      "loss": 3.9746,
+      "step": 18095
+    },
+    {
+      "epoch": 0.18096,
+      "grad_norm": 1.349854826927185,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 18096
+    },
+    {
+      "epoch": 0.18097,
+      "grad_norm": 0.6634512543678284,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 18097
+    },
+    {
+      "epoch": 0.18098,
+      "grad_norm": 0.746289849281311,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 18098
+    },
+    {
+      "epoch": 0.18099,
+      "grad_norm": 0.7250983715057373,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 18099
+    },
+    {
+      "epoch": 0.181,
+      "grad_norm": 0.723181962966919,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 18100
+    },
+    {
+      "epoch": 0.18101,
+      "grad_norm": 0.7149516940116882,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 18101
+    },
+    {
+      "epoch": 0.18102,
+      "grad_norm": 0.8280004262924194,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 18102
+    },
+    {
+      "epoch": 0.18103,
+      "grad_norm": 0.8888851404190063,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 18103
+    },
+    {
+      "epoch": 0.18104,
+      "grad_norm": 0.9927442669868469,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 18104
+    },
+    {
+      "epoch": 0.18105,
+      "grad_norm": 1.3739959001541138,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 18105
+    },
+    {
+      "epoch": 0.18106,
+      "grad_norm": 0.5654383897781372,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 18106
+    },
+    {
+      "epoch": 0.18107,
+      "grad_norm": 0.6091672778129578,
+      "learning_rate": 0.003,
+      "loss": 3.9785,
+      "step": 18107
+    },
+    {
+      "epoch": 0.18108,
+      "grad_norm": 0.6785678267478943,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 18108
+    },
+    {
+      "epoch": 0.18109,
+      "grad_norm": 0.7807140946388245,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 18109
+    },
+    {
+      "epoch": 0.1811,
+      "grad_norm": 0.95677250623703,
+      "learning_rate": 0.003,
+      "loss": 4.0354,
+      "step": 18110
+    },
+    {
+      "epoch": 0.18111,
+      "grad_norm": 1.0871576070785522,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 18111
+    },
+    {
+      "epoch": 0.18112,
+      "grad_norm": 0.8533523082733154,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 18112
+    },
+    {
+      "epoch": 0.18113,
+      "grad_norm": 0.7580128908157349,
+      "learning_rate": 0.003,
+      "loss": 4.0372,
+      "step": 18113
+    },
+    {
+      "epoch": 0.18114,
+      "grad_norm": 0.8775519132614136,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 18114
+    },
+    {
+      "epoch": 0.18115,
+      "grad_norm": 0.9075474739074707,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 18115
+    },
+    {
+      "epoch": 0.18116,
+      "grad_norm": 0.8368702530860901,
+      "learning_rate": 0.003,
+      "loss": 4.0423,
+      "step": 18116
+    },
+    {
+      "epoch": 0.18117,
+      "grad_norm": 0.9043243527412415,
+      "learning_rate": 0.003,
+      "loss": 4.0339,
+      "step": 18117
+    },
+    {
+      "epoch": 0.18118,
+      "grad_norm": 0.9974503517150879,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 18118
+    },
+    {
+      "epoch": 0.18119,
+      "grad_norm": 0.9757204055786133,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 18119
+    },
+    {
+      "epoch": 0.1812,
+      "grad_norm": 1.045078158378601,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 18120
+    },
+    {
+      "epoch": 0.18121,
+      "grad_norm": 1.0194590091705322,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 18121
+    },
+    {
+      "epoch": 0.18122,
+      "grad_norm": 0.9902092218399048,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 18122
+    },
+    {
+      "epoch": 0.18123,
+      "grad_norm": 0.964513897895813,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 18123
+    },
+    {
+      "epoch": 0.18124,
+      "grad_norm": 0.9905357360839844,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 18124
+    },
+    {
+      "epoch": 0.18125,
+      "grad_norm": 1.0209734439849854,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 18125
+    },
+    {
+      "epoch": 0.18126,
+      "grad_norm": 0.9319140315055847,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 18126
+    },
+    {
+      "epoch": 0.18127,
+      "grad_norm": 0.9061564207077026,
+      "learning_rate": 0.003,
+      "loss": 4.0445,
+      "step": 18127
+    },
+    {
+      "epoch": 0.18128,
+      "grad_norm": 0.8320205807685852,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 18128
+    },
+    {
+      "epoch": 0.18129,
+      "grad_norm": 0.77691650390625,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 18129
+    },
+    {
+      "epoch": 0.1813,
+      "grad_norm": 0.6979191303253174,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 18130
+    },
+    {
+      "epoch": 0.18131,
+      "grad_norm": 0.8580296635627747,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 18131
+    },
+    {
+      "epoch": 0.18132,
+      "grad_norm": 1.3209208250045776,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 18132
+    },
+    {
+      "epoch": 0.18133,
+      "grad_norm": 0.8732667565345764,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 18133
+    },
+    {
+      "epoch": 0.18134,
+      "grad_norm": 0.7871861457824707,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 18134
+    },
+    {
+      "epoch": 0.18135,
+      "grad_norm": 0.7526431083679199,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 18135
+    },
+    {
+      "epoch": 0.18136,
+      "grad_norm": 0.604056179523468,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 18136
+    },
+    {
+      "epoch": 0.18137,
+      "grad_norm": 0.5123692154884338,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 18137
+    },
+    {
+      "epoch": 0.18138,
+      "grad_norm": 0.5805838108062744,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 18138
+    },
+    {
+      "epoch": 0.18139,
+      "grad_norm": 0.7583757042884827,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 18139
+    },
+    {
+      "epoch": 0.1814,
+      "grad_norm": 1.0473604202270508,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 18140
+    },
+    {
+      "epoch": 0.18141,
+      "grad_norm": 1.088240385055542,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 18141
+    },
+    {
+      "epoch": 0.18142,
+      "grad_norm": 0.8162732720375061,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 18142
+    },
+    {
+      "epoch": 0.18143,
+      "grad_norm": 0.6132739782333374,
+      "learning_rate": 0.003,
+      "loss": 3.9814,
+      "step": 18143
+    },
+    {
+      "epoch": 0.18144,
+      "grad_norm": 0.5057741403579712,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 18144
+    },
+    {
+      "epoch": 0.18145,
+      "grad_norm": 0.6503153443336487,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 18145
+    },
+    {
+      "epoch": 0.18146,
+      "grad_norm": 0.7001628279685974,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 18146
+    },
+    {
+      "epoch": 0.18147,
+      "grad_norm": 0.6886793375015259,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 18147
+    },
+    {
+      "epoch": 0.18148,
+      "grad_norm": 0.6724410057067871,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 18148
+    },
+    {
+      "epoch": 0.18149,
+      "grad_norm": 0.7682032585144043,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 18149
+    },
+    {
+      "epoch": 0.1815,
+      "grad_norm": 0.9040877819061279,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 18150
+    },
+    {
+      "epoch": 0.18151,
+      "grad_norm": 1.0435184240341187,
+      "learning_rate": 0.003,
+      "loss": 4.0157,
+      "step": 18151
+    },
+    {
+      "epoch": 0.18152,
+      "grad_norm": 0.9554518461227417,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 18152
+    },
+    {
+      "epoch": 0.18153,
+      "grad_norm": 0.8426952362060547,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 18153
+    },
+    {
+      "epoch": 0.18154,
+      "grad_norm": 0.7699375152587891,
+      "learning_rate": 0.003,
+      "loss": 3.9748,
+      "step": 18154
+    },
+    {
+      "epoch": 0.18155,
+      "grad_norm": 0.7254806756973267,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 18155
+    },
+    {
+      "epoch": 0.18156,
+      "grad_norm": 0.6597497463226318,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 18156
+    },
+    {
+      "epoch": 0.18157,
+      "grad_norm": 0.8115432262420654,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 18157
+    },
+    {
+      "epoch": 0.18158,
+      "grad_norm": 0.9796165823936462,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 18158
+    },
+    {
+      "epoch": 0.18159,
+      "grad_norm": 0.8970003724098206,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 18159
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.902718722820282,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 18160
+    },
+    {
+      "epoch": 0.18161,
+      "grad_norm": 0.9734975695610046,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 18161
+    },
+    {
+      "epoch": 0.18162,
+      "grad_norm": 0.8031430840492249,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 18162
+    },
+    {
+      "epoch": 0.18163,
+      "grad_norm": 0.7273565530776978,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 18163
+    },
+    {
+      "epoch": 0.18164,
+      "grad_norm": 0.6953762769699097,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 18164
+    },
+    {
+      "epoch": 0.18165,
+      "grad_norm": 0.7612891793251038,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 18165
+    },
+    {
+      "epoch": 0.18166,
+      "grad_norm": 0.8077008724212646,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 18166
+    },
+    {
+      "epoch": 0.18167,
+      "grad_norm": 0.9646387696266174,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 18167
+    },
+    {
+      "epoch": 0.18168,
+      "grad_norm": 1.1055657863616943,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 18168
+    },
+    {
+      "epoch": 0.18169,
+      "grad_norm": 0.9168370366096497,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 18169
+    },
+    {
+      "epoch": 0.1817,
+      "grad_norm": 0.914101243019104,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 18170
+    },
+    {
+      "epoch": 0.18171,
+      "grad_norm": 1.0902481079101562,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 18171
+    },
+    {
+      "epoch": 0.18172,
+      "grad_norm": 0.7929781675338745,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 18172
+    },
+    {
+      "epoch": 0.18173,
+      "grad_norm": 0.6111283898353577,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 18173
+    },
+    {
+      "epoch": 0.18174,
+      "grad_norm": 0.5835784077644348,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 18174
+    },
+    {
+      "epoch": 0.18175,
+      "grad_norm": 0.823890209197998,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 18175
+    },
+    {
+      "epoch": 0.18176,
+      "grad_norm": 0.9427134990692139,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 18176
+    },
+    {
+      "epoch": 0.18177,
+      "grad_norm": 0.7927230596542358,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 18177
+    },
+    {
+      "epoch": 0.18178,
+      "grad_norm": 0.8689695000648499,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 18178
+    },
+    {
+      "epoch": 0.18179,
+      "grad_norm": 1.0723934173583984,
+      "learning_rate": 0.003,
+      "loss": 3.9729,
+      "step": 18179
+    },
+    {
+      "epoch": 0.1818,
+      "grad_norm": 1.0629585981369019,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 18180
+    },
+    {
+      "epoch": 0.18181,
+      "grad_norm": 0.8775352239608765,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 18181
+    },
+    {
+      "epoch": 0.18182,
+      "grad_norm": 0.9303977489471436,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 18182
+    },
+    {
+      "epoch": 0.18183,
+      "grad_norm": 1.0644172430038452,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 18183
+    },
+    {
+      "epoch": 0.18184,
+      "grad_norm": 1.0199772119522095,
+      "learning_rate": 0.003,
+      "loss": 4.0277,
+      "step": 18184
+    },
+    {
+      "epoch": 0.18185,
+      "grad_norm": 0.963274359703064,
+      "learning_rate": 0.003,
+      "loss": 4.0308,
+      "step": 18185
+    },
+    {
+      "epoch": 0.18186,
+      "grad_norm": 0.9673733115196228,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 18186
+    },
+    {
+      "epoch": 0.18187,
+      "grad_norm": 1.017067313194275,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 18187
+    },
+    {
+      "epoch": 0.18188,
+      "grad_norm": 0.920941174030304,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 18188
+    },
+    {
+      "epoch": 0.18189,
+      "grad_norm": 0.8725190162658691,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 18189
+    },
+    {
+      "epoch": 0.1819,
+      "grad_norm": 0.944081723690033,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 18190
+    },
+    {
+      "epoch": 0.18191,
+      "grad_norm": 0.9849206805229187,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 18191
+    },
+    {
+      "epoch": 0.18192,
+      "grad_norm": 1.0786023139953613,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 18192
+    },
+    {
+      "epoch": 0.18193,
+      "grad_norm": 1.1245348453521729,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 18193
+    },
+    {
+      "epoch": 0.18194,
+      "grad_norm": 0.9885661005973816,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 18194
+    },
+    {
+      "epoch": 0.18195,
+      "grad_norm": 1.0349547863006592,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 18195
+    },
+    {
+      "epoch": 0.18196,
+      "grad_norm": 0.8013774156570435,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 18196
+    },
+    {
+      "epoch": 0.18197,
+      "grad_norm": 0.6548697352409363,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 18197
+    },
+    {
+      "epoch": 0.18198,
+      "grad_norm": 0.6461350321769714,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 18198
+    },
+    {
+      "epoch": 0.18199,
+      "grad_norm": 0.6670629978179932,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18199
+    },
+    {
+      "epoch": 0.182,
+      "grad_norm": 0.6904429793357849,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 18200
+    },
+    {
+      "epoch": 0.18201,
+      "grad_norm": 0.7718185186386108,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 18201
+    },
+    {
+      "epoch": 0.18202,
+      "grad_norm": 0.8350328207015991,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 18202
+    },
+    {
+      "epoch": 0.18203,
+      "grad_norm": 0.9042280316352844,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 18203
+    },
+    {
+      "epoch": 0.18204,
+      "grad_norm": 0.9488075971603394,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 18204
+    },
+    {
+      "epoch": 0.18205,
+      "grad_norm": 0.9429847002029419,
+      "learning_rate": 0.003,
+      "loss": 3.976,
+      "step": 18205
+    },
+    {
+      "epoch": 0.18206,
+      "grad_norm": 0.782609224319458,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 18206
+    },
+    {
+      "epoch": 0.18207,
+      "grad_norm": 0.6234448552131653,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 18207
+    },
+    {
+      "epoch": 0.18208,
+      "grad_norm": 0.547356128692627,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 18208
+    },
+    {
+      "epoch": 0.18209,
+      "grad_norm": 0.6364834308624268,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 18209
+    },
+    {
+      "epoch": 0.1821,
+      "grad_norm": 0.6573218703269958,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 18210
+    },
+    {
+      "epoch": 0.18211,
+      "grad_norm": 0.726592481136322,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 18211
+    },
+    {
+      "epoch": 0.18212,
+      "grad_norm": 0.7832172513008118,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 18212
+    },
+    {
+      "epoch": 0.18213,
+      "grad_norm": 0.8910562992095947,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 18213
+    },
+    {
+      "epoch": 0.18214,
+      "grad_norm": 0.8994468450546265,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 18214
+    },
+    {
+      "epoch": 0.18215,
+      "grad_norm": 0.7811818718910217,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 18215
+    },
+    {
+      "epoch": 0.18216,
+      "grad_norm": 0.849797248840332,
+      "learning_rate": 0.003,
+      "loss": 3.9724,
+      "step": 18216
+    },
+    {
+      "epoch": 0.18217,
+      "grad_norm": 0.9664706587791443,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 18217
+    },
+    {
+      "epoch": 0.18218,
+      "grad_norm": 1.0398706197738647,
+      "learning_rate": 0.003,
+      "loss": 3.9712,
+      "step": 18218
+    },
+    {
+      "epoch": 0.18219,
+      "grad_norm": 0.838563084602356,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 18219
+    },
+    {
+      "epoch": 0.1822,
+      "grad_norm": 0.7078306078910828,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 18220
+    },
+    {
+      "epoch": 0.18221,
+      "grad_norm": 0.7152316570281982,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 18221
+    },
+    {
+      "epoch": 0.18222,
+      "grad_norm": 0.7453733086585999,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 18222
+    },
+    {
+      "epoch": 0.18223,
+      "grad_norm": 0.7840257287025452,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 18223
+    },
+    {
+      "epoch": 0.18224,
+      "grad_norm": 0.8685861825942993,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 18224
+    },
+    {
+      "epoch": 0.18225,
+      "grad_norm": 1.0737313032150269,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 18225
+    },
+    {
+      "epoch": 0.18226,
+      "grad_norm": 1.0292390584945679,
+      "learning_rate": 0.003,
+      "loss": 4.023,
+      "step": 18226
+    },
+    {
+      "epoch": 0.18227,
+      "grad_norm": 0.7910824418067932,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 18227
+    },
+    {
+      "epoch": 0.18228,
+      "grad_norm": 0.6173461675643921,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 18228
+    },
+    {
+      "epoch": 0.18229,
+      "grad_norm": 0.6397079229354858,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 18229
+    },
+    {
+      "epoch": 0.1823,
+      "grad_norm": 0.6707412600517273,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 18230
+    },
+    {
+      "epoch": 0.18231,
+      "grad_norm": 0.7674610018730164,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 18231
+    },
+    {
+      "epoch": 0.18232,
+      "grad_norm": 0.851983904838562,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 18232
+    },
+    {
+      "epoch": 0.18233,
+      "grad_norm": 1.0599827766418457,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 18233
+    },
+    {
+      "epoch": 0.18234,
+      "grad_norm": 1.0854276418685913,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 18234
+    },
+    {
+      "epoch": 0.18235,
+      "grad_norm": 0.8180278539657593,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 18235
+    },
+    {
+      "epoch": 0.18236,
+      "grad_norm": 0.7597306966781616,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 18236
+    },
+    {
+      "epoch": 0.18237,
+      "grad_norm": 0.7044374346733093,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 18237
+    },
+    {
+      "epoch": 0.18238,
+      "grad_norm": 0.8133132457733154,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 18238
+    },
+    {
+      "epoch": 0.18239,
+      "grad_norm": 1.0175007581710815,
+      "learning_rate": 0.003,
+      "loss": 3.9797,
+      "step": 18239
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 1.062023401260376,
+      "learning_rate": 0.003,
+      "loss": 3.972,
+      "step": 18240
+    },
+    {
+      "epoch": 0.18241,
+      "grad_norm": 0.6800371408462524,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 18241
+    },
+    {
+      "epoch": 0.18242,
+      "grad_norm": 0.635016679763794,
+      "learning_rate": 0.003,
+      "loss": 3.9583,
+      "step": 18242
+    },
+    {
+      "epoch": 0.18243,
+      "grad_norm": 0.7265605330467224,
+      "learning_rate": 0.003,
+      "loss": 3.9683,
+      "step": 18243
+    },
+    {
+      "epoch": 0.18244,
+      "grad_norm": 0.7801223397254944,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 18244
+    },
+    {
+      "epoch": 0.18245,
+      "grad_norm": 0.781179666519165,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 18245
+    },
+    {
+      "epoch": 0.18246,
+      "grad_norm": 0.7372574210166931,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 18246
+    },
+    {
+      "epoch": 0.18247,
+      "grad_norm": 0.7139351963996887,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 18247
+    },
+    {
+      "epoch": 0.18248,
+      "grad_norm": 0.7166587114334106,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 18248
+    },
+    {
+      "epoch": 0.18249,
+      "grad_norm": 0.8879257440567017,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 18249
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 1.0072455406188965,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 18250
+    },
+    {
+      "epoch": 0.18251,
+      "grad_norm": 1.1003729104995728,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 18251
+    },
+    {
+      "epoch": 0.18252,
+      "grad_norm": 0.8618923425674438,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 18252
+    },
+    {
+      "epoch": 0.18253,
+      "grad_norm": 0.8108559846878052,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 18253
+    },
+    {
+      "epoch": 0.18254,
+      "grad_norm": 0.7113040685653687,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 18254
+    },
+    {
+      "epoch": 0.18255,
+      "grad_norm": 0.8643912672996521,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18255
+    },
+    {
+      "epoch": 0.18256,
+      "grad_norm": 1.0000271797180176,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 18256
+    },
+    {
+      "epoch": 0.18257,
+      "grad_norm": 1.3140206336975098,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 18257
+    },
+    {
+      "epoch": 0.18258,
+      "grad_norm": 0.747545599937439,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 18258
+    },
+    {
+      "epoch": 0.18259,
+      "grad_norm": 0.7669658064842224,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 18259
+    },
+    {
+      "epoch": 0.1826,
+      "grad_norm": 0.7207746505737305,
+      "learning_rate": 0.003,
+      "loss": 3.9742,
+      "step": 18260
+    },
+    {
+      "epoch": 0.18261,
+      "grad_norm": 0.7719569802284241,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 18261
+    },
+    {
+      "epoch": 0.18262,
+      "grad_norm": 0.7602952122688293,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 18262
+    },
+    {
+      "epoch": 0.18263,
+      "grad_norm": 0.7253813743591309,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 18263
+    },
+    {
+      "epoch": 0.18264,
+      "grad_norm": 0.6444506645202637,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 18264
+    },
+    {
+      "epoch": 0.18265,
+      "grad_norm": 0.6988993287086487,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 18265
+    },
+    {
+      "epoch": 0.18266,
+      "grad_norm": 0.7080886960029602,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 18266
+    },
+    {
+      "epoch": 0.18267,
+      "grad_norm": 0.7752986550331116,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 18267
+    },
+    {
+      "epoch": 0.18268,
+      "grad_norm": 0.9107593297958374,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 18268
+    },
+    {
+      "epoch": 0.18269,
+      "grad_norm": 1.1095401048660278,
+      "learning_rate": 0.003,
+      "loss": 4.0135,
+      "step": 18269
+    },
+    {
+      "epoch": 0.1827,
+      "grad_norm": 0.9966729283332825,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 18270
+    },
+    {
+      "epoch": 0.18271,
+      "grad_norm": 0.93555748462677,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 18271
+    },
+    {
+      "epoch": 0.18272,
+      "grad_norm": 0.8204525709152222,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 18272
+    },
+    {
+      "epoch": 0.18273,
+      "grad_norm": 0.863254725933075,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 18273
+    },
+    {
+      "epoch": 0.18274,
+      "grad_norm": 0.7821072936058044,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 18274
+    },
+    {
+      "epoch": 0.18275,
+      "grad_norm": 0.8419560194015503,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 18275
+    },
+    {
+      "epoch": 0.18276,
+      "grad_norm": 0.8162816762924194,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18276
+    },
+    {
+      "epoch": 0.18277,
+      "grad_norm": 0.8232567310333252,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 18277
+    },
+    {
+      "epoch": 0.18278,
+      "grad_norm": 0.8190836310386658,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 18278
+    },
+    {
+      "epoch": 0.18279,
+      "grad_norm": 0.7407211065292358,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 18279
+    },
+    {
+      "epoch": 0.1828,
+      "grad_norm": 0.7084978222846985,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 18280
+    },
+    {
+      "epoch": 0.18281,
+      "grad_norm": 0.8958796858787537,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 18281
+    },
+    {
+      "epoch": 0.18282,
+      "grad_norm": 1.1421873569488525,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 18282
+    },
+    {
+      "epoch": 0.18283,
+      "grad_norm": 1.0158741474151611,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 18283
+    },
+    {
+      "epoch": 0.18284,
+      "grad_norm": 1.1559165716171265,
+      "learning_rate": 0.003,
+      "loss": 4.042,
+      "step": 18284
+    },
+    {
+      "epoch": 0.18285,
+      "grad_norm": 0.8005707859992981,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 18285
+    },
+    {
+      "epoch": 0.18286,
+      "grad_norm": 0.8458976149559021,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18286
+    },
+    {
+      "epoch": 0.18287,
+      "grad_norm": 0.8141799569129944,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 18287
+    },
+    {
+      "epoch": 0.18288,
+      "grad_norm": 0.8615123629570007,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 18288
+    },
+    {
+      "epoch": 0.18289,
+      "grad_norm": 0.8300474882125854,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 18289
+    },
+    {
+      "epoch": 0.1829,
+      "grad_norm": 0.8556902408599854,
+      "learning_rate": 0.003,
+      "loss": 3.9773,
+      "step": 18290
+    },
+    {
+      "epoch": 0.18291,
+      "grad_norm": 0.8768019676208496,
+      "learning_rate": 0.003,
+      "loss": 3.9818,
+      "step": 18291
+    },
+    {
+      "epoch": 0.18292,
+      "grad_norm": 0.8756127953529358,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 18292
+    },
+    {
+      "epoch": 0.18293,
+      "grad_norm": 0.8733711838722229,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 18293
+    },
+    {
+      "epoch": 0.18294,
+      "grad_norm": 0.8341532349586487,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 18294
+    },
+    {
+      "epoch": 0.18295,
+      "grad_norm": 0.8398283123970032,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 18295
+    },
+    {
+      "epoch": 0.18296,
+      "grad_norm": 0.8624936938285828,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 18296
+    },
+    {
+      "epoch": 0.18297,
+      "grad_norm": 0.6987881064414978,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 18297
+    },
+    {
+      "epoch": 0.18298,
+      "grad_norm": 0.628121018409729,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 18298
+    },
+    {
+      "epoch": 0.18299,
+      "grad_norm": 0.7382155060768127,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 18299
+    },
+    {
+      "epoch": 0.183,
+      "grad_norm": 0.7305464148521423,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 18300
+    },
+    {
+      "epoch": 0.18301,
+      "grad_norm": 0.8464676737785339,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 18301
+    },
+    {
+      "epoch": 0.18302,
+      "grad_norm": 0.9060691595077515,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 18302
+    },
+    {
+      "epoch": 0.18303,
+      "grad_norm": 1.0593427419662476,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 18303
+    },
+    {
+      "epoch": 0.18304,
+      "grad_norm": 1.2884917259216309,
+      "learning_rate": 0.003,
+      "loss": 4.0374,
+      "step": 18304
+    },
+    {
+      "epoch": 0.18305,
+      "grad_norm": 0.8706284165382385,
+      "learning_rate": 0.003,
+      "loss": 3.9918,
+      "step": 18305
+    },
+    {
+      "epoch": 0.18306,
+      "grad_norm": 0.8215622305870056,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 18306
+    },
+    {
+      "epoch": 0.18307,
+      "grad_norm": 0.8193975687026978,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 18307
+    },
+    {
+      "epoch": 0.18308,
+      "grad_norm": 0.7148175835609436,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 18308
+    },
+    {
+      "epoch": 0.18309,
+      "grad_norm": 0.7559876441955566,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 18309
+    },
+    {
+      "epoch": 0.1831,
+      "grad_norm": 0.8626182079315186,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 18310
+    },
+    {
+      "epoch": 0.18311,
+      "grad_norm": 0.8986365795135498,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 18311
+    },
+    {
+      "epoch": 0.18312,
+      "grad_norm": 1.1372230052947998,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 18312
+    },
+    {
+      "epoch": 0.18313,
+      "grad_norm": 0.8173007965087891,
+      "learning_rate": 0.003,
+      "loss": 3.9305,
+      "step": 18313
+    },
+    {
+      "epoch": 0.18314,
+      "grad_norm": 0.7488594651222229,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 18314
+    },
+    {
+      "epoch": 0.18315,
+      "grad_norm": 0.7553419470787048,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 18315
+    },
+    {
+      "epoch": 0.18316,
+      "grad_norm": 0.5957514047622681,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 18316
+    },
+    {
+      "epoch": 0.18317,
+      "grad_norm": 0.5239382982254028,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18317
+    },
+    {
+      "epoch": 0.18318,
+      "grad_norm": 0.5718421339988708,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 18318
+    },
+    {
+      "epoch": 0.18319,
+      "grad_norm": 0.6599254608154297,
+      "learning_rate": 0.003,
+      "loss": 3.9512,
+      "step": 18319
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.6726394891738892,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 18320
+    },
+    {
+      "epoch": 0.18321,
+      "grad_norm": 0.7308098077774048,
+      "learning_rate": 0.003,
+      "loss": 4.0549,
+      "step": 18321
+    },
+    {
+      "epoch": 0.18322,
+      "grad_norm": 0.7821747064590454,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 18322
+    },
+    {
+      "epoch": 0.18323,
+      "grad_norm": 0.7732888460159302,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 18323
+    },
+    {
+      "epoch": 0.18324,
+      "grad_norm": 0.6457021832466125,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 18324
+    },
+    {
+      "epoch": 0.18325,
+      "grad_norm": 0.6706361770629883,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 18325
+    },
+    {
+      "epoch": 0.18326,
+      "grad_norm": 0.7337856888771057,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 18326
+    },
+    {
+      "epoch": 0.18327,
+      "grad_norm": 0.7500945329666138,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 18327
+    },
+    {
+      "epoch": 0.18328,
+      "grad_norm": 0.9249593019485474,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 18328
+    },
+    {
+      "epoch": 0.18329,
+      "grad_norm": 1.1571935415267944,
+      "learning_rate": 0.003,
+      "loss": 3.977,
+      "step": 18329
+    },
+    {
+      "epoch": 0.1833,
+      "grad_norm": 0.886870265007019,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 18330
+    },
+    {
+      "epoch": 0.18331,
+      "grad_norm": 0.7807754874229431,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 18331
+    },
+    {
+      "epoch": 0.18332,
+      "grad_norm": 0.7621500492095947,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 18332
+    },
+    {
+      "epoch": 0.18333,
+      "grad_norm": 0.8085874915122986,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 18333
+    },
+    {
+      "epoch": 0.18334,
+      "grad_norm": 0.7710543870925903,
+      "learning_rate": 0.003,
+      "loss": 3.9649,
+      "step": 18334
+    },
+    {
+      "epoch": 0.18335,
+      "grad_norm": 0.7949119210243225,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 18335
+    },
+    {
+      "epoch": 0.18336,
+      "grad_norm": 1.0128542184829712,
+      "learning_rate": 0.003,
+      "loss": 3.9772,
+      "step": 18336
+    },
+    {
+      "epoch": 0.18337,
+      "grad_norm": 1.1886122226715088,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 18337
+    },
+    {
+      "epoch": 0.18338,
+      "grad_norm": 0.817167341709137,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 18338
+    },
+    {
+      "epoch": 0.18339,
+      "grad_norm": 0.7243970036506653,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18339
+    },
+    {
+      "epoch": 0.1834,
+      "grad_norm": 0.6925933957099915,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 18340
+    },
+    {
+      "epoch": 0.18341,
+      "grad_norm": 0.6670451760292053,
+      "learning_rate": 0.003,
+      "loss": 3.9762,
+      "step": 18341
+    },
+    {
+      "epoch": 0.18342,
+      "grad_norm": 0.7994633913040161,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 18342
+    },
+    {
+      "epoch": 0.18343,
+      "grad_norm": 0.9264425039291382,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 18343
+    },
+    {
+      "epoch": 0.18344,
+      "grad_norm": 1.0609327554702759,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 18344
+    },
+    {
+      "epoch": 0.18345,
+      "grad_norm": 1.0058785676956177,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 18345
+    },
+    {
+      "epoch": 0.18346,
+      "grad_norm": 1.009697675704956,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 18346
+    },
+    {
+      "epoch": 0.18347,
+      "grad_norm": 0.9138298034667969,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 18347
+    },
+    {
+      "epoch": 0.18348,
+      "grad_norm": 0.8665712475776672,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 18348
+    },
+    {
+      "epoch": 0.18349,
+      "grad_norm": 0.9085323810577393,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 18349
+    },
+    {
+      "epoch": 0.1835,
+      "grad_norm": 0.8555454611778259,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 18350
+    },
+    {
+      "epoch": 0.18351,
+      "grad_norm": 0.8092401027679443,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 18351
+    },
+    {
+      "epoch": 0.18352,
+      "grad_norm": 0.7945643663406372,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 18352
+    },
+    {
+      "epoch": 0.18353,
+      "grad_norm": 0.9307811856269836,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 18353
+    },
+    {
+      "epoch": 0.18354,
+      "grad_norm": 0.9853703379631042,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 18354
+    },
+    {
+      "epoch": 0.18355,
+      "grad_norm": 1.050830364227295,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 18355
+    },
+    {
+      "epoch": 0.18356,
+      "grad_norm": 0.8668641448020935,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 18356
+    },
+    {
+      "epoch": 0.18357,
+      "grad_norm": 0.7032343149185181,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 18357
+    },
+    {
+      "epoch": 0.18358,
+      "grad_norm": 0.6883024573326111,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 18358
+    },
+    {
+      "epoch": 0.18359,
+      "grad_norm": 0.7748553156852722,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 18359
+    },
+    {
+      "epoch": 0.1836,
+      "grad_norm": 0.9895306825637817,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 18360
+    },
+    {
+      "epoch": 0.18361,
+      "grad_norm": 1.1863415241241455,
+      "learning_rate": 0.003,
+      "loss": 3.9825,
+      "step": 18361
+    },
+    {
+      "epoch": 0.18362,
+      "grad_norm": 0.914741039276123,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 18362
+    },
+    {
+      "epoch": 0.18363,
+      "grad_norm": 0.8926665186882019,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 18363
+    },
+    {
+      "epoch": 0.18364,
+      "grad_norm": 0.9242895245552063,
+      "learning_rate": 0.003,
+      "loss": 4.0468,
+      "step": 18364
+    },
+    {
+      "epoch": 0.18365,
+      "grad_norm": 0.8577793836593628,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 18365
+    },
+    {
+      "epoch": 0.18366,
+      "grad_norm": 0.8163846731185913,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 18366
+    },
+    {
+      "epoch": 0.18367,
+      "grad_norm": 0.7950718402862549,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 18367
+    },
+    {
+      "epoch": 0.18368,
+      "grad_norm": 0.8748380541801453,
+      "learning_rate": 0.003,
+      "loss": 3.985,
+      "step": 18368
+    },
+    {
+      "epoch": 0.18369,
+      "grad_norm": 0.9596766829490662,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 18369
+    },
+    {
+      "epoch": 0.1837,
+      "grad_norm": 0.9462250471115112,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 18370
+    },
+    {
+      "epoch": 0.18371,
+      "grad_norm": 0.954529345035553,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 18371
+    },
+    {
+      "epoch": 0.18372,
+      "grad_norm": 0.8737909197807312,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 18372
+    },
+    {
+      "epoch": 0.18373,
+      "grad_norm": 0.8716749548912048,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 18373
+    },
+    {
+      "epoch": 0.18374,
+      "grad_norm": 0.8134250044822693,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 18374
+    },
+    {
+      "epoch": 0.18375,
+      "grad_norm": 0.9541025757789612,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 18375
+    },
+    {
+      "epoch": 0.18376,
+      "grad_norm": 0.9908921718597412,
+      "learning_rate": 0.003,
+      "loss": 4.036,
+      "step": 18376
+    },
+    {
+      "epoch": 0.18377,
+      "grad_norm": 1.1104575395584106,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 18377
+    },
+    {
+      "epoch": 0.18378,
+      "grad_norm": 0.9030351042747498,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 18378
+    },
+    {
+      "epoch": 0.18379,
+      "grad_norm": 0.851387619972229,
+      "learning_rate": 0.003,
+      "loss": 4.0206,
+      "step": 18379
+    },
+    {
+      "epoch": 0.1838,
+      "grad_norm": 0.8044515252113342,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 18380
+    },
+    {
+      "epoch": 0.18381,
+      "grad_norm": 0.7882006764411926,
+      "learning_rate": 0.003,
+      "loss": 4.0331,
+      "step": 18381
+    },
+    {
+      "epoch": 0.18382,
+      "grad_norm": 0.6093977093696594,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 18382
+    },
+    {
+      "epoch": 0.18383,
+      "grad_norm": 0.52236008644104,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 18383
+    },
+    {
+      "epoch": 0.18384,
+      "grad_norm": 0.47218433022499084,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 18384
+    },
+    {
+      "epoch": 0.18385,
+      "grad_norm": 0.5042617917060852,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 18385
+    },
+    {
+      "epoch": 0.18386,
+      "grad_norm": 0.5072876811027527,
+      "learning_rate": 0.003,
+      "loss": 3.9718,
+      "step": 18386
+    },
+    {
+      "epoch": 0.18387,
+      "grad_norm": 0.5613670349121094,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 18387
+    },
+    {
+      "epoch": 0.18388,
+      "grad_norm": 0.5694621205329895,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 18388
+    },
+    {
+      "epoch": 0.18389,
+      "grad_norm": 0.6198713183403015,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 18389
+    },
+    {
+      "epoch": 0.1839,
+      "grad_norm": 0.694005012512207,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 18390
+    },
+    {
+      "epoch": 0.18391,
+      "grad_norm": 0.781154215335846,
+      "learning_rate": 0.003,
+      "loss": 3.9826,
+      "step": 18391
+    },
+    {
+      "epoch": 0.18392,
+      "grad_norm": 0.8123658895492554,
+      "learning_rate": 0.003,
+      "loss": 3.97,
+      "step": 18392
+    },
+    {
+      "epoch": 0.18393,
+      "grad_norm": 0.9070790410041809,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 18393
+    },
+    {
+      "epoch": 0.18394,
+      "grad_norm": 1.081210970878601,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 18394
+    },
+    {
+      "epoch": 0.18395,
+      "grad_norm": 1.0967609882354736,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 18395
+    },
+    {
+      "epoch": 0.18396,
+      "grad_norm": 1.050869345664978,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 18396
+    },
+    {
+      "epoch": 0.18397,
+      "grad_norm": 1.079639196395874,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 18397
+    },
+    {
+      "epoch": 0.18398,
+      "grad_norm": 0.7882301807403564,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 18398
+    },
+    {
+      "epoch": 0.18399,
+      "grad_norm": 0.63349848985672,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 18399
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.8485949635505676,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 18400
+    },
+    {
+      "epoch": 0.18401,
+      "grad_norm": 0.8544250130653381,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 18401
+    },
+    {
+      "epoch": 0.18402,
+      "grad_norm": 0.7469441890716553,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 18402
+    },
+    {
+      "epoch": 0.18403,
+      "grad_norm": 0.9119636416435242,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 18403
+    },
+    {
+      "epoch": 0.18404,
+      "grad_norm": 1.0456124544143677,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 18404
+    },
+    {
+      "epoch": 0.18405,
+      "grad_norm": 1.1618139743804932,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 18405
+    },
+    {
+      "epoch": 0.18406,
+      "grad_norm": 0.9747986793518066,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 18406
+    },
+    {
+      "epoch": 0.18407,
+      "grad_norm": 0.9599617123603821,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 18407
+    },
+    {
+      "epoch": 0.18408,
+      "grad_norm": 1.067495346069336,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 18408
+    },
+    {
+      "epoch": 0.18409,
+      "grad_norm": 0.8262394666671753,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 18409
+    },
+    {
+      "epoch": 0.1841,
+      "grad_norm": 0.7933745384216309,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 18410
+    },
+    {
+      "epoch": 0.18411,
+      "grad_norm": 0.8479217886924744,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 18411
+    },
+    {
+      "epoch": 0.18412,
+      "grad_norm": 0.855431079864502,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 18412
+    },
+    {
+      "epoch": 0.18413,
+      "grad_norm": 0.8933448791503906,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 18413
+    },
+    {
+      "epoch": 0.18414,
+      "grad_norm": 0.9668148756027222,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 18414
+    },
+    {
+      "epoch": 0.18415,
+      "grad_norm": 1.121019959449768,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18415
+    },
+    {
+      "epoch": 0.18416,
+      "grad_norm": 0.9086769819259644,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 18416
+    },
+    {
+      "epoch": 0.18417,
+      "grad_norm": 0.9084094166755676,
+      "learning_rate": 0.003,
+      "loss": 4.0404,
+      "step": 18417
+    },
+    {
+      "epoch": 0.18418,
+      "grad_norm": 0.8724955916404724,
+      "learning_rate": 0.003,
+      "loss": 3.9885,
+      "step": 18418
+    },
+    {
+      "epoch": 0.18419,
+      "grad_norm": 0.7465152144432068,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 18419
+    },
+    {
+      "epoch": 0.1842,
+      "grad_norm": 0.768031895160675,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 18420
+    },
+    {
+      "epoch": 0.18421,
+      "grad_norm": 0.8390229940414429,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 18421
+    },
+    {
+      "epoch": 0.18422,
+      "grad_norm": 0.8336851596832275,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 18422
+    },
+    {
+      "epoch": 0.18423,
+      "grad_norm": 0.7277620434761047,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 18423
+    },
+    {
+      "epoch": 0.18424,
+      "grad_norm": 0.7897154688835144,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 18424
+    },
+    {
+      "epoch": 0.18425,
+      "grad_norm": 1.0583573579788208,
+      "learning_rate": 0.003,
+      "loss": 3.9679,
+      "step": 18425
+    },
+    {
+      "epoch": 0.18426,
+      "grad_norm": 1.0781868696212769,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 18426
+    },
+    {
+      "epoch": 0.18427,
+      "grad_norm": 0.8976330161094666,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 18427
+    },
+    {
+      "epoch": 0.18428,
+      "grad_norm": 0.7958218455314636,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 18428
+    },
+    {
+      "epoch": 0.18429,
+      "grad_norm": 0.6814959645271301,
+      "learning_rate": 0.003,
+      "loss": 4.0117,
+      "step": 18429
+    },
+    {
+      "epoch": 0.1843,
+      "grad_norm": 0.6394864916801453,
+      "learning_rate": 0.003,
+      "loss": 3.9754,
+      "step": 18430
+    },
+    {
+      "epoch": 0.18431,
+      "grad_norm": 0.685542643070221,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 18431
+    },
+    {
+      "epoch": 0.18432,
+      "grad_norm": 0.8013325929641724,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 18432
+    },
+    {
+      "epoch": 0.18433,
+      "grad_norm": 0.8234718441963196,
+      "learning_rate": 0.003,
+      "loss": 3.9803,
+      "step": 18433
+    },
+    {
+      "epoch": 0.18434,
+      "grad_norm": 0.7327991724014282,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 18434
+    },
+    {
+      "epoch": 0.18435,
+      "grad_norm": 0.6646810173988342,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 18435
+    },
+    {
+      "epoch": 0.18436,
+      "grad_norm": 0.7256752848625183,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 18436
+    },
+    {
+      "epoch": 0.18437,
+      "grad_norm": 0.8754481673240662,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 18437
+    },
+    {
+      "epoch": 0.18438,
+      "grad_norm": 0.9004335999488831,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 18438
+    },
+    {
+      "epoch": 0.18439,
+      "grad_norm": 0.8405164480209351,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 18439
+    },
+    {
+      "epoch": 0.1844,
+      "grad_norm": 0.8154463768005371,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 18440
+    },
+    {
+      "epoch": 0.18441,
+      "grad_norm": 0.9057301878929138,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 18441
+    },
+    {
+      "epoch": 0.18442,
+      "grad_norm": 1.1145095825195312,
+      "learning_rate": 0.003,
+      "loss": 3.9686,
+      "step": 18442
+    },
+    {
+      "epoch": 0.18443,
+      "grad_norm": 0.9151156544685364,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 18443
+    },
+    {
+      "epoch": 0.18444,
+      "grad_norm": 0.9163838028907776,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 18444
+    },
+    {
+      "epoch": 0.18445,
+      "grad_norm": 1.0952199697494507,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 18445
+    },
+    {
+      "epoch": 0.18446,
+      "grad_norm": 0.7924932837486267,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 18446
+    },
+    {
+      "epoch": 0.18447,
+      "grad_norm": 0.7247111201286316,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 18447
+    },
+    {
+      "epoch": 0.18448,
+      "grad_norm": 0.7970155477523804,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 18448
+    },
+    {
+      "epoch": 0.18449,
+      "grad_norm": 0.759702742099762,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 18449
+    },
+    {
+      "epoch": 0.1845,
+      "grad_norm": 0.7928878664970398,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 18450
+    },
+    {
+      "epoch": 0.18451,
+      "grad_norm": 0.7595303058624268,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 18451
+    },
+    {
+      "epoch": 0.18452,
+      "grad_norm": 0.7852331399917603,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 18452
+    },
+    {
+      "epoch": 0.18453,
+      "grad_norm": 0.9528810977935791,
+      "learning_rate": 0.003,
+      "loss": 4.0486,
+      "step": 18453
+    },
+    {
+      "epoch": 0.18454,
+      "grad_norm": 1.0351529121398926,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 18454
+    },
+    {
+      "epoch": 0.18455,
+      "grad_norm": 0.801206111907959,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 18455
+    },
+    {
+      "epoch": 0.18456,
+      "grad_norm": 0.7646304965019226,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 18456
+    },
+    {
+      "epoch": 0.18457,
+      "grad_norm": 0.7039245963096619,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 18457
+    },
+    {
+      "epoch": 0.18458,
+      "grad_norm": 0.6283014416694641,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 18458
+    },
+    {
+      "epoch": 0.18459,
+      "grad_norm": 0.6567989587783813,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 18459
+    },
+    {
+      "epoch": 0.1846,
+      "grad_norm": 0.9653926491737366,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 18460
+    },
+    {
+      "epoch": 0.18461,
+      "grad_norm": 1.556738257408142,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 18461
+    },
+    {
+      "epoch": 0.18462,
+      "grad_norm": 0.5955246686935425,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 18462
+    },
+    {
+      "epoch": 0.18463,
+      "grad_norm": 0.7615717649459839,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 18463
+    },
+    {
+      "epoch": 0.18464,
+      "grad_norm": 0.8574117422103882,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 18464
+    },
+    {
+      "epoch": 0.18465,
+      "grad_norm": 1.0039912462234497,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 18465
+    },
+    {
+      "epoch": 0.18466,
+      "grad_norm": 1.3914353847503662,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 18466
+    },
+    {
+      "epoch": 0.18467,
+      "grad_norm": 0.5892895460128784,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 18467
+    },
+    {
+      "epoch": 0.18468,
+      "grad_norm": 0.8511627316474915,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 18468
+    },
+    {
+      "epoch": 0.18469,
+      "grad_norm": 0.9471988677978516,
+      "learning_rate": 0.003,
+      "loss": 4.0334,
+      "step": 18469
+    },
+    {
+      "epoch": 0.1847,
+      "grad_norm": 1.0093071460723877,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 18470
+    },
+    {
+      "epoch": 0.18471,
+      "grad_norm": 1.2240053415298462,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 18471
+    },
+    {
+      "epoch": 0.18472,
+      "grad_norm": 0.7324399948120117,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 18472
+    },
+    {
+      "epoch": 0.18473,
+      "grad_norm": 0.6687946915626526,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 18473
+    },
+    {
+      "epoch": 0.18474,
+      "grad_norm": 0.8151763081550598,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 18474
+    },
+    {
+      "epoch": 0.18475,
+      "grad_norm": 0.8017345666885376,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 18475
+    },
+    {
+      "epoch": 0.18476,
+      "grad_norm": 0.8550369739532471,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 18476
+    },
+    {
+      "epoch": 0.18477,
+      "grad_norm": 1.0418061017990112,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 18477
+    },
+    {
+      "epoch": 0.18478,
+      "grad_norm": 0.9475833773612976,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 18478
+    },
+    {
+      "epoch": 0.18479,
+      "grad_norm": 1.0286717414855957,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 18479
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 1.016998291015625,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 18480
+    },
+    {
+      "epoch": 0.18481,
+      "grad_norm": 1.0216963291168213,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 18481
+    },
+    {
+      "epoch": 0.18482,
+      "grad_norm": 0.9507943391799927,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 18482
+    },
+    {
+      "epoch": 0.18483,
+      "grad_norm": 0.8995311260223389,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 18483
+    },
+    {
+      "epoch": 0.18484,
+      "grad_norm": 0.7644209265708923,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 18484
+    },
+    {
+      "epoch": 0.18485,
+      "grad_norm": 0.9264317154884338,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 18485
+    },
+    {
+      "epoch": 0.18486,
+      "grad_norm": 0.9133664965629578,
+      "learning_rate": 0.003,
+      "loss": 3.9792,
+      "step": 18486
+    },
+    {
+      "epoch": 0.18487,
+      "grad_norm": 0.9255028963088989,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 18487
+    },
+    {
+      "epoch": 0.18488,
+      "grad_norm": 0.9390118718147278,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 18488
+    },
+    {
+      "epoch": 0.18489,
+      "grad_norm": 1.0404934883117676,
+      "learning_rate": 0.003,
+      "loss": 4.0271,
+      "step": 18489
+    },
+    {
+      "epoch": 0.1849,
+      "grad_norm": 0.8677457571029663,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 18490
+    },
+    {
+      "epoch": 0.18491,
+      "grad_norm": 0.7368670105934143,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 18491
+    },
+    {
+      "epoch": 0.18492,
+      "grad_norm": 0.6713873744010925,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 18492
+    },
+    {
+      "epoch": 0.18493,
+      "grad_norm": 0.6910095810890198,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 18493
+    },
+    {
+      "epoch": 0.18494,
+      "grad_norm": 0.6025274991989136,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 18494
+    },
+    {
+      "epoch": 0.18495,
+      "grad_norm": 0.6360000371932983,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 18495
+    },
+    {
+      "epoch": 0.18496,
+      "grad_norm": 0.7263234853744507,
+      "learning_rate": 0.003,
+      "loss": 3.9606,
+      "step": 18496
+    },
+    {
+      "epoch": 0.18497,
+      "grad_norm": 0.7698788046836853,
+      "learning_rate": 0.003,
+      "loss": 3.9817,
+      "step": 18497
+    },
+    {
+      "epoch": 0.18498,
+      "grad_norm": 0.9591576457023621,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 18498
+    },
+    {
+      "epoch": 0.18499,
+      "grad_norm": 1.0750406980514526,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 18499
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.9537850022315979,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 18500
+    },
+    {
+      "epoch": 0.18501,
+      "grad_norm": 0.8107244372367859,
+      "learning_rate": 0.003,
+      "loss": 3.9678,
+      "step": 18501
+    },
+    {
+      "epoch": 0.18502,
+      "grad_norm": 0.5002862215042114,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 18502
+    },
+    {
+      "epoch": 0.18503,
+      "grad_norm": 0.5681113600730896,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 18503
+    },
+    {
+      "epoch": 0.18504,
+      "grad_norm": 0.5782869458198547,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 18504
+    },
+    {
+      "epoch": 0.18505,
+      "grad_norm": 0.6032190322875977,
+      "learning_rate": 0.003,
+      "loss": 3.9756,
+      "step": 18505
+    },
+    {
+      "epoch": 0.18506,
+      "grad_norm": 0.759202778339386,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 18506
+    },
+    {
+      "epoch": 0.18507,
+      "grad_norm": 0.9024019837379456,
+      "learning_rate": 0.003,
+      "loss": 3.9818,
+      "step": 18507
+    },
+    {
+      "epoch": 0.18508,
+      "grad_norm": 0.9643970727920532,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18508
+    },
+    {
+      "epoch": 0.18509,
+      "grad_norm": 1.0283045768737793,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 18509
+    },
+    {
+      "epoch": 0.1851,
+      "grad_norm": 0.8149793744087219,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 18510
+    },
+    {
+      "epoch": 0.18511,
+      "grad_norm": 0.6232828497886658,
+      "learning_rate": 0.003,
+      "loss": 4.0312,
+      "step": 18511
+    },
+    {
+      "epoch": 0.18512,
+      "grad_norm": 0.6650475859642029,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 18512
+    },
+    {
+      "epoch": 0.18513,
+      "grad_norm": 0.6731092929840088,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 18513
+    },
+    {
+      "epoch": 0.18514,
+      "grad_norm": 0.7437854409217834,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 18514
+    },
+    {
+      "epoch": 0.18515,
+      "grad_norm": 0.7701714038848877,
+      "learning_rate": 0.003,
+      "loss": 3.9541,
+      "step": 18515
+    },
+    {
+      "epoch": 0.18516,
+      "grad_norm": 0.6693366765975952,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 18516
+    },
+    {
+      "epoch": 0.18517,
+      "grad_norm": 0.7216953635215759,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 18517
+    },
+    {
+      "epoch": 0.18518,
+      "grad_norm": 0.7274855375289917,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 18518
+    },
+    {
+      "epoch": 0.18519,
+      "grad_norm": 0.7271580696105957,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 18519
+    },
+    {
+      "epoch": 0.1852,
+      "grad_norm": 0.7157036662101746,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 18520
+    },
+    {
+      "epoch": 0.18521,
+      "grad_norm": 0.6716972589492798,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 18521
+    },
+    {
+      "epoch": 0.18522,
+      "grad_norm": 0.634327232837677,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 18522
+    },
+    {
+      "epoch": 0.18523,
+      "grad_norm": 0.756804883480072,
+      "learning_rate": 0.003,
+      "loss": 3.9805,
+      "step": 18523
+    },
+    {
+      "epoch": 0.18524,
+      "grad_norm": 0.978564441204071,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 18524
+    },
+    {
+      "epoch": 0.18525,
+      "grad_norm": 0.9920554757118225,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 18525
+    },
+    {
+      "epoch": 0.18526,
+      "grad_norm": 0.9714943170547485,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 18526
+    },
+    {
+      "epoch": 0.18527,
+      "grad_norm": 0.8749993443489075,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 18527
+    },
+    {
+      "epoch": 0.18528,
+      "grad_norm": 0.9511075615882874,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 18528
+    },
+    {
+      "epoch": 0.18529,
+      "grad_norm": 1.1085911989212036,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 18529
+    },
+    {
+      "epoch": 0.1853,
+      "grad_norm": 0.9463942050933838,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 18530
+    },
+    {
+      "epoch": 0.18531,
+      "grad_norm": 0.9208247065544128,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 18531
+    },
+    {
+      "epoch": 0.18532,
+      "grad_norm": 0.9658119678497314,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 18532
+    },
+    {
+      "epoch": 0.18533,
+      "grad_norm": 1.1414157152175903,
+      "learning_rate": 0.003,
+      "loss": 4.0324,
+      "step": 18533
+    },
+    {
+      "epoch": 0.18534,
+      "grad_norm": 0.9629420042037964,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 18534
+    },
+    {
+      "epoch": 0.18535,
+      "grad_norm": 0.9314190149307251,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 18535
+    },
+    {
+      "epoch": 0.18536,
+      "grad_norm": 0.9639930129051208,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 18536
+    },
+    {
+      "epoch": 0.18537,
+      "grad_norm": 0.9416884183883667,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 18537
+    },
+    {
+      "epoch": 0.18538,
+      "grad_norm": 0.8365035653114319,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18538
+    },
+    {
+      "epoch": 0.18539,
+      "grad_norm": 0.908024787902832,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 18539
+    },
+    {
+      "epoch": 0.1854,
+      "grad_norm": 0.9564443826675415,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 18540
+    },
+    {
+      "epoch": 0.18541,
+      "grad_norm": 1.1441367864608765,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 18541
+    },
+    {
+      "epoch": 0.18542,
+      "grad_norm": 0.7297008037567139,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 18542
+    },
+    {
+      "epoch": 0.18543,
+      "grad_norm": 0.6466062068939209,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 18543
+    },
+    {
+      "epoch": 0.18544,
+      "grad_norm": 0.8631340265274048,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 18544
+    },
+    {
+      "epoch": 0.18545,
+      "grad_norm": 0.8763319253921509,
+      "learning_rate": 0.003,
+      "loss": 3.9689,
+      "step": 18545
+    },
+    {
+      "epoch": 0.18546,
+      "grad_norm": 0.8306936025619507,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 18546
+    },
+    {
+      "epoch": 0.18547,
+      "grad_norm": 0.7532835006713867,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 18547
+    },
+    {
+      "epoch": 0.18548,
+      "grad_norm": 0.6749123930931091,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 18548
+    },
+    {
+      "epoch": 0.18549,
+      "grad_norm": 0.7876031398773193,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 18549
+    },
+    {
+      "epoch": 0.1855,
+      "grad_norm": 0.8753268718719482,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 18550
+    },
+    {
+      "epoch": 0.18551,
+      "grad_norm": 1.1003737449645996,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 18551
+    },
+    {
+      "epoch": 0.18552,
+      "grad_norm": 0.9563855528831482,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 18552
+    },
+    {
+      "epoch": 0.18553,
+      "grad_norm": 0.9888499975204468,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 18553
+    },
+    {
+      "epoch": 0.18554,
+      "grad_norm": 0.88606196641922,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 18554
+    },
+    {
+      "epoch": 0.18555,
+      "grad_norm": 0.6885478496551514,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 18555
+    },
+    {
+      "epoch": 0.18556,
+      "grad_norm": 0.7235888838768005,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 18556
+    },
+    {
+      "epoch": 0.18557,
+      "grad_norm": 0.7943369150161743,
+      "learning_rate": 0.003,
+      "loss": 3.9752,
+      "step": 18557
+    },
+    {
+      "epoch": 0.18558,
+      "grad_norm": 0.8080901503562927,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 18558
+    },
+    {
+      "epoch": 0.18559,
+      "grad_norm": 0.6975395679473877,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 18559
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.624427855014801,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 18560
+    },
+    {
+      "epoch": 0.18561,
+      "grad_norm": 0.5854123830795288,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 18561
+    },
+    {
+      "epoch": 0.18562,
+      "grad_norm": 0.731335461139679,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 18562
+    },
+    {
+      "epoch": 0.18563,
+      "grad_norm": 0.8278244733810425,
+      "learning_rate": 0.003,
+      "loss": 3.9679,
+      "step": 18563
+    },
+    {
+      "epoch": 0.18564,
+      "grad_norm": 0.8942627906799316,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 18564
+    },
+    {
+      "epoch": 0.18565,
+      "grad_norm": 1.1789724826812744,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 18565
+    },
+    {
+      "epoch": 0.18566,
+      "grad_norm": 0.8848671913146973,
+      "learning_rate": 0.003,
+      "loss": 3.9784,
+      "step": 18566
+    },
+    {
+      "epoch": 0.18567,
+      "grad_norm": 0.6374181509017944,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 18567
+    },
+    {
+      "epoch": 0.18568,
+      "grad_norm": 0.49829450249671936,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 18568
+    },
+    {
+      "epoch": 0.18569,
+      "grad_norm": 0.5674188137054443,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 18569
+    },
+    {
+      "epoch": 0.1857,
+      "grad_norm": 0.6359483003616333,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 18570
+    },
+    {
+      "epoch": 0.18571,
+      "grad_norm": 0.6313909888267517,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 18571
+    },
+    {
+      "epoch": 0.18572,
+      "grad_norm": 0.6858205199241638,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 18572
+    },
+    {
+      "epoch": 0.18573,
+      "grad_norm": 0.8699098825454712,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 18573
+    },
+    {
+      "epoch": 0.18574,
+      "grad_norm": 1.1460634469985962,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 18574
+    },
+    {
+      "epoch": 0.18575,
+      "grad_norm": 0.8282952308654785,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 18575
+    },
+    {
+      "epoch": 0.18576,
+      "grad_norm": 0.8061177730560303,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 18576
+    },
+    {
+      "epoch": 0.18577,
+      "grad_norm": 0.8096114993095398,
+      "learning_rate": 0.003,
+      "loss": 3.9774,
+      "step": 18577
+    },
+    {
+      "epoch": 0.18578,
+      "grad_norm": 0.764124870300293,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 18578
+    },
+    {
+      "epoch": 0.18579,
+      "grad_norm": 0.7354459762573242,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 18579
+    },
+    {
+      "epoch": 0.1858,
+      "grad_norm": 0.917849063873291,
+      "learning_rate": 0.003,
+      "loss": 3.9665,
+      "step": 18580
+    },
+    {
+      "epoch": 0.18581,
+      "grad_norm": 1.1212433576583862,
+      "learning_rate": 0.003,
+      "loss": 3.973,
+      "step": 18581
+    },
+    {
+      "epoch": 0.18582,
+      "grad_norm": 1.0859284400939941,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 18582
+    },
+    {
+      "epoch": 0.18583,
+      "grad_norm": 0.7617725133895874,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 18583
+    },
+    {
+      "epoch": 0.18584,
+      "grad_norm": 0.6631203293800354,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 18584
+    },
+    {
+      "epoch": 0.18585,
+      "grad_norm": 0.7645079493522644,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 18585
+    },
+    {
+      "epoch": 0.18586,
+      "grad_norm": 0.7495100498199463,
+      "learning_rate": 0.003,
+      "loss": 3.9782,
+      "step": 18586
+    },
+    {
+      "epoch": 0.18587,
+      "grad_norm": 0.810477077960968,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 18587
+    },
+    {
+      "epoch": 0.18588,
+      "grad_norm": 0.8579960465431213,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 18588
+    },
+    {
+      "epoch": 0.18589,
+      "grad_norm": 0.8831098675727844,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 18589
+    },
+    {
+      "epoch": 0.1859,
+      "grad_norm": 0.8370000720024109,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 18590
+    },
+    {
+      "epoch": 0.18591,
+      "grad_norm": 0.8281912803649902,
+      "learning_rate": 0.003,
+      "loss": 3.973,
+      "step": 18591
+    },
+    {
+      "epoch": 0.18592,
+      "grad_norm": 0.926630437374115,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 18592
+    },
+    {
+      "epoch": 0.18593,
+      "grad_norm": 0.9408486485481262,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 18593
+    },
+    {
+      "epoch": 0.18594,
+      "grad_norm": 1.0552456378936768,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 18594
+    },
+    {
+      "epoch": 0.18595,
+      "grad_norm": 1.0246379375457764,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 18595
+    },
+    {
+      "epoch": 0.18596,
+      "grad_norm": 0.9521511793136597,
+      "learning_rate": 0.003,
+      "loss": 4.0407,
+      "step": 18596
+    },
+    {
+      "epoch": 0.18597,
+      "grad_norm": 0.8722307682037354,
+      "learning_rate": 0.003,
+      "loss": 4.0337,
+      "step": 18597
+    },
+    {
+      "epoch": 0.18598,
+      "grad_norm": 0.93885338306427,
+      "learning_rate": 0.003,
+      "loss": 4.0411,
+      "step": 18598
+    },
+    {
+      "epoch": 0.18599,
+      "grad_norm": 1.083472728729248,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 18599
+    },
+    {
+      "epoch": 0.186,
+      "grad_norm": 1.032609462738037,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 18600
+    },
+    {
+      "epoch": 0.18601,
+      "grad_norm": 0.8510088324546814,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 18601
+    },
+    {
+      "epoch": 0.18602,
+      "grad_norm": 0.8725912570953369,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 18602
+    },
+    {
+      "epoch": 0.18603,
+      "grad_norm": 1.2701548337936401,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 18603
+    },
+    {
+      "epoch": 0.18604,
+      "grad_norm": 1.0352787971496582,
+      "learning_rate": 0.003,
+      "loss": 4.0429,
+      "step": 18604
+    },
+    {
+      "epoch": 0.18605,
+      "grad_norm": 0.9110256433486938,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 18605
+    },
+    {
+      "epoch": 0.18606,
+      "grad_norm": 0.8785024285316467,
+      "learning_rate": 0.003,
+      "loss": 4.0303,
+      "step": 18606
+    },
+    {
+      "epoch": 0.18607,
+      "grad_norm": 0.7071681022644043,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 18607
+    },
+    {
+      "epoch": 0.18608,
+      "grad_norm": 0.6964113116264343,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 18608
+    },
+    {
+      "epoch": 0.18609,
+      "grad_norm": 0.654489278793335,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 18609
+    },
+    {
+      "epoch": 0.1861,
+      "grad_norm": 0.6638596653938293,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 18610
+    },
+    {
+      "epoch": 0.18611,
+      "grad_norm": 0.6475112438201904,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 18611
+    },
+    {
+      "epoch": 0.18612,
+      "grad_norm": 0.6679487824440002,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 18612
+    },
+    {
+      "epoch": 0.18613,
+      "grad_norm": 0.7296201586723328,
+      "learning_rate": 0.003,
+      "loss": 4.0397,
+      "step": 18613
+    },
+    {
+      "epoch": 0.18614,
+      "grad_norm": 0.7591899037361145,
+      "learning_rate": 0.003,
+      "loss": 3.9601,
+      "step": 18614
+    },
+    {
+      "epoch": 0.18615,
+      "grad_norm": 0.8773738145828247,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 18615
+    },
+    {
+      "epoch": 0.18616,
+      "grad_norm": 0.9669382572174072,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 18616
+    },
+    {
+      "epoch": 0.18617,
+      "grad_norm": 1.0774128437042236,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 18617
+    },
+    {
+      "epoch": 0.18618,
+      "grad_norm": 0.9947457909584045,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 18618
+    },
+    {
+      "epoch": 0.18619,
+      "grad_norm": 1.021847128868103,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 18619
+    },
+    {
+      "epoch": 0.1862,
+      "grad_norm": 0.8634188175201416,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 18620
+    },
+    {
+      "epoch": 0.18621,
+      "grad_norm": 0.8271879553794861,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 18621
+    },
+    {
+      "epoch": 0.18622,
+      "grad_norm": 0.871752142906189,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 18622
+    },
+    {
+      "epoch": 0.18623,
+      "grad_norm": 0.7900510430335999,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 18623
+    },
+    {
+      "epoch": 0.18624,
+      "grad_norm": 0.7515827417373657,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 18624
+    },
+    {
+      "epoch": 0.18625,
+      "grad_norm": 0.6824390888214111,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 18625
+    },
+    {
+      "epoch": 0.18626,
+      "grad_norm": 0.7836955785751343,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 18626
+    },
+    {
+      "epoch": 0.18627,
+      "grad_norm": 0.9825575947761536,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 18627
+    },
+    {
+      "epoch": 0.18628,
+      "grad_norm": 1.022945761680603,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 18628
+    },
+    {
+      "epoch": 0.18629,
+      "grad_norm": 1.023468255996704,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 18629
+    },
+    {
+      "epoch": 0.1863,
+      "grad_norm": 0.9927089810371399,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 18630
+    },
+    {
+      "epoch": 0.18631,
+      "grad_norm": 0.9182254076004028,
+      "learning_rate": 0.003,
+      "loss": 4.0392,
+      "step": 18631
+    },
+    {
+      "epoch": 0.18632,
+      "grad_norm": 0.9914144277572632,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 18632
+    },
+    {
+      "epoch": 0.18633,
+      "grad_norm": 1.050477385520935,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 18633
+    },
+    {
+      "epoch": 0.18634,
+      "grad_norm": 0.7706889510154724,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 18634
+    },
+    {
+      "epoch": 0.18635,
+      "grad_norm": 0.6501280069351196,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 18635
+    },
+    {
+      "epoch": 0.18636,
+      "grad_norm": 0.5715956687927246,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 18636
+    },
+    {
+      "epoch": 0.18637,
+      "grad_norm": 0.538364589214325,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 18637
+    },
+    {
+      "epoch": 0.18638,
+      "grad_norm": 0.556035578250885,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 18638
+    },
+    {
+      "epoch": 0.18639,
+      "grad_norm": 0.5113933086395264,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 18639
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.4925510585308075,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 18640
+    },
+    {
+      "epoch": 0.18641,
+      "grad_norm": 0.4664783179759979,
+      "learning_rate": 0.003,
+      "loss": 3.9731,
+      "step": 18641
+    },
+    {
+      "epoch": 0.18642,
+      "grad_norm": 0.5348178744316101,
+      "learning_rate": 0.003,
+      "loss": 3.9718,
+      "step": 18642
+    },
+    {
+      "epoch": 0.18643,
+      "grad_norm": 0.6688624620437622,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 18643
+    },
+    {
+      "epoch": 0.18644,
+      "grad_norm": 0.7310923337936401,
+      "learning_rate": 0.003,
+      "loss": 3.9813,
+      "step": 18644
+    },
+    {
+      "epoch": 0.18645,
+      "grad_norm": 0.8424361348152161,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 18645
+    },
+    {
+      "epoch": 0.18646,
+      "grad_norm": 0.922663152217865,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 18646
+    },
+    {
+      "epoch": 0.18647,
+      "grad_norm": 0.9829749464988708,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 18647
+    },
+    {
+      "epoch": 0.18648,
+      "grad_norm": 1.1478034257888794,
+      "learning_rate": 0.003,
+      "loss": 3.9918,
+      "step": 18648
+    },
+    {
+      "epoch": 0.18649,
+      "grad_norm": 0.9126496315002441,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 18649
+    },
+    {
+      "epoch": 0.1865,
+      "grad_norm": 0.9763796925544739,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 18650
+    },
+    {
+      "epoch": 0.18651,
+      "grad_norm": 1.0723530054092407,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 18651
+    },
+    {
+      "epoch": 0.18652,
+      "grad_norm": 0.7517349123954773,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 18652
+    },
+    {
+      "epoch": 0.18653,
+      "grad_norm": 0.6144676804542542,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 18653
+    },
+    {
+      "epoch": 0.18654,
+      "grad_norm": 0.6148786544799805,
+      "learning_rate": 0.003,
+      "loss": 3.9803,
+      "step": 18654
+    },
+    {
+      "epoch": 0.18655,
+      "grad_norm": 0.8592705726623535,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 18655
+    },
+    {
+      "epoch": 0.18656,
+      "grad_norm": 1.053800344467163,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 18656
+    },
+    {
+      "epoch": 0.18657,
+      "grad_norm": 0.9127708673477173,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 18657
+    },
+    {
+      "epoch": 0.18658,
+      "grad_norm": 0.7173557281494141,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 18658
+    },
+    {
+      "epoch": 0.18659,
+      "grad_norm": 0.6722642779350281,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 18659
+    },
+    {
+      "epoch": 0.1866,
+      "grad_norm": 0.7511748671531677,
+      "learning_rate": 0.003,
+      "loss": 3.9771,
+      "step": 18660
+    },
+    {
+      "epoch": 0.18661,
+      "grad_norm": 0.8174583315849304,
+      "learning_rate": 0.003,
+      "loss": 3.9699,
+      "step": 18661
+    },
+    {
+      "epoch": 0.18662,
+      "grad_norm": 0.9325646162033081,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 18662
+    },
+    {
+      "epoch": 0.18663,
+      "grad_norm": 0.9656684398651123,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 18663
+    },
+    {
+      "epoch": 0.18664,
+      "grad_norm": 0.9210911989212036,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 18664
+    },
+    {
+      "epoch": 0.18665,
+      "grad_norm": 1.0134801864624023,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 18665
+    },
+    {
+      "epoch": 0.18666,
+      "grad_norm": 1.0047428607940674,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 18666
+    },
+    {
+      "epoch": 0.18667,
+      "grad_norm": 0.9935968518257141,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 18667
+    },
+    {
+      "epoch": 0.18668,
+      "grad_norm": 1.0019229650497437,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 18668
+    },
+    {
+      "epoch": 0.18669,
+      "grad_norm": 1.104162335395813,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 18669
+    },
+    {
+      "epoch": 0.1867,
+      "grad_norm": 0.913928747177124,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 18670
+    },
+    {
+      "epoch": 0.18671,
+      "grad_norm": 0.9824994206428528,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 18671
+    },
+    {
+      "epoch": 0.18672,
+      "grad_norm": 1.0232406854629517,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 18672
+    },
+    {
+      "epoch": 0.18673,
+      "grad_norm": 1.027138113975525,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 18673
+    },
+    {
+      "epoch": 0.18674,
+      "grad_norm": 1.1209120750427246,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 18674
+    },
+    {
+      "epoch": 0.18675,
+      "grad_norm": 0.9312422275543213,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 18675
+    },
+    {
+      "epoch": 0.18676,
+      "grad_norm": 0.7721902132034302,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 18676
+    },
+    {
+      "epoch": 0.18677,
+      "grad_norm": 0.8749040961265564,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 18677
+    },
+    {
+      "epoch": 0.18678,
+      "grad_norm": 0.8137281537055969,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 18678
+    },
+    {
+      "epoch": 0.18679,
+      "grad_norm": 0.7488441467285156,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 18679
+    },
+    {
+      "epoch": 0.1868,
+      "grad_norm": 0.6902167797088623,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 18680
+    },
+    {
+      "epoch": 0.18681,
+      "grad_norm": 0.6958896517753601,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 18681
+    },
+    {
+      "epoch": 0.18682,
+      "grad_norm": 0.7173922657966614,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 18682
+    },
+    {
+      "epoch": 0.18683,
+      "grad_norm": 0.7283394932746887,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 18683
+    },
+    {
+      "epoch": 0.18684,
+      "grad_norm": 0.7280787825584412,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 18684
+    },
+    {
+      "epoch": 0.18685,
+      "grad_norm": 0.7102197408676147,
+      "learning_rate": 0.003,
+      "loss": 3.9908,
+      "step": 18685
+    },
+    {
+      "epoch": 0.18686,
+      "grad_norm": 0.7416349649429321,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 18686
+    },
+    {
+      "epoch": 0.18687,
+      "grad_norm": 0.7413874268531799,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 18687
+    },
+    {
+      "epoch": 0.18688,
+      "grad_norm": 0.7215458154678345,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 18688
+    },
+    {
+      "epoch": 0.18689,
+      "grad_norm": 0.6806061863899231,
+      "learning_rate": 0.003,
+      "loss": 3.967,
+      "step": 18689
+    },
+    {
+      "epoch": 0.1869,
+      "grad_norm": 0.8180581331253052,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 18690
+    },
+    {
+      "epoch": 0.18691,
+      "grad_norm": 1.1742573976516724,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 18691
+    },
+    {
+      "epoch": 0.18692,
+      "grad_norm": 1.1151024103164673,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 18692
+    },
+    {
+      "epoch": 0.18693,
+      "grad_norm": 0.6225581169128418,
+      "learning_rate": 0.003,
+      "loss": 3.9918,
+      "step": 18693
+    },
+    {
+      "epoch": 0.18694,
+      "grad_norm": 0.5789169073104858,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 18694
+    },
+    {
+      "epoch": 0.18695,
+      "grad_norm": 0.8254528045654297,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 18695
+    },
+    {
+      "epoch": 0.18696,
+      "grad_norm": 1.1789621114730835,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 18696
+    },
+    {
+      "epoch": 0.18697,
+      "grad_norm": 1.0584477186203003,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 18697
+    },
+    {
+      "epoch": 0.18698,
+      "grad_norm": 0.8069787621498108,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 18698
+    },
+    {
+      "epoch": 0.18699,
+      "grad_norm": 0.7804241180419922,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 18699
+    },
+    {
+      "epoch": 0.187,
+      "grad_norm": 0.7091956734657288,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 18700
+    },
+    {
+      "epoch": 0.18701,
+      "grad_norm": 0.6648358106613159,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 18701
+    },
+    {
+      "epoch": 0.18702,
+      "grad_norm": 0.6405989527702332,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 18702
+    },
+    {
+      "epoch": 0.18703,
+      "grad_norm": 0.6682977080345154,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 18703
+    },
+    {
+      "epoch": 0.18704,
+      "grad_norm": 0.7034135460853577,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 18704
+    },
+    {
+      "epoch": 0.18705,
+      "grad_norm": 0.7313877940177917,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 18705
+    },
+    {
+      "epoch": 0.18706,
+      "grad_norm": 0.7735674977302551,
+      "learning_rate": 0.003,
+      "loss": 3.9773,
+      "step": 18706
+    },
+    {
+      "epoch": 0.18707,
+      "grad_norm": 0.8979125618934631,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 18707
+    },
+    {
+      "epoch": 0.18708,
+      "grad_norm": 0.9722280502319336,
+      "learning_rate": 0.003,
+      "loss": 3.9686,
+      "step": 18708
+    },
+    {
+      "epoch": 0.18709,
+      "grad_norm": 1.004054307937622,
+      "learning_rate": 0.003,
+      "loss": 4.0028,
+      "step": 18709
+    },
+    {
+      "epoch": 0.1871,
+      "grad_norm": 0.8962096571922302,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 18710
+    },
+    {
+      "epoch": 0.18711,
+      "grad_norm": 0.7256183624267578,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 18711
+    },
+    {
+      "epoch": 0.18712,
+      "grad_norm": 0.7422738075256348,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 18712
+    },
+    {
+      "epoch": 0.18713,
+      "grad_norm": 0.7318476438522339,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 18713
+    },
+    {
+      "epoch": 0.18714,
+      "grad_norm": 0.9459980726242065,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 18714
+    },
+    {
+      "epoch": 0.18715,
+      "grad_norm": 1.1251165866851807,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 18715
+    },
+    {
+      "epoch": 0.18716,
+      "grad_norm": 0.9163543581962585,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 18716
+    },
+    {
+      "epoch": 0.18717,
+      "grad_norm": 1.009063720703125,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 18717
+    },
+    {
+      "epoch": 0.18718,
+      "grad_norm": 1.2291123867034912,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 18718
+    },
+    {
+      "epoch": 0.18719,
+      "grad_norm": 0.8537262678146362,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 18719
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.675536036491394,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 18720
+    },
+    {
+      "epoch": 0.18721,
+      "grad_norm": 0.6935260891914368,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 18721
+    },
+    {
+      "epoch": 0.18722,
+      "grad_norm": 0.7502979040145874,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 18722
+    },
+    {
+      "epoch": 0.18723,
+      "grad_norm": 0.8143985867500305,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 18723
+    },
+    {
+      "epoch": 0.18724,
+      "grad_norm": 0.8948759436607361,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 18724
+    },
+    {
+      "epoch": 0.18725,
+      "grad_norm": 0.8538362979888916,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 18725
+    },
+    {
+      "epoch": 0.18726,
+      "grad_norm": 1.0155415534973145,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 18726
+    },
+    {
+      "epoch": 0.18727,
+      "grad_norm": 1.1329224109649658,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 18727
+    },
+    {
+      "epoch": 0.18728,
+      "grad_norm": 0.8613154292106628,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 18728
+    },
+    {
+      "epoch": 0.18729,
+      "grad_norm": 0.8102912306785583,
+      "learning_rate": 0.003,
+      "loss": 4.0431,
+      "step": 18729
+    },
+    {
+      "epoch": 0.1873,
+      "grad_norm": 0.720447301864624,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 18730
+    },
+    {
+      "epoch": 0.18731,
+      "grad_norm": 0.7747088670730591,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 18731
+    },
+    {
+      "epoch": 0.18732,
+      "grad_norm": 0.8366368412971497,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 18732
+    },
+    {
+      "epoch": 0.18733,
+      "grad_norm": 1.0016186237335205,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 18733
+    },
+    {
+      "epoch": 0.18734,
+      "grad_norm": 1.1299272775650024,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 18734
+    },
+    {
+      "epoch": 0.18735,
+      "grad_norm": 0.7979495525360107,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 18735
+    },
+    {
+      "epoch": 0.18736,
+      "grad_norm": 0.7674964666366577,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 18736
+    },
+    {
+      "epoch": 0.18737,
+      "grad_norm": 0.8326266407966614,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 18737
+    },
+    {
+      "epoch": 0.18738,
+      "grad_norm": 1.0282922983169556,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 18738
+    },
+    {
+      "epoch": 0.18739,
+      "grad_norm": 1.325468897819519,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 18739
+    },
+    {
+      "epoch": 0.1874,
+      "grad_norm": 0.7673645615577698,
+      "learning_rate": 0.003,
+      "loss": 3.9694,
+      "step": 18740
+    },
+    {
+      "epoch": 0.18741,
+      "grad_norm": 0.6764647960662842,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 18741
+    },
+    {
+      "epoch": 0.18742,
+      "grad_norm": 0.8184046149253845,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 18742
+    },
+    {
+      "epoch": 0.18743,
+      "grad_norm": 0.7239495515823364,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 18743
+    },
+    {
+      "epoch": 0.18744,
+      "grad_norm": 0.6987858414649963,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 18744
+    },
+    {
+      "epoch": 0.18745,
+      "grad_norm": 0.8750888109207153,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 18745
+    },
+    {
+      "epoch": 0.18746,
+      "grad_norm": 0.8903581500053406,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 18746
+    },
+    {
+      "epoch": 0.18747,
+      "grad_norm": 0.8389039635658264,
+      "learning_rate": 0.003,
+      "loss": 4.028,
+      "step": 18747
+    },
+    {
+      "epoch": 0.18748,
+      "grad_norm": 0.8459964394569397,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 18748
+    },
+    {
+      "epoch": 0.18749,
+      "grad_norm": 0.8524193167686462,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 18749
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.8610021471977234,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 18750
+    },
+    {
+      "epoch": 0.18751,
+      "grad_norm": 0.9103726744651794,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 18751
+    },
+    {
+      "epoch": 0.18752,
+      "grad_norm": 0.8229556679725647,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 18752
+    },
+    {
+      "epoch": 0.18753,
+      "grad_norm": 0.8255165219306946,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 18753
+    },
+    {
+      "epoch": 0.18754,
+      "grad_norm": 0.7489179372787476,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 18754
+    },
+    {
+      "epoch": 0.18755,
+      "grad_norm": 0.8602535128593445,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 18755
+    },
+    {
+      "epoch": 0.18756,
+      "grad_norm": 0.8368017673492432,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 18756
+    },
+    {
+      "epoch": 0.18757,
+      "grad_norm": 0.7005070447921753,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 18757
+    },
+    {
+      "epoch": 0.18758,
+      "grad_norm": 0.7822446823120117,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 18758
+    },
+    {
+      "epoch": 0.18759,
+      "grad_norm": 0.775924026966095,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 18759
+    },
+    {
+      "epoch": 0.1876,
+      "grad_norm": 0.7569155693054199,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 18760
+    },
+    {
+      "epoch": 0.18761,
+      "grad_norm": 0.7290402054786682,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 18761
+    },
+    {
+      "epoch": 0.18762,
+      "grad_norm": 0.7267779111862183,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 18762
+    },
+    {
+      "epoch": 0.18763,
+      "grad_norm": 0.6764037013053894,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 18763
+    },
+    {
+      "epoch": 0.18764,
+      "grad_norm": 0.7469682693481445,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 18764
+    },
+    {
+      "epoch": 0.18765,
+      "grad_norm": 0.9405340552330017,
+      "learning_rate": 0.003,
+      "loss": 3.9825,
+      "step": 18765
+    },
+    {
+      "epoch": 0.18766,
+      "grad_norm": 1.1264852285385132,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 18766
+    },
+    {
+      "epoch": 0.18767,
+      "grad_norm": 1.2135547399520874,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 18767
+    },
+    {
+      "epoch": 0.18768,
+      "grad_norm": 0.7948119044303894,
+      "learning_rate": 0.003,
+      "loss": 3.9948,
+      "step": 18768
+    },
+    {
+      "epoch": 0.18769,
+      "grad_norm": 0.8168572783470154,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 18769
+    },
+    {
+      "epoch": 0.1877,
+      "grad_norm": 0.8725208044052124,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 18770
+    },
+    {
+      "epoch": 0.18771,
+      "grad_norm": 0.886518120765686,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 18771
+    },
+    {
+      "epoch": 0.18772,
+      "grad_norm": 0.8297489881515503,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 18772
+    },
+    {
+      "epoch": 0.18773,
+      "grad_norm": 0.7882468700408936,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 18773
+    },
+    {
+      "epoch": 0.18774,
+      "grad_norm": 0.7615990042686462,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 18774
+    },
+    {
+      "epoch": 0.18775,
+      "grad_norm": 0.7042850255966187,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 18775
+    },
+    {
+      "epoch": 0.18776,
+      "grad_norm": 0.6992802619934082,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 18776
+    },
+    {
+      "epoch": 0.18777,
+      "grad_norm": 0.7362062931060791,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 18777
+    },
+    {
+      "epoch": 0.18778,
+      "grad_norm": 0.991096556186676,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 18778
+    },
+    {
+      "epoch": 0.18779,
+      "grad_norm": 1.3604698181152344,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 18779
+    },
+    {
+      "epoch": 0.1878,
+      "grad_norm": 0.6573266386985779,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 18780
+    },
+    {
+      "epoch": 0.18781,
+      "grad_norm": 0.6684979796409607,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 18781
+    },
+    {
+      "epoch": 0.18782,
+      "grad_norm": 0.6320002675056458,
+      "learning_rate": 0.003,
+      "loss": 3.9605,
+      "step": 18782
+    },
+    {
+      "epoch": 0.18783,
+      "grad_norm": 0.675640881061554,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 18783
+    },
+    {
+      "epoch": 0.18784,
+      "grad_norm": 0.670727014541626,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 18784
+    },
+    {
+      "epoch": 0.18785,
+      "grad_norm": 0.6947329640388489,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 18785
+    },
+    {
+      "epoch": 0.18786,
+      "grad_norm": 0.813109815120697,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 18786
+    },
+    {
+      "epoch": 0.18787,
+      "grad_norm": 0.9462290406227112,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 18787
+    },
+    {
+      "epoch": 0.18788,
+      "grad_norm": 1.2447432279586792,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 18788
+    },
+    {
+      "epoch": 0.18789,
+      "grad_norm": 0.8113349676132202,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 18789
+    },
+    {
+      "epoch": 0.1879,
+      "grad_norm": 0.6997334361076355,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 18790
+    },
+    {
+      "epoch": 0.18791,
+      "grad_norm": 0.7395490407943726,
+      "learning_rate": 0.003,
+      "loss": 3.9763,
+      "step": 18791
+    },
+    {
+      "epoch": 0.18792,
+      "grad_norm": 0.653052031993866,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 18792
+    },
+    {
+      "epoch": 0.18793,
+      "grad_norm": 0.6461042761802673,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 18793
+    },
+    {
+      "epoch": 0.18794,
+      "grad_norm": 0.8122109770774841,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 18794
+    },
+    {
+      "epoch": 0.18795,
+      "grad_norm": 1.1163297891616821,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 18795
+    },
+    {
+      "epoch": 0.18796,
+      "grad_norm": 1.1696635484695435,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 18796
+    },
+    {
+      "epoch": 0.18797,
+      "grad_norm": 0.8114674091339111,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 18797
+    },
+    {
+      "epoch": 0.18798,
+      "grad_norm": 0.7650184035301208,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 18798
+    },
+    {
+      "epoch": 0.18799,
+      "grad_norm": 0.8254711031913757,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 18799
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.9358893632888794,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 18800
+    },
+    {
+      "epoch": 0.18801,
+      "grad_norm": 1.0084511041641235,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 18801
+    },
+    {
+      "epoch": 0.18802,
+      "grad_norm": 0.9665206670761108,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 18802
+    },
+    {
+      "epoch": 0.18803,
+      "grad_norm": 0.9788021445274353,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 18803
+    },
+    {
+      "epoch": 0.18804,
+      "grad_norm": 0.9931485652923584,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 18804
+    },
+    {
+      "epoch": 0.18805,
+      "grad_norm": 1.2854509353637695,
+      "learning_rate": 0.003,
+      "loss": 4.047,
+      "step": 18805
+    },
+    {
+      "epoch": 0.18806,
+      "grad_norm": 0.9651660919189453,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 18806
+    },
+    {
+      "epoch": 0.18807,
+      "grad_norm": 1.035075068473816,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 18807
+    },
+    {
+      "epoch": 0.18808,
+      "grad_norm": 1.0494685173034668,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 18808
+    },
+    {
+      "epoch": 0.18809,
+      "grad_norm": 0.9226809144020081,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 18809
+    },
+    {
+      "epoch": 0.1881,
+      "grad_norm": 0.9085474014282227,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 18810
+    },
+    {
+      "epoch": 0.18811,
+      "grad_norm": 0.9232672452926636,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 18811
+    },
+    {
+      "epoch": 0.18812,
+      "grad_norm": 0.8821086883544922,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 18812
+    },
+    {
+      "epoch": 0.18813,
+      "grad_norm": 0.783420205116272,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 18813
+    },
+    {
+      "epoch": 0.18814,
+      "grad_norm": 0.7733287215232849,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 18814
+    },
+    {
+      "epoch": 0.18815,
+      "grad_norm": 0.6677088737487793,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 18815
+    },
+    {
+      "epoch": 0.18816,
+      "grad_norm": 0.6723862886428833,
+      "learning_rate": 0.003,
+      "loss": 3.9751,
+      "step": 18816
+    },
+    {
+      "epoch": 0.18817,
+      "grad_norm": 0.9003461003303528,
+      "learning_rate": 0.003,
+      "loss": 4.0427,
+      "step": 18817
+    },
+    {
+      "epoch": 0.18818,
+      "grad_norm": 1.1270079612731934,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 18818
+    },
+    {
+      "epoch": 0.18819,
+      "grad_norm": 0.8880910873413086,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 18819
+    },
+    {
+      "epoch": 0.1882,
+      "grad_norm": 0.7390077710151672,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 18820
+    },
+    {
+      "epoch": 0.18821,
+      "grad_norm": 0.5664216876029968,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 18821
+    },
+    {
+      "epoch": 0.18822,
+      "grad_norm": 0.5701670050621033,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 18822
+    },
+    {
+      "epoch": 0.18823,
+      "grad_norm": 0.5538231134414673,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 18823
+    },
+    {
+      "epoch": 0.18824,
+      "grad_norm": 0.5930655598640442,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 18824
+    },
+    {
+      "epoch": 0.18825,
+      "grad_norm": 0.6896167397499084,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 18825
+    },
+    {
+      "epoch": 0.18826,
+      "grad_norm": 0.7935672998428345,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 18826
+    },
+    {
+      "epoch": 0.18827,
+      "grad_norm": 0.7783662676811218,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 18827
+    },
+    {
+      "epoch": 0.18828,
+      "grad_norm": 0.838836133480072,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 18828
+    },
+    {
+      "epoch": 0.18829,
+      "grad_norm": 1.024140477180481,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 18829
+    },
+    {
+      "epoch": 0.1883,
+      "grad_norm": 1.1629550457000732,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18830
+    },
+    {
+      "epoch": 0.18831,
+      "grad_norm": 0.7892631888389587,
+      "learning_rate": 0.003,
+      "loss": 3.975,
+      "step": 18831
+    },
+    {
+      "epoch": 0.18832,
+      "grad_norm": 0.7690302729606628,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 18832
+    },
+    {
+      "epoch": 0.18833,
+      "grad_norm": 0.8231732845306396,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 18833
+    },
+    {
+      "epoch": 0.18834,
+      "grad_norm": 0.766808271408081,
+      "learning_rate": 0.003,
+      "loss": 4.0178,
+      "step": 18834
+    },
+    {
+      "epoch": 0.18835,
+      "grad_norm": 0.8518886566162109,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 18835
+    },
+    {
+      "epoch": 0.18836,
+      "grad_norm": 0.8174446821212769,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 18836
+    },
+    {
+      "epoch": 0.18837,
+      "grad_norm": 0.7299855351448059,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 18837
+    },
+    {
+      "epoch": 0.18838,
+      "grad_norm": 0.8137608766555786,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 18838
+    },
+    {
+      "epoch": 0.18839,
+      "grad_norm": 0.8550666570663452,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 18839
+    },
+    {
+      "epoch": 0.1884,
+      "grad_norm": 0.9712225198745728,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 18840
+    },
+    {
+      "epoch": 0.18841,
+      "grad_norm": 1.0288853645324707,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 18841
+    },
+    {
+      "epoch": 0.18842,
+      "grad_norm": 1.2778677940368652,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 18842
+    },
+    {
+      "epoch": 0.18843,
+      "grad_norm": 0.6840931177139282,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 18843
+    },
+    {
+      "epoch": 0.18844,
+      "grad_norm": 0.6637877225875854,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 18844
+    },
+    {
+      "epoch": 0.18845,
+      "grad_norm": 0.6504637002944946,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 18845
+    },
+    {
+      "epoch": 0.18846,
+      "grad_norm": 0.726256787776947,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 18846
+    },
+    {
+      "epoch": 0.18847,
+      "grad_norm": 0.9666273593902588,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 18847
+    },
+    {
+      "epoch": 0.18848,
+      "grad_norm": 0.9877365827560425,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 18848
+    },
+    {
+      "epoch": 0.18849,
+      "grad_norm": 0.9723947048187256,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 18849
+    },
+    {
+      "epoch": 0.1885,
+      "grad_norm": 1.0214213132858276,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 18850
+    },
+    {
+      "epoch": 0.18851,
+      "grad_norm": 0.7957757711410522,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 18851
+    },
+    {
+      "epoch": 0.18852,
+      "grad_norm": 0.7545115947723389,
+      "learning_rate": 0.003,
+      "loss": 4.0389,
+      "step": 18852
+    },
+    {
+      "epoch": 0.18853,
+      "grad_norm": 0.743475615978241,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 18853
+    },
+    {
+      "epoch": 0.18854,
+      "grad_norm": 0.7947479486465454,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 18854
+    },
+    {
+      "epoch": 0.18855,
+      "grad_norm": 0.7675407528877258,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 18855
+    },
+    {
+      "epoch": 0.18856,
+      "grad_norm": 0.8497804999351501,
+      "learning_rate": 0.003,
+      "loss": 3.9778,
+      "step": 18856
+    },
+    {
+      "epoch": 0.18857,
+      "grad_norm": 1.2066291570663452,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 18857
+    },
+    {
+      "epoch": 0.18858,
+      "grad_norm": 1.2024381160736084,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 18858
+    },
+    {
+      "epoch": 0.18859,
+      "grad_norm": 0.6818134188652039,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 18859
+    },
+    {
+      "epoch": 0.1886,
+      "grad_norm": 0.6120265126228333,
+      "learning_rate": 0.003,
+      "loss": 4.0236,
+      "step": 18860
+    },
+    {
+      "epoch": 0.18861,
+      "grad_norm": 0.7907565236091614,
+      "learning_rate": 0.003,
+      "loss": 4.0226,
+      "step": 18861
+    },
+    {
+      "epoch": 0.18862,
+      "grad_norm": 1.1322532892227173,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 18862
+    },
+    {
+      "epoch": 0.18863,
+      "grad_norm": 1.09321129322052,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 18863
+    },
+    {
+      "epoch": 0.18864,
+      "grad_norm": 0.7416861057281494,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 18864
+    },
+    {
+      "epoch": 0.18865,
+      "grad_norm": 0.6728402376174927,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 18865
+    },
+    {
+      "epoch": 0.18866,
+      "grad_norm": 0.7465229630470276,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 18866
+    },
+    {
+      "epoch": 0.18867,
+      "grad_norm": 0.8051891922950745,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 18867
+    },
+    {
+      "epoch": 0.18868,
+      "grad_norm": 0.8132879137992859,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 18868
+    },
+    {
+      "epoch": 0.18869,
+      "grad_norm": 0.7474648356437683,
+      "learning_rate": 0.003,
+      "loss": 4.0091,
+      "step": 18869
+    },
+    {
+      "epoch": 0.1887,
+      "grad_norm": 0.7271384000778198,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 18870
+    },
+    {
+      "epoch": 0.18871,
+      "grad_norm": 0.7931719422340393,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 18871
+    },
+    {
+      "epoch": 0.18872,
+      "grad_norm": 0.7534982562065125,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 18872
+    },
+    {
+      "epoch": 0.18873,
+      "grad_norm": 0.7345370054244995,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 18873
+    },
+    {
+      "epoch": 0.18874,
+      "grad_norm": 0.7610151767730713,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 18874
+    },
+    {
+      "epoch": 0.18875,
+      "grad_norm": 0.7372151613235474,
+      "learning_rate": 0.003,
+      "loss": 4.0361,
+      "step": 18875
+    },
+    {
+      "epoch": 0.18876,
+      "grad_norm": 0.8092154264450073,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 18876
+    },
+    {
+      "epoch": 0.18877,
+      "grad_norm": 1.111495852470398,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 18877
+    },
+    {
+      "epoch": 0.18878,
+      "grad_norm": 1.1198842525482178,
+      "learning_rate": 0.003,
+      "loss": 3.983,
+      "step": 18878
+    },
+    {
+      "epoch": 0.18879,
+      "grad_norm": 0.9886401295661926,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 18879
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 1.0660293102264404,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 18880
+    },
+    {
+      "epoch": 0.18881,
+      "grad_norm": 0.9938003420829773,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 18881
+    },
+    {
+      "epoch": 0.18882,
+      "grad_norm": 0.8295101523399353,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 18882
+    },
+    {
+      "epoch": 0.18883,
+      "grad_norm": 0.7040286064147949,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 18883
+    },
+    {
+      "epoch": 0.18884,
+      "grad_norm": 0.7084823250770569,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 18884
+    },
+    {
+      "epoch": 0.18885,
+      "grad_norm": 0.7420821189880371,
+      "learning_rate": 0.003,
+      "loss": 3.9753,
+      "step": 18885
+    },
+    {
+      "epoch": 0.18886,
+      "grad_norm": 0.7304326891899109,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 18886
+    },
+    {
+      "epoch": 0.18887,
+      "grad_norm": 0.8151340484619141,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 18887
+    },
+    {
+      "epoch": 0.18888,
+      "grad_norm": 1.025665283203125,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 18888
+    },
+    {
+      "epoch": 0.18889,
+      "grad_norm": 1.026984691619873,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 18889
+    },
+    {
+      "epoch": 0.1889,
+      "grad_norm": 0.9665703177452087,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 18890
+    },
+    {
+      "epoch": 0.18891,
+      "grad_norm": 0.8499642014503479,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 18891
+    },
+    {
+      "epoch": 0.18892,
+      "grad_norm": 0.7022868990898132,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 18892
+    },
+    {
+      "epoch": 0.18893,
+      "grad_norm": 0.7038338780403137,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 18893
+    },
+    {
+      "epoch": 0.18894,
+      "grad_norm": 0.7314153909683228,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 18894
+    },
+    {
+      "epoch": 0.18895,
+      "grad_norm": 0.7689028382301331,
+      "learning_rate": 0.003,
+      "loss": 3.9814,
+      "step": 18895
+    },
+    {
+      "epoch": 0.18896,
+      "grad_norm": 0.6190382838249207,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 18896
+    },
+    {
+      "epoch": 0.18897,
+      "grad_norm": 0.6298418045043945,
+      "learning_rate": 0.003,
+      "loss": 3.977,
+      "step": 18897
+    },
+    {
+      "epoch": 0.18898,
+      "grad_norm": 0.6815434098243713,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 18898
+    },
+    {
+      "epoch": 0.18899,
+      "grad_norm": 0.7141243815422058,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 18899
+    },
+    {
+      "epoch": 0.189,
+      "grad_norm": 0.9865851402282715,
+      "learning_rate": 0.003,
+      "loss": 3.978,
+      "step": 18900
+    },
+    {
+      "epoch": 0.18901,
+      "grad_norm": 1.3485716581344604,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 18901
+    },
+    {
+      "epoch": 0.18902,
+      "grad_norm": 0.6510710120201111,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 18902
+    },
+    {
+      "epoch": 0.18903,
+      "grad_norm": 0.6483829021453857,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 18903
+    },
+    {
+      "epoch": 0.18904,
+      "grad_norm": 0.7830336689949036,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 18904
+    },
+    {
+      "epoch": 0.18905,
+      "grad_norm": 1.0328387022018433,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 18905
+    },
+    {
+      "epoch": 0.18906,
+      "grad_norm": 1.0227880477905273,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 18906
+    },
+    {
+      "epoch": 0.18907,
+      "grad_norm": 0.9766803979873657,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 18907
+    },
+    {
+      "epoch": 0.18908,
+      "grad_norm": 0.9505747556686401,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 18908
+    },
+    {
+      "epoch": 0.18909,
+      "grad_norm": 0.9789347052574158,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 18909
+    },
+    {
+      "epoch": 0.1891,
+      "grad_norm": 1.2579947710037231,
+      "learning_rate": 0.003,
+      "loss": 4.0377,
+      "step": 18910
+    },
+    {
+      "epoch": 0.18911,
+      "grad_norm": 0.6997345685958862,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 18911
+    },
+    {
+      "epoch": 0.18912,
+      "grad_norm": 0.741587221622467,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 18912
+    },
+    {
+      "epoch": 0.18913,
+      "grad_norm": 0.8010073900222778,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 18913
+    },
+    {
+      "epoch": 0.18914,
+      "grad_norm": 0.8004756569862366,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 18914
+    },
+    {
+      "epoch": 0.18915,
+      "grad_norm": 0.8011345863342285,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 18915
+    },
+    {
+      "epoch": 0.18916,
+      "grad_norm": 0.7441811561584473,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 18916
+    },
+    {
+      "epoch": 0.18917,
+      "grad_norm": 0.6722574234008789,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 18917
+    },
+    {
+      "epoch": 0.18918,
+      "grad_norm": 0.702165961265564,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 18918
+    },
+    {
+      "epoch": 0.18919,
+      "grad_norm": 0.7118338346481323,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 18919
+    },
+    {
+      "epoch": 0.1892,
+      "grad_norm": 0.6990228295326233,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 18920
+    },
+    {
+      "epoch": 0.18921,
+      "grad_norm": 0.8419510722160339,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 18921
+    },
+    {
+      "epoch": 0.18922,
+      "grad_norm": 1.246588110923767,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 18922
+    },
+    {
+      "epoch": 0.18923,
+      "grad_norm": 0.9596294164657593,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 18923
+    },
+    {
+      "epoch": 0.18924,
+      "grad_norm": 0.8327478766441345,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18924
+    },
+    {
+      "epoch": 0.18925,
+      "grad_norm": 0.856834352016449,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 18925
+    },
+    {
+      "epoch": 0.18926,
+      "grad_norm": 0.8613009452819824,
+      "learning_rate": 0.003,
+      "loss": 4.038,
+      "step": 18926
+    },
+    {
+      "epoch": 0.18927,
+      "grad_norm": 0.861362874507904,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 18927
+    },
+    {
+      "epoch": 0.18928,
+      "grad_norm": 0.8827495574951172,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 18928
+    },
+    {
+      "epoch": 0.18929,
+      "grad_norm": 0.9191708564758301,
+      "learning_rate": 0.003,
+      "loss": 4.0219,
+      "step": 18929
+    },
+    {
+      "epoch": 0.1893,
+      "grad_norm": 1.1141074895858765,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 18930
+    },
+    {
+      "epoch": 0.18931,
+      "grad_norm": 0.9931434392929077,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 18931
+    },
+    {
+      "epoch": 0.18932,
+      "grad_norm": 0.8666828274726868,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 18932
+    },
+    {
+      "epoch": 0.18933,
+      "grad_norm": 0.7398908138275146,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 18933
+    },
+    {
+      "epoch": 0.18934,
+      "grad_norm": 0.6053960919380188,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 18934
+    },
+    {
+      "epoch": 0.18935,
+      "grad_norm": 0.5848657488822937,
+      "learning_rate": 0.003,
+      "loss": 3.9818,
+      "step": 18935
+    },
+    {
+      "epoch": 0.18936,
+      "grad_norm": 0.6164701581001282,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 18936
+    },
+    {
+      "epoch": 0.18937,
+      "grad_norm": 0.6691602468490601,
+      "learning_rate": 0.003,
+      "loss": 3.9741,
+      "step": 18937
+    },
+    {
+      "epoch": 0.18938,
+      "grad_norm": 0.7680910229682922,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 18938
+    },
+    {
+      "epoch": 0.18939,
+      "grad_norm": 0.906510055065155,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 18939
+    },
+    {
+      "epoch": 0.1894,
+      "grad_norm": 1.1088699102401733,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 18940
+    },
+    {
+      "epoch": 0.18941,
+      "grad_norm": 0.809749960899353,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 18941
+    },
+    {
+      "epoch": 0.18942,
+      "grad_norm": 0.7769731283187866,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 18942
+    },
+    {
+      "epoch": 0.18943,
+      "grad_norm": 0.7073752284049988,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 18943
+    },
+    {
+      "epoch": 0.18944,
+      "grad_norm": 0.6503266096115112,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 18944
+    },
+    {
+      "epoch": 0.18945,
+      "grad_norm": 0.6349859237670898,
+      "learning_rate": 0.003,
+      "loss": 3.9623,
+      "step": 18945
+    },
+    {
+      "epoch": 0.18946,
+      "grad_norm": 0.6565268039703369,
+      "learning_rate": 0.003,
+      "loss": 3.9724,
+      "step": 18946
+    },
+    {
+      "epoch": 0.18947,
+      "grad_norm": 0.689110517501831,
+      "learning_rate": 0.003,
+      "loss": 3.9756,
+      "step": 18947
+    },
+    {
+      "epoch": 0.18948,
+      "grad_norm": 0.6516901850700378,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18948
+    },
+    {
+      "epoch": 0.18949,
+      "grad_norm": 0.6973980665206909,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 18949
+    },
+    {
+      "epoch": 0.1895,
+      "grad_norm": 0.8721265196800232,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 18950
+    },
+    {
+      "epoch": 0.18951,
+      "grad_norm": 1.3417309522628784,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 18951
+    },
+    {
+      "epoch": 0.18952,
+      "grad_norm": 0.8508768677711487,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 18952
+    },
+    {
+      "epoch": 0.18953,
+      "grad_norm": 0.8588517308235168,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 18953
+    },
+    {
+      "epoch": 0.18954,
+      "grad_norm": 0.9401569962501526,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 18954
+    },
+    {
+      "epoch": 0.18955,
+      "grad_norm": 0.9512640833854675,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 18955
+    },
+    {
+      "epoch": 0.18956,
+      "grad_norm": 1.1748946905136108,
+      "learning_rate": 0.003,
+      "loss": 4.0282,
+      "step": 18956
+    },
+    {
+      "epoch": 0.18957,
+      "grad_norm": 0.8970866203308105,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 18957
+    },
+    {
+      "epoch": 0.18958,
+      "grad_norm": 0.970586359500885,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 18958
+    },
+    {
+      "epoch": 0.18959,
+      "grad_norm": 1.0547508001327515,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 18959
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 1.1379255056381226,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 18960
+    },
+    {
+      "epoch": 0.18961,
+      "grad_norm": 1.0756850242614746,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 18961
+    },
+    {
+      "epoch": 0.18962,
+      "grad_norm": 0.9282955527305603,
+      "learning_rate": 0.003,
+      "loss": 4.0391,
+      "step": 18962
+    },
+    {
+      "epoch": 0.18963,
+      "grad_norm": 0.9438974261283875,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 18963
+    },
+    {
+      "epoch": 0.18964,
+      "grad_norm": 0.9445635080337524,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 18964
+    },
+    {
+      "epoch": 0.18965,
+      "grad_norm": 1.079493761062622,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 18965
+    },
+    {
+      "epoch": 0.18966,
+      "grad_norm": 1.120069146156311,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 18966
+    },
+    {
+      "epoch": 0.18967,
+      "grad_norm": 0.7256782054901123,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 18967
+    },
+    {
+      "epoch": 0.18968,
+      "grad_norm": 0.6131082773208618,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 18968
+    },
+    {
+      "epoch": 0.18969,
+      "grad_norm": 0.6501275897026062,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 18969
+    },
+    {
+      "epoch": 0.1897,
+      "grad_norm": 0.6911957859992981,
+      "learning_rate": 0.003,
+      "loss": 3.9772,
+      "step": 18970
+    },
+    {
+      "epoch": 0.18971,
+      "grad_norm": 0.653016984462738,
+      "learning_rate": 0.003,
+      "loss": 3.9767,
+      "step": 18971
+    },
+    {
+      "epoch": 0.18972,
+      "grad_norm": 0.5951827168464661,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 18972
+    },
+    {
+      "epoch": 0.18973,
+      "grad_norm": 0.5641265511512756,
+      "learning_rate": 0.003,
+      "loss": 3.968,
+      "step": 18973
+    },
+    {
+      "epoch": 0.18974,
+      "grad_norm": 0.615556538105011,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 18974
+    },
+    {
+      "epoch": 0.18975,
+      "grad_norm": 0.5846765637397766,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 18975
+    },
+    {
+      "epoch": 0.18976,
+      "grad_norm": 0.642020046710968,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 18976
+    },
+    {
+      "epoch": 0.18977,
+      "grad_norm": 0.6776500344276428,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 18977
+    },
+    {
+      "epoch": 0.18978,
+      "grad_norm": 0.8168145418167114,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 18978
+    },
+    {
+      "epoch": 0.18979,
+      "grad_norm": 1.0492795705795288,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 18979
+    },
+    {
+      "epoch": 0.1898,
+      "grad_norm": 1.2692487239837646,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 18980
+    },
+    {
+      "epoch": 0.18981,
+      "grad_norm": 0.7477732300758362,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 18981
+    },
+    {
+      "epoch": 0.18982,
+      "grad_norm": 0.7049261331558228,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 18982
+    },
+    {
+      "epoch": 0.18983,
+      "grad_norm": 0.8238551020622253,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 18983
+    },
+    {
+      "epoch": 0.18984,
+      "grad_norm": 0.8532552719116211,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 18984
+    },
+    {
+      "epoch": 0.18985,
+      "grad_norm": 0.854641854763031,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 18985
+    },
+    {
+      "epoch": 0.18986,
+      "grad_norm": 0.9322021007537842,
+      "learning_rate": 0.003,
+      "loss": 3.9653,
+      "step": 18986
+    },
+    {
+      "epoch": 0.18987,
+      "grad_norm": 1.0125163793563843,
+      "learning_rate": 0.003,
+      "loss": 3.9734,
+      "step": 18987
+    },
+    {
+      "epoch": 0.18988,
+      "grad_norm": 1.0155051946640015,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 18988
+    },
+    {
+      "epoch": 0.18989,
+      "grad_norm": 0.958838939666748,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 18989
+    },
+    {
+      "epoch": 0.1899,
+      "grad_norm": 1.0734632015228271,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 18990
+    },
+    {
+      "epoch": 0.18991,
+      "grad_norm": 0.9874166250228882,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 18991
+    },
+    {
+      "epoch": 0.18992,
+      "grad_norm": 1.1062909364700317,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 18992
+    },
+    {
+      "epoch": 0.18993,
+      "grad_norm": 0.9920216798782349,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 18993
+    },
+    {
+      "epoch": 0.18994,
+      "grad_norm": 0.9129654765129089,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 18994
+    },
+    {
+      "epoch": 0.18995,
+      "grad_norm": 0.9695618748664856,
+      "learning_rate": 0.003,
+      "loss": 3.9521,
+      "step": 18995
+    },
+    {
+      "epoch": 0.18996,
+      "grad_norm": 1.1071839332580566,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 18996
+    },
+    {
+      "epoch": 0.18997,
+      "grad_norm": 0.8101666569709778,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 18997
+    },
+    {
+      "epoch": 0.18998,
+      "grad_norm": 0.7454321980476379,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 18998
+    },
+    {
+      "epoch": 0.18999,
+      "grad_norm": 0.76347416639328,
+      "learning_rate": 0.003,
+      "loss": 3.9803,
+      "step": 18999
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.676956057548523,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 19000
+    },
+    {
+      "epoch": 0.19001,
+      "grad_norm": 0.5809410214424133,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 19001
+    },
+    {
+      "epoch": 0.19002,
+      "grad_norm": 0.6385517120361328,
+      "learning_rate": 0.003,
+      "loss": 3.9783,
+      "step": 19002
+    },
+    {
+      "epoch": 0.19003,
+      "grad_norm": 0.72422856092453,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 19003
+    },
+    {
+      "epoch": 0.19004,
+      "grad_norm": 0.8340731263160706,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 19004
+    },
+    {
+      "epoch": 0.19005,
+      "grad_norm": 0.9573755860328674,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 19005
+    },
+    {
+      "epoch": 0.19006,
+      "grad_norm": 1.1783071756362915,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 19006
+    },
+    {
+      "epoch": 0.19007,
+      "grad_norm": 0.8298691511154175,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 19007
+    },
+    {
+      "epoch": 0.19008,
+      "grad_norm": 0.6903021931648254,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 19008
+    },
+    {
+      "epoch": 0.19009,
+      "grad_norm": 0.6378114223480225,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 19009
+    },
+    {
+      "epoch": 0.1901,
+      "grad_norm": 0.6948537230491638,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 19010
+    },
+    {
+      "epoch": 0.19011,
+      "grad_norm": 0.8318427801132202,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 19011
+    },
+    {
+      "epoch": 0.19012,
+      "grad_norm": 1.0148303508758545,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 19012
+    },
+    {
+      "epoch": 0.19013,
+      "grad_norm": 1.0587249994277954,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 19013
+    },
+    {
+      "epoch": 0.19014,
+      "grad_norm": 0.9003660082817078,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 19014
+    },
+    {
+      "epoch": 0.19015,
+      "grad_norm": 0.869741678237915,
+      "learning_rate": 0.003,
+      "loss": 3.9635,
+      "step": 19015
+    },
+    {
+      "epoch": 0.19016,
+      "grad_norm": 0.8709609508514404,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 19016
+    },
+    {
+      "epoch": 0.19017,
+      "grad_norm": 0.8974391222000122,
+      "learning_rate": 0.003,
+      "loss": 4.0267,
+      "step": 19017
+    },
+    {
+      "epoch": 0.19018,
+      "grad_norm": 0.9310207962989807,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 19018
+    },
+    {
+      "epoch": 0.19019,
+      "grad_norm": 0.9279577136039734,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 19019
+    },
+    {
+      "epoch": 0.1902,
+      "grad_norm": 1.0117379426956177,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 19020
+    },
+    {
+      "epoch": 0.19021,
+      "grad_norm": 1.0991427898406982,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 19021
+    },
+    {
+      "epoch": 0.19022,
+      "grad_norm": 0.9016971588134766,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 19022
+    },
+    {
+      "epoch": 0.19023,
+      "grad_norm": 0.9247835278511047,
+      "learning_rate": 0.003,
+      "loss": 4.0556,
+      "step": 19023
+    },
+    {
+      "epoch": 0.19024,
+      "grad_norm": 1.007381558418274,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 19024
+    },
+    {
+      "epoch": 0.19025,
+      "grad_norm": 1.206356167793274,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 19025
+    },
+    {
+      "epoch": 0.19026,
+      "grad_norm": 0.7278400659561157,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 19026
+    },
+    {
+      "epoch": 0.19027,
+      "grad_norm": 0.6956689953804016,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 19027
+    },
+    {
+      "epoch": 0.19028,
+      "grad_norm": 0.6170609593391418,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 19028
+    },
+    {
+      "epoch": 0.19029,
+      "grad_norm": 0.5841855406761169,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 19029
+    },
+    {
+      "epoch": 0.1903,
+      "grad_norm": 0.615669846534729,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 19030
+    },
+    {
+      "epoch": 0.19031,
+      "grad_norm": 0.7128541469573975,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 19031
+    },
+    {
+      "epoch": 0.19032,
+      "grad_norm": 0.9266338348388672,
+      "learning_rate": 0.003,
+      "loss": 4.0,
+      "step": 19032
+    },
+    {
+      "epoch": 0.19033,
+      "grad_norm": 1.13957679271698,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 19033
+    },
+    {
+      "epoch": 0.19034,
+      "grad_norm": 0.836190938949585,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 19034
+    },
+    {
+      "epoch": 0.19035,
+      "grad_norm": 0.6466774344444275,
+      "learning_rate": 0.003,
+      "loss": 3.9796,
+      "step": 19035
+    },
+    {
+      "epoch": 0.19036,
+      "grad_norm": 0.5852211713790894,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 19036
+    },
+    {
+      "epoch": 0.19037,
+      "grad_norm": 0.7002070546150208,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 19037
+    },
+    {
+      "epoch": 0.19038,
+      "grad_norm": 0.7181115746498108,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 19038
+    },
+    {
+      "epoch": 0.19039,
+      "grad_norm": 0.696847140789032,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 19039
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.7739638090133667,
+      "learning_rate": 0.003,
+      "loss": 3.9739,
+      "step": 19040
+    },
+    {
+      "epoch": 0.19041,
+      "grad_norm": 0.8265454173088074,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 19041
+    },
+    {
+      "epoch": 0.19042,
+      "grad_norm": 0.9292768836021423,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 19042
+    },
+    {
+      "epoch": 0.19043,
+      "grad_norm": 1.05363130569458,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 19043
+    },
+    {
+      "epoch": 0.19044,
+      "grad_norm": 0.8629478216171265,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 19044
+    },
+    {
+      "epoch": 0.19045,
+      "grad_norm": 0.7886390686035156,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 19045
+    },
+    {
+      "epoch": 0.19046,
+      "grad_norm": 0.8244084715843201,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 19046
+    },
+    {
+      "epoch": 0.19047,
+      "grad_norm": 0.8120138049125671,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 19047
+    },
+    {
+      "epoch": 0.19048,
+      "grad_norm": 0.8467174768447876,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 19048
+    },
+    {
+      "epoch": 0.19049,
+      "grad_norm": 0.9316357374191284,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 19049
+    },
+    {
+      "epoch": 0.1905,
+      "grad_norm": 0.9798911809921265,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 19050
+    },
+    {
+      "epoch": 0.19051,
+      "grad_norm": 0.8504247665405273,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 19051
+    },
+    {
+      "epoch": 0.19052,
+      "grad_norm": 0.748024046421051,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 19052
+    },
+    {
+      "epoch": 0.19053,
+      "grad_norm": 0.7716191411018372,
+      "learning_rate": 0.003,
+      "loss": 3.9755,
+      "step": 19053
+    },
+    {
+      "epoch": 0.19054,
+      "grad_norm": 0.7982362508773804,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 19054
+    },
+    {
+      "epoch": 0.19055,
+      "grad_norm": 0.8361937403678894,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 19055
+    },
+    {
+      "epoch": 0.19056,
+      "grad_norm": 1.080262541770935,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 19056
+    },
+    {
+      "epoch": 0.19057,
+      "grad_norm": 1.1439944505691528,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 19057
+    },
+    {
+      "epoch": 0.19058,
+      "grad_norm": 0.9121918678283691,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 19058
+    },
+    {
+      "epoch": 0.19059,
+      "grad_norm": 0.9394298791885376,
+      "learning_rate": 0.003,
+      "loss": 4.0204,
+      "step": 19059
+    },
+    {
+      "epoch": 0.1906,
+      "grad_norm": 0.9266657829284668,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 19060
+    },
+    {
+      "epoch": 0.19061,
+      "grad_norm": 0.881291389465332,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 19061
+    },
+    {
+      "epoch": 0.19062,
+      "grad_norm": 0.8577966690063477,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 19062
+    },
+    {
+      "epoch": 0.19063,
+      "grad_norm": 1.003316879272461,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 19063
+    },
+    {
+      "epoch": 0.19064,
+      "grad_norm": 1.2838621139526367,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 19064
+    },
+    {
+      "epoch": 0.19065,
+      "grad_norm": 0.8092989921569824,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 19065
+    },
+    {
+      "epoch": 0.19066,
+      "grad_norm": 0.6843506097793579,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 19066
+    },
+    {
+      "epoch": 0.19067,
+      "grad_norm": 0.5812391638755798,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 19067
+    },
+    {
+      "epoch": 0.19068,
+      "grad_norm": 0.5177496075630188,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 19068
+    },
+    {
+      "epoch": 0.19069,
+      "grad_norm": 0.49588555097579956,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 19069
+    },
+    {
+      "epoch": 0.1907,
+      "grad_norm": 0.580472469329834,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 19070
+    },
+    {
+      "epoch": 0.19071,
+      "grad_norm": 0.5863806009292603,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 19071
+    },
+    {
+      "epoch": 0.19072,
+      "grad_norm": 0.6271872520446777,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 19072
+    },
+    {
+      "epoch": 0.19073,
+      "grad_norm": 0.7241467237472534,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 19073
+    },
+    {
+      "epoch": 0.19074,
+      "grad_norm": 0.9060819149017334,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 19074
+    },
+    {
+      "epoch": 0.19075,
+      "grad_norm": 1.0471937656402588,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 19075
+    },
+    {
+      "epoch": 0.19076,
+      "grad_norm": 1.0599033832550049,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 19076
+    },
+    {
+      "epoch": 0.19077,
+      "grad_norm": 0.9345073699951172,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 19077
+    },
+    {
+      "epoch": 0.19078,
+      "grad_norm": 0.8938175439834595,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 19078
+    },
+    {
+      "epoch": 0.19079,
+      "grad_norm": 0.9085829257965088,
+      "learning_rate": 0.003,
+      "loss": 4.019,
+      "step": 19079
+    },
+    {
+      "epoch": 0.1908,
+      "grad_norm": 0.8755341172218323,
+      "learning_rate": 0.003,
+      "loss": 3.9762,
+      "step": 19080
+    },
+    {
+      "epoch": 0.19081,
+      "grad_norm": 0.9034981727600098,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 19081
+    },
+    {
+      "epoch": 0.19082,
+      "grad_norm": 1.0624445676803589,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 19082
+    },
+    {
+      "epoch": 0.19083,
+      "grad_norm": 1.2125186920166016,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 19083
+    },
+    {
+      "epoch": 0.19084,
+      "grad_norm": 0.8801583647727966,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 19084
+    },
+    {
+      "epoch": 0.19085,
+      "grad_norm": 0.8632391691207886,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 19085
+    },
+    {
+      "epoch": 0.19086,
+      "grad_norm": 0.7941170930862427,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 19086
+    },
+    {
+      "epoch": 0.19087,
+      "grad_norm": 0.7477037906646729,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 19087
+    },
+    {
+      "epoch": 0.19088,
+      "grad_norm": 0.6981642246246338,
+      "learning_rate": 0.003,
+      "loss": 4.0273,
+      "step": 19088
+    },
+    {
+      "epoch": 0.19089,
+      "grad_norm": 0.7463194131851196,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 19089
+    },
+    {
+      "epoch": 0.1909,
+      "grad_norm": 0.904951810836792,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 19090
+    },
+    {
+      "epoch": 0.19091,
+      "grad_norm": 0.9113186001777649,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 19091
+    },
+    {
+      "epoch": 0.19092,
+      "grad_norm": 0.7952855825424194,
+      "learning_rate": 0.003,
+      "loss": 3.9788,
+      "step": 19092
+    },
+    {
+      "epoch": 0.19093,
+      "grad_norm": 0.8860151171684265,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 19093
+    },
+    {
+      "epoch": 0.19094,
+      "grad_norm": 0.8939390778541565,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 19094
+    },
+    {
+      "epoch": 0.19095,
+      "grad_norm": 0.8042697906494141,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 19095
+    },
+    {
+      "epoch": 0.19096,
+      "grad_norm": 0.8192028999328613,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 19096
+    },
+    {
+      "epoch": 0.19097,
+      "grad_norm": 0.9775270223617554,
+      "learning_rate": 0.003,
+      "loss": 4.0168,
+      "step": 19097
+    },
+    {
+      "epoch": 0.19098,
+      "grad_norm": 1.206665277481079,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 19098
+    },
+    {
+      "epoch": 0.19099,
+      "grad_norm": 0.8887525200843811,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 19099
+    },
+    {
+      "epoch": 0.191,
+      "grad_norm": 0.9043484330177307,
+      "learning_rate": 0.003,
+      "loss": 4.0306,
+      "step": 19100
+    },
+    {
+      "epoch": 0.19101,
+      "grad_norm": 0.9640260338783264,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 19101
+    },
+    {
+      "epoch": 0.19102,
+      "grad_norm": 0.8492767810821533,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 19102
+    },
+    {
+      "epoch": 0.19103,
+      "grad_norm": 0.8072956204414368,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 19103
+    },
+    {
+      "epoch": 0.19104,
+      "grad_norm": 0.8654136657714844,
+      "learning_rate": 0.003,
+      "loss": 4.0216,
+      "step": 19104
+    },
+    {
+      "epoch": 0.19105,
+      "grad_norm": 0.8672546148300171,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 19105
+    },
+    {
+      "epoch": 0.19106,
+      "grad_norm": 0.7615546584129333,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 19106
+    },
+    {
+      "epoch": 0.19107,
+      "grad_norm": 0.8596553802490234,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 19107
+    },
+    {
+      "epoch": 0.19108,
+      "grad_norm": 0.8763645887374878,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 19108
+    },
+    {
+      "epoch": 0.19109,
+      "grad_norm": 0.880420446395874,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 19109
+    },
+    {
+      "epoch": 0.1911,
+      "grad_norm": 0.8416774272918701,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 19110
+    },
+    {
+      "epoch": 0.19111,
+      "grad_norm": 0.7089952826499939,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 19111
+    },
+    {
+      "epoch": 0.19112,
+      "grad_norm": 0.6287502646446228,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 19112
+    },
+    {
+      "epoch": 0.19113,
+      "grad_norm": 0.6182113289833069,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 19113
+    },
+    {
+      "epoch": 0.19114,
+      "grad_norm": 0.7214407324790955,
+      "learning_rate": 0.003,
+      "loss": 3.9783,
+      "step": 19114
+    },
+    {
+      "epoch": 0.19115,
+      "grad_norm": 0.9095062613487244,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 19115
+    },
+    {
+      "epoch": 0.19116,
+      "grad_norm": 1.3402451276779175,
+      "learning_rate": 0.003,
+      "loss": 4.029,
+      "step": 19116
+    },
+    {
+      "epoch": 0.19117,
+      "grad_norm": 0.6122109293937683,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 19117
+    },
+    {
+      "epoch": 0.19118,
+      "grad_norm": 0.6436371803283691,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 19118
+    },
+    {
+      "epoch": 0.19119,
+      "grad_norm": 0.8043946623802185,
+      "learning_rate": 0.003,
+      "loss": 4.0454,
+      "step": 19119
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.8193957209587097,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 19120
+    },
+    {
+      "epoch": 0.19121,
+      "grad_norm": 0.778981626033783,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 19121
+    },
+    {
+      "epoch": 0.19122,
+      "grad_norm": 0.7985784411430359,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 19122
+    },
+    {
+      "epoch": 0.19123,
+      "grad_norm": 0.7375852465629578,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 19123
+    },
+    {
+      "epoch": 0.19124,
+      "grad_norm": 0.8971546292304993,
+      "learning_rate": 0.003,
+      "loss": 4.0341,
+      "step": 19124
+    },
+    {
+      "epoch": 0.19125,
+      "grad_norm": 0.9998382329940796,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 19125
+    },
+    {
+      "epoch": 0.19126,
+      "grad_norm": 0.9625898003578186,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 19126
+    },
+    {
+      "epoch": 0.19127,
+      "grad_norm": 1.0204427242279053,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 19127
+    },
+    {
+      "epoch": 0.19128,
+      "grad_norm": 1.2511576414108276,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 19128
+    },
+    {
+      "epoch": 0.19129,
+      "grad_norm": 0.7735320925712585,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 19129
+    },
+    {
+      "epoch": 0.1913,
+      "grad_norm": 0.6742091178894043,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 19130
+    },
+    {
+      "epoch": 0.19131,
+      "grad_norm": 0.6724696755409241,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 19131
+    },
+    {
+      "epoch": 0.19132,
+      "grad_norm": 0.723955512046814,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 19132
+    },
+    {
+      "epoch": 0.19133,
+      "grad_norm": 0.8885080218315125,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 19133
+    },
+    {
+      "epoch": 0.19134,
+      "grad_norm": 0.9969792366027832,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 19134
+    },
+    {
+      "epoch": 0.19135,
+      "grad_norm": 1.0846734046936035,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 19135
+    },
+    {
+      "epoch": 0.19136,
+      "grad_norm": 0.7409761548042297,
+      "learning_rate": 0.003,
+      "loss": 3.9942,
+      "step": 19136
+    },
+    {
+      "epoch": 0.19137,
+      "grad_norm": 0.5579649209976196,
+      "learning_rate": 0.003,
+      "loss": 3.9697,
+      "step": 19137
+    },
+    {
+      "epoch": 0.19138,
+      "grad_norm": 0.7213162183761597,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 19138
+    },
+    {
+      "epoch": 0.19139,
+      "grad_norm": 0.8387914896011353,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 19139
+    },
+    {
+      "epoch": 0.1914,
+      "grad_norm": 0.996585488319397,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 19140
+    },
+    {
+      "epoch": 0.19141,
+      "grad_norm": 1.0790042877197266,
+      "learning_rate": 0.003,
+      "loss": 3.9623,
+      "step": 19141
+    },
+    {
+      "epoch": 0.19142,
+      "grad_norm": 0.6569657325744629,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 19142
+    },
+    {
+      "epoch": 0.19143,
+      "grad_norm": 0.6758229732513428,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 19143
+    },
+    {
+      "epoch": 0.19144,
+      "grad_norm": 0.7549024224281311,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 19144
+    },
+    {
+      "epoch": 0.19145,
+      "grad_norm": 0.8701123595237732,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 19145
+    },
+    {
+      "epoch": 0.19146,
+      "grad_norm": 0.9431536793708801,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 19146
+    },
+    {
+      "epoch": 0.19147,
+      "grad_norm": 0.9425971508026123,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 19147
+    },
+    {
+      "epoch": 0.19148,
+      "grad_norm": 0.9396876692771912,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 19148
+    },
+    {
+      "epoch": 0.19149,
+      "grad_norm": 0.7899640798568726,
+      "learning_rate": 0.003,
+      "loss": 3.9885,
+      "step": 19149
+    },
+    {
+      "epoch": 0.1915,
+      "grad_norm": 0.8626085519790649,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 19150
+    },
+    {
+      "epoch": 0.19151,
+      "grad_norm": 0.9163888096809387,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 19151
+    },
+    {
+      "epoch": 0.19152,
+      "grad_norm": 0.7615463137626648,
+      "learning_rate": 0.003,
+      "loss": 3.9766,
+      "step": 19152
+    },
+    {
+      "epoch": 0.19153,
+      "grad_norm": 0.8965135216712952,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 19153
+    },
+    {
+      "epoch": 0.19154,
+      "grad_norm": 1.055113673210144,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 19154
+    },
+    {
+      "epoch": 0.19155,
+      "grad_norm": 0.943031907081604,
+      "learning_rate": 0.003,
+      "loss": 3.9767,
+      "step": 19155
+    },
+    {
+      "epoch": 0.19156,
+      "grad_norm": 0.8581675887107849,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 19156
+    },
+    {
+      "epoch": 0.19157,
+      "grad_norm": 0.8582165837287903,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 19157
+    },
+    {
+      "epoch": 0.19158,
+      "grad_norm": 0.977011501789093,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 19158
+    },
+    {
+      "epoch": 0.19159,
+      "grad_norm": 1.2999720573425293,
+      "learning_rate": 0.003,
+      "loss": 4.0456,
+      "step": 19159
+    },
+    {
+      "epoch": 0.1916,
+      "grad_norm": 0.5874431133270264,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 19160
+    },
+    {
+      "epoch": 0.19161,
+      "grad_norm": 0.6543835997581482,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 19161
+    },
+    {
+      "epoch": 0.19162,
+      "grad_norm": 0.6022617816925049,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 19162
+    },
+    {
+      "epoch": 0.19163,
+      "grad_norm": 0.6174903512001038,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 19163
+    },
+    {
+      "epoch": 0.19164,
+      "grad_norm": 0.6451676487922668,
+      "learning_rate": 0.003,
+      "loss": 4.0502,
+      "step": 19164
+    },
+    {
+      "epoch": 0.19165,
+      "grad_norm": 0.7233970761299133,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 19165
+    },
+    {
+      "epoch": 0.19166,
+      "grad_norm": 0.8002468943595886,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 19166
+    },
+    {
+      "epoch": 0.19167,
+      "grad_norm": 0.8846295475959778,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 19167
+    },
+    {
+      "epoch": 0.19168,
+      "grad_norm": 0.8885087966918945,
+      "learning_rate": 0.003,
+      "loss": 3.9772,
+      "step": 19168
+    },
+    {
+      "epoch": 0.19169,
+      "grad_norm": 0.8156308531761169,
+      "learning_rate": 0.003,
+      "loss": 3.9699,
+      "step": 19169
+    },
+    {
+      "epoch": 0.1917,
+      "grad_norm": 0.753954291343689,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 19170
+    },
+    {
+      "epoch": 0.19171,
+      "grad_norm": 0.7089570164680481,
+      "learning_rate": 0.003,
+      "loss": 3.9768,
+      "step": 19171
+    },
+    {
+      "epoch": 0.19172,
+      "grad_norm": 0.6770859360694885,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 19172
+    },
+    {
+      "epoch": 0.19173,
+      "grad_norm": 0.7039762735366821,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 19173
+    },
+    {
+      "epoch": 0.19174,
+      "grad_norm": 0.6847444176673889,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 19174
+    },
+    {
+      "epoch": 0.19175,
+      "grad_norm": 0.7717546820640564,
+      "learning_rate": 0.003,
+      "loss": 4.0349,
+      "step": 19175
+    },
+    {
+      "epoch": 0.19176,
+      "grad_norm": 1.1402064561843872,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 19176
+    },
+    {
+      "epoch": 0.19177,
+      "grad_norm": 1.2233058214187622,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 19177
+    },
+    {
+      "epoch": 0.19178,
+      "grad_norm": 0.7143800854682922,
+      "learning_rate": 0.003,
+      "loss": 3.9773,
+      "step": 19178
+    },
+    {
+      "epoch": 0.19179,
+      "grad_norm": 0.6129800081253052,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 19179
+    },
+    {
+      "epoch": 0.1918,
+      "grad_norm": 0.596025824546814,
+      "learning_rate": 0.003,
+      "loss": 3.9746,
+      "step": 19180
+    },
+    {
+      "epoch": 0.19181,
+      "grad_norm": 0.637728214263916,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 19181
+    },
+    {
+      "epoch": 0.19182,
+      "grad_norm": 0.54209965467453,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 19182
+    },
+    {
+      "epoch": 0.19183,
+      "grad_norm": 0.5044422745704651,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 19183
+    },
+    {
+      "epoch": 0.19184,
+      "grad_norm": 0.5068732500076294,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 19184
+    },
+    {
+      "epoch": 0.19185,
+      "grad_norm": 0.6069113612174988,
+      "learning_rate": 0.003,
+      "loss": 3.9729,
+      "step": 19185
+    },
+    {
+      "epoch": 0.19186,
+      "grad_norm": 0.797422468662262,
+      "learning_rate": 0.003,
+      "loss": 3.9646,
+      "step": 19186
+    },
+    {
+      "epoch": 0.19187,
+      "grad_norm": 1.0681114196777344,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 19187
+    },
+    {
+      "epoch": 0.19188,
+      "grad_norm": 1.1297365427017212,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 19188
+    },
+    {
+      "epoch": 0.19189,
+      "grad_norm": 0.9015651941299438,
+      "learning_rate": 0.003,
+      "loss": 3.9703,
+      "step": 19189
+    },
+    {
+      "epoch": 0.1919,
+      "grad_norm": 0.8453728556632996,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 19190
+    },
+    {
+      "epoch": 0.19191,
+      "grad_norm": 0.8107426762580872,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 19191
+    },
+    {
+      "epoch": 0.19192,
+      "grad_norm": 0.7438492774963379,
+      "learning_rate": 0.003,
+      "loss": 3.9605,
+      "step": 19192
+    },
+    {
+      "epoch": 0.19193,
+      "grad_norm": 0.7621150016784668,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 19193
+    },
+    {
+      "epoch": 0.19194,
+      "grad_norm": 0.8221701979637146,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 19194
+    },
+    {
+      "epoch": 0.19195,
+      "grad_norm": 0.9234983921051025,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 19195
+    },
+    {
+      "epoch": 0.19196,
+      "grad_norm": 0.8986861705780029,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 19196
+    },
+    {
+      "epoch": 0.19197,
+      "grad_norm": 0.9112302660942078,
+      "learning_rate": 0.003,
+      "loss": 3.9889,
+      "step": 19197
+    },
+    {
+      "epoch": 0.19198,
+      "grad_norm": 0.8387850522994995,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 19198
+    },
+    {
+      "epoch": 0.19199,
+      "grad_norm": 0.9891698956489563,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 19199
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 1.352467656135559,
+      "learning_rate": 0.003,
+      "loss": 4.0545,
+      "step": 19200
+    },
+    {
+      "epoch": 0.19201,
+      "grad_norm": 1.191960096359253,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 19201
+    },
+    {
+      "epoch": 0.19202,
+      "grad_norm": 0.7610149383544922,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 19202
+    },
+    {
+      "epoch": 0.19203,
+      "grad_norm": 0.7151528000831604,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 19203
+    },
+    {
+      "epoch": 0.19204,
+      "grad_norm": 0.8194527626037598,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 19204
+    },
+    {
+      "epoch": 0.19205,
+      "grad_norm": 0.8837549686431885,
+      "learning_rate": 0.003,
+      "loss": 4.0395,
+      "step": 19205
+    },
+    {
+      "epoch": 0.19206,
+      "grad_norm": 0.9980619549751282,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 19206
+    },
+    {
+      "epoch": 0.19207,
+      "grad_norm": 1.1445870399475098,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 19207
+    },
+    {
+      "epoch": 0.19208,
+      "grad_norm": 0.8322120904922485,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 19208
+    },
+    {
+      "epoch": 0.19209,
+      "grad_norm": 0.8153591752052307,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 19209
+    },
+    {
+      "epoch": 0.1921,
+      "grad_norm": 0.9522150158882141,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 19210
+    },
+    {
+      "epoch": 0.19211,
+      "grad_norm": 1.006571888923645,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 19211
+    },
+    {
+      "epoch": 0.19212,
+      "grad_norm": 0.9249372482299805,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 19212
+    },
+    {
+      "epoch": 0.19213,
+      "grad_norm": 0.764064610004425,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 19213
+    },
+    {
+      "epoch": 0.19214,
+      "grad_norm": 0.9267817139625549,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 19214
+    },
+    {
+      "epoch": 0.19215,
+      "grad_norm": 0.907848596572876,
+      "learning_rate": 0.003,
+      "loss": 4.0385,
+      "step": 19215
+    },
+    {
+      "epoch": 0.19216,
+      "grad_norm": 0.9351398348808289,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 19216
+    },
+    {
+      "epoch": 0.19217,
+      "grad_norm": 0.9645265936851501,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 19217
+    },
+    {
+      "epoch": 0.19218,
+      "grad_norm": 0.9986564517021179,
+      "learning_rate": 0.003,
+      "loss": 4.0585,
+      "step": 19218
+    },
+    {
+      "epoch": 0.19219,
+      "grad_norm": 1.132468819618225,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 19219
+    },
+    {
+      "epoch": 0.1922,
+      "grad_norm": 1.048362374305725,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 19220
+    },
+    {
+      "epoch": 0.19221,
+      "grad_norm": 1.063513994216919,
+      "learning_rate": 0.003,
+      "loss": 4.0418,
+      "step": 19221
+    },
+    {
+      "epoch": 0.19222,
+      "grad_norm": 0.8725884556770325,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 19222
+    },
+    {
+      "epoch": 0.19223,
+      "grad_norm": 0.8729296922683716,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 19223
+    },
+    {
+      "epoch": 0.19224,
+      "grad_norm": 1.0028876066207886,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 19224
+    },
+    {
+      "epoch": 0.19225,
+      "grad_norm": 1.1254569292068481,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 19225
+    },
+    {
+      "epoch": 0.19226,
+      "grad_norm": 0.6930163502693176,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 19226
+    },
+    {
+      "epoch": 0.19227,
+      "grad_norm": 0.7227094769477844,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 19227
+    },
+    {
+      "epoch": 0.19228,
+      "grad_norm": 0.7676095366477966,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 19228
+    },
+    {
+      "epoch": 0.19229,
+      "grad_norm": 0.9406114816665649,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 19229
+    },
+    {
+      "epoch": 0.1923,
+      "grad_norm": 1.07334566116333,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 19230
+    },
+    {
+      "epoch": 0.19231,
+      "grad_norm": 0.9809719324111938,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 19231
+    },
+    {
+      "epoch": 0.19232,
+      "grad_norm": 1.104724407196045,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 19232
+    },
+    {
+      "epoch": 0.19233,
+      "grad_norm": 0.8253532648086548,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 19233
+    },
+    {
+      "epoch": 0.19234,
+      "grad_norm": 0.6342723965644836,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 19234
+    },
+    {
+      "epoch": 0.19235,
+      "grad_norm": 0.7233535051345825,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 19235
+    },
+    {
+      "epoch": 0.19236,
+      "grad_norm": 0.8769366145133972,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 19236
+    },
+    {
+      "epoch": 0.19237,
+      "grad_norm": 1.0040518045425415,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 19237
+    },
+    {
+      "epoch": 0.19238,
+      "grad_norm": 1.0712950229644775,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 19238
+    },
+    {
+      "epoch": 0.19239,
+      "grad_norm": 0.8149884939193726,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 19239
+    },
+    {
+      "epoch": 0.1924,
+      "grad_norm": 0.7744317054748535,
+      "learning_rate": 0.003,
+      "loss": 3.974,
+      "step": 19240
+    },
+    {
+      "epoch": 0.19241,
+      "grad_norm": 0.7898107171058655,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 19241
+    },
+    {
+      "epoch": 0.19242,
+      "grad_norm": 0.8145323991775513,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 19242
+    },
+    {
+      "epoch": 0.19243,
+      "grad_norm": 0.8011947870254517,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 19243
+    },
+    {
+      "epoch": 0.19244,
+      "grad_norm": 0.6696740984916687,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 19244
+    },
+    {
+      "epoch": 0.19245,
+      "grad_norm": 0.7286134958267212,
+      "learning_rate": 0.003,
+      "loss": 3.9967,
+      "step": 19245
+    },
+    {
+      "epoch": 0.19246,
+      "grad_norm": 0.809220016002655,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 19246
+    },
+    {
+      "epoch": 0.19247,
+      "grad_norm": 0.717673122882843,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 19247
+    },
+    {
+      "epoch": 0.19248,
+      "grad_norm": 0.6953349709510803,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 19248
+    },
+    {
+      "epoch": 0.19249,
+      "grad_norm": 0.5940714478492737,
+      "learning_rate": 0.003,
+      "loss": 3.97,
+      "step": 19249
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.6480825543403625,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 19250
+    },
+    {
+      "epoch": 0.19251,
+      "grad_norm": 0.75681072473526,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 19251
+    },
+    {
+      "epoch": 0.19252,
+      "grad_norm": 0.9951421022415161,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 19252
+    },
+    {
+      "epoch": 0.19253,
+      "grad_norm": 1.3604280948638916,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 19253
+    },
+    {
+      "epoch": 0.19254,
+      "grad_norm": 0.6712243556976318,
+      "learning_rate": 0.003,
+      "loss": 4.04,
+      "step": 19254
+    },
+    {
+      "epoch": 0.19255,
+      "grad_norm": 0.6681257486343384,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 19255
+    },
+    {
+      "epoch": 0.19256,
+      "grad_norm": 0.9547545313835144,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 19256
+    },
+    {
+      "epoch": 0.19257,
+      "grad_norm": 0.8940248489379883,
+      "learning_rate": 0.003,
+      "loss": 3.9753,
+      "step": 19257
+    },
+    {
+      "epoch": 0.19258,
+      "grad_norm": 0.8410124778747559,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 19258
+    },
+    {
+      "epoch": 0.19259,
+      "grad_norm": 0.8101106286048889,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 19259
+    },
+    {
+      "epoch": 0.1926,
+      "grad_norm": 0.6973461508750916,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 19260
+    },
+    {
+      "epoch": 0.19261,
+      "grad_norm": 0.6412228345870972,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 19261
+    },
+    {
+      "epoch": 0.19262,
+      "grad_norm": 0.7030969262123108,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 19262
+    },
+    {
+      "epoch": 0.19263,
+      "grad_norm": 0.6626996397972107,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 19263
+    },
+    {
+      "epoch": 0.19264,
+      "grad_norm": 0.7193623185157776,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 19264
+    },
+    {
+      "epoch": 0.19265,
+      "grad_norm": 0.7803868651390076,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 19265
+    },
+    {
+      "epoch": 0.19266,
+      "grad_norm": 0.8606928586959839,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 19266
+    },
+    {
+      "epoch": 0.19267,
+      "grad_norm": 1.137913703918457,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 19267
+    },
+    {
+      "epoch": 0.19268,
+      "grad_norm": 1.2063173055648804,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 19268
+    },
+    {
+      "epoch": 0.19269,
+      "grad_norm": 0.8055838346481323,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 19269
+    },
+    {
+      "epoch": 0.1927,
+      "grad_norm": 0.8761873245239258,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 19270
+    },
+    {
+      "epoch": 0.19271,
+      "grad_norm": 0.8632670044898987,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19271
+    },
+    {
+      "epoch": 0.19272,
+      "grad_norm": 0.8842520117759705,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 19272
+    },
+    {
+      "epoch": 0.19273,
+      "grad_norm": 0.9441511034965515,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 19273
+    },
+    {
+      "epoch": 0.19274,
+      "grad_norm": 1.003538727760315,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 19274
+    },
+    {
+      "epoch": 0.19275,
+      "grad_norm": 1.0713435411453247,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 19275
+    },
+    {
+      "epoch": 0.19276,
+      "grad_norm": 0.8656263947486877,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 19276
+    },
+    {
+      "epoch": 0.19277,
+      "grad_norm": 0.8232696056365967,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 19277
+    },
+    {
+      "epoch": 0.19278,
+      "grad_norm": 0.9967733025550842,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 19278
+    },
+    {
+      "epoch": 0.19279,
+      "grad_norm": 1.1772435903549194,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 19279
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.718956470489502,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 19280
+    },
+    {
+      "epoch": 0.19281,
+      "grad_norm": 0.6918644309043884,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 19281
+    },
+    {
+      "epoch": 0.19282,
+      "grad_norm": 0.6497114896774292,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 19282
+    },
+    {
+      "epoch": 0.19283,
+      "grad_norm": 0.7545351386070251,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 19283
+    },
+    {
+      "epoch": 0.19284,
+      "grad_norm": 0.7960440516471863,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 19284
+    },
+    {
+      "epoch": 0.19285,
+      "grad_norm": 0.820967972278595,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 19285
+    },
+    {
+      "epoch": 0.19286,
+      "grad_norm": 0.9108230471611023,
+      "learning_rate": 0.003,
+      "loss": 3.9746,
+      "step": 19286
+    },
+    {
+      "epoch": 0.19287,
+      "grad_norm": 1.113358974456787,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 19287
+    },
+    {
+      "epoch": 0.19288,
+      "grad_norm": 1.094118356704712,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 19288
+    },
+    {
+      "epoch": 0.19289,
+      "grad_norm": 0.9445165991783142,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 19289
+    },
+    {
+      "epoch": 0.1929,
+      "grad_norm": 0.8943971395492554,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 19290
+    },
+    {
+      "epoch": 0.19291,
+      "grad_norm": 0.8869792819023132,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 19291
+    },
+    {
+      "epoch": 0.19292,
+      "grad_norm": 0.8029800057411194,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 19292
+    },
+    {
+      "epoch": 0.19293,
+      "grad_norm": 0.63421630859375,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 19293
+    },
+    {
+      "epoch": 0.19294,
+      "grad_norm": 0.6482874155044556,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 19294
+    },
+    {
+      "epoch": 0.19295,
+      "grad_norm": 0.6990712285041809,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 19295
+    },
+    {
+      "epoch": 0.19296,
+      "grad_norm": 0.6776507496833801,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 19296
+    },
+    {
+      "epoch": 0.19297,
+      "grad_norm": 0.690116286277771,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 19297
+    },
+    {
+      "epoch": 0.19298,
+      "grad_norm": 0.7109023928642273,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 19298
+    },
+    {
+      "epoch": 0.19299,
+      "grad_norm": 0.7003673911094666,
+      "learning_rate": 0.003,
+      "loss": 3.9825,
+      "step": 19299
+    },
+    {
+      "epoch": 0.193,
+      "grad_norm": 0.7295224070549011,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 19300
+    },
+    {
+      "epoch": 0.19301,
+      "grad_norm": 0.8747197985649109,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 19301
+    },
+    {
+      "epoch": 0.19302,
+      "grad_norm": 0.9157381653785706,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 19302
+    },
+    {
+      "epoch": 0.19303,
+      "grad_norm": 0.9902534484863281,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 19303
+    },
+    {
+      "epoch": 0.19304,
+      "grad_norm": 1.1121066808700562,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 19304
+    },
+    {
+      "epoch": 0.19305,
+      "grad_norm": 0.913833737373352,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 19305
+    },
+    {
+      "epoch": 0.19306,
+      "grad_norm": 0.9779497981071472,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 19306
+    },
+    {
+      "epoch": 0.19307,
+      "grad_norm": 0.9935235977172852,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 19307
+    },
+    {
+      "epoch": 0.19308,
+      "grad_norm": 1.01974356174469,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 19308
+    },
+    {
+      "epoch": 0.19309,
+      "grad_norm": 0.8969842791557312,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 19309
+    },
+    {
+      "epoch": 0.1931,
+      "grad_norm": 0.9678723812103271,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 19310
+    },
+    {
+      "epoch": 0.19311,
+      "grad_norm": 0.9434349536895752,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19311
+    },
+    {
+      "epoch": 0.19312,
+      "grad_norm": 0.8398012518882751,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 19312
+    },
+    {
+      "epoch": 0.19313,
+      "grad_norm": 0.9102538824081421,
+      "learning_rate": 0.003,
+      "loss": 4.0356,
+      "step": 19313
+    },
+    {
+      "epoch": 0.19314,
+      "grad_norm": 1.021769404411316,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 19314
+    },
+    {
+      "epoch": 0.19315,
+      "grad_norm": 1.102475881576538,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 19315
+    },
+    {
+      "epoch": 0.19316,
+      "grad_norm": 0.7890679240226746,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 19316
+    },
+    {
+      "epoch": 0.19317,
+      "grad_norm": 0.6756047606468201,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 19317
+    },
+    {
+      "epoch": 0.19318,
+      "grad_norm": 0.5816415548324585,
+      "learning_rate": 0.003,
+      "loss": 3.9875,
+      "step": 19318
+    },
+    {
+      "epoch": 0.19319,
+      "grad_norm": 0.5623725652694702,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 19319
+    },
+    {
+      "epoch": 0.1932,
+      "grad_norm": 0.6121018528938293,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 19320
+    },
+    {
+      "epoch": 0.19321,
+      "grad_norm": 0.590885579586029,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 19321
+    },
+    {
+      "epoch": 0.19322,
+      "grad_norm": 0.6558840870857239,
+      "learning_rate": 0.003,
+      "loss": 4.0113,
+      "step": 19322
+    },
+    {
+      "epoch": 0.19323,
+      "grad_norm": 0.7619689702987671,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 19323
+    },
+    {
+      "epoch": 0.19324,
+      "grad_norm": 1.0216729640960693,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 19324
+    },
+    {
+      "epoch": 0.19325,
+      "grad_norm": 1.1044150590896606,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 19325
+    },
+    {
+      "epoch": 0.19326,
+      "grad_norm": 0.6996297836303711,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 19326
+    },
+    {
+      "epoch": 0.19327,
+      "grad_norm": 0.6888504028320312,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 19327
+    },
+    {
+      "epoch": 0.19328,
+      "grad_norm": 0.7001692652702332,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 19328
+    },
+    {
+      "epoch": 0.19329,
+      "grad_norm": 0.6823729276657104,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 19329
+    },
+    {
+      "epoch": 0.1933,
+      "grad_norm": 0.6538233757019043,
+      "learning_rate": 0.003,
+      "loss": 3.9637,
+      "step": 19330
+    },
+    {
+      "epoch": 0.19331,
+      "grad_norm": 0.7303865551948547,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 19331
+    },
+    {
+      "epoch": 0.19332,
+      "grad_norm": 0.7956865429878235,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 19332
+    },
+    {
+      "epoch": 0.19333,
+      "grad_norm": 0.8402124047279358,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 19333
+    },
+    {
+      "epoch": 0.19334,
+      "grad_norm": 0.978168785572052,
+      "learning_rate": 0.003,
+      "loss": 3.963,
+      "step": 19334
+    },
+    {
+      "epoch": 0.19335,
+      "grad_norm": 1.1001943349838257,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 19335
+    },
+    {
+      "epoch": 0.19336,
+      "grad_norm": 0.8973298072814941,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 19336
+    },
+    {
+      "epoch": 0.19337,
+      "grad_norm": 0.8150404095649719,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 19337
+    },
+    {
+      "epoch": 0.19338,
+      "grad_norm": 0.8321810960769653,
+      "learning_rate": 0.003,
+      "loss": 4.035,
+      "step": 19338
+    },
+    {
+      "epoch": 0.19339,
+      "grad_norm": 0.8140332102775574,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 19339
+    },
+    {
+      "epoch": 0.1934,
+      "grad_norm": 0.8432743549346924,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 19340
+    },
+    {
+      "epoch": 0.19341,
+      "grad_norm": 1.1796387434005737,
+      "learning_rate": 0.003,
+      "loss": 3.9818,
+      "step": 19341
+    },
+    {
+      "epoch": 0.19342,
+      "grad_norm": 0.8989206552505493,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 19342
+    },
+    {
+      "epoch": 0.19343,
+      "grad_norm": 0.770767331123352,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 19343
+    },
+    {
+      "epoch": 0.19344,
+      "grad_norm": 0.847355306148529,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 19344
+    },
+    {
+      "epoch": 0.19345,
+      "grad_norm": 1.0080876350402832,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 19345
+    },
+    {
+      "epoch": 0.19346,
+      "grad_norm": 1.0260003805160522,
+      "learning_rate": 0.003,
+      "loss": 3.9783,
+      "step": 19346
+    },
+    {
+      "epoch": 0.19347,
+      "grad_norm": 0.8898026943206787,
+      "learning_rate": 0.003,
+      "loss": 4.0197,
+      "step": 19347
+    },
+    {
+      "epoch": 0.19348,
+      "grad_norm": 1.0255355834960938,
+      "learning_rate": 0.003,
+      "loss": 4.0563,
+      "step": 19348
+    },
+    {
+      "epoch": 0.19349,
+      "grad_norm": 0.9337112307548523,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 19349
+    },
+    {
+      "epoch": 0.1935,
+      "grad_norm": 0.9396280646324158,
+      "learning_rate": 0.003,
+      "loss": 4.0121,
+      "step": 19350
+    },
+    {
+      "epoch": 0.19351,
+      "grad_norm": 1.0090826749801636,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 19351
+    },
+    {
+      "epoch": 0.19352,
+      "grad_norm": 0.9399703145027161,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 19352
+    },
+    {
+      "epoch": 0.19353,
+      "grad_norm": 0.900275468826294,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 19353
+    },
+    {
+      "epoch": 0.19354,
+      "grad_norm": 0.8167095184326172,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 19354
+    },
+    {
+      "epoch": 0.19355,
+      "grad_norm": 0.8206589221954346,
+      "learning_rate": 0.003,
+      "loss": 4.0126,
+      "step": 19355
+    },
+    {
+      "epoch": 0.19356,
+      "grad_norm": 0.9308629631996155,
+      "learning_rate": 0.003,
+      "loss": 4.0163,
+      "step": 19356
+    },
+    {
+      "epoch": 0.19357,
+      "grad_norm": 0.9931376576423645,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 19357
+    },
+    {
+      "epoch": 0.19358,
+      "grad_norm": 0.9372820854187012,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 19358
+    },
+    {
+      "epoch": 0.19359,
+      "grad_norm": 0.7388105392456055,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 19359
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.6335276961326599,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 19360
+    },
+    {
+      "epoch": 0.19361,
+      "grad_norm": 0.7078648209571838,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 19361
+    },
+    {
+      "epoch": 0.19362,
+      "grad_norm": 0.677641749382019,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 19362
+    },
+    {
+      "epoch": 0.19363,
+      "grad_norm": 0.6329396963119507,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 19363
+    },
+    {
+      "epoch": 0.19364,
+      "grad_norm": 0.6954342126846313,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 19364
+    },
+    {
+      "epoch": 0.19365,
+      "grad_norm": 0.6739953756332397,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 19365
+    },
+    {
+      "epoch": 0.19366,
+      "grad_norm": 0.6043066382408142,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 19366
+    },
+    {
+      "epoch": 0.19367,
+      "grad_norm": 0.7020654082298279,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 19367
+    },
+    {
+      "epoch": 0.19368,
+      "grad_norm": 0.7260381579399109,
+      "learning_rate": 0.003,
+      "loss": 3.9803,
+      "step": 19368
+    },
+    {
+      "epoch": 0.19369,
+      "grad_norm": 0.8367782831192017,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 19369
+    },
+    {
+      "epoch": 0.1937,
+      "grad_norm": 1.1033228635787964,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 19370
+    },
+    {
+      "epoch": 0.19371,
+      "grad_norm": 1.3062193393707275,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 19371
+    },
+    {
+      "epoch": 0.19372,
+      "grad_norm": 0.5842729806900024,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 19372
+    },
+    {
+      "epoch": 0.19373,
+      "grad_norm": 0.659385621547699,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 19373
+    },
+    {
+      "epoch": 0.19374,
+      "grad_norm": 0.7550154328346252,
+      "learning_rate": 0.003,
+      "loss": 3.9767,
+      "step": 19374
+    },
+    {
+      "epoch": 0.19375,
+      "grad_norm": 0.7876573801040649,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 19375
+    },
+    {
+      "epoch": 0.19376,
+      "grad_norm": 0.830193817615509,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 19376
+    },
+    {
+      "epoch": 0.19377,
+      "grad_norm": 0.8212099671363831,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 19377
+    },
+    {
+      "epoch": 0.19378,
+      "grad_norm": 0.6696993112564087,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 19378
+    },
+    {
+      "epoch": 0.19379,
+      "grad_norm": 0.679273247718811,
+      "learning_rate": 0.003,
+      "loss": 3.9658,
+      "step": 19379
+    },
+    {
+      "epoch": 0.1938,
+      "grad_norm": 0.7368372082710266,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 19380
+    },
+    {
+      "epoch": 0.19381,
+      "grad_norm": 0.6955564022064209,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 19381
+    },
+    {
+      "epoch": 0.19382,
+      "grad_norm": 0.7486737370491028,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 19382
+    },
+    {
+      "epoch": 0.19383,
+      "grad_norm": 0.8705873489379883,
+      "learning_rate": 0.003,
+      "loss": 3.9796,
+      "step": 19383
+    },
+    {
+      "epoch": 0.19384,
+      "grad_norm": 1.004651665687561,
+      "learning_rate": 0.003,
+      "loss": 4.0448,
+      "step": 19384
+    },
+    {
+      "epoch": 0.19385,
+      "grad_norm": 0.9881071448326111,
+      "learning_rate": 0.003,
+      "loss": 4.0134,
+      "step": 19385
+    },
+    {
+      "epoch": 0.19386,
+      "grad_norm": 1.0152630805969238,
+      "learning_rate": 0.003,
+      "loss": 3.9769,
+      "step": 19386
+    },
+    {
+      "epoch": 0.19387,
+      "grad_norm": 1.1704293489456177,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 19387
+    },
+    {
+      "epoch": 0.19388,
+      "grad_norm": 0.6372362375259399,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 19388
+    },
+    {
+      "epoch": 0.19389,
+      "grad_norm": 0.8370605111122131,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 19389
+    },
+    {
+      "epoch": 0.1939,
+      "grad_norm": 1.0577499866485596,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 19390
+    },
+    {
+      "epoch": 0.19391,
+      "grad_norm": 1.250914216041565,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 19391
+    },
+    {
+      "epoch": 0.19392,
+      "grad_norm": 1.0160754919052124,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 19392
+    },
+    {
+      "epoch": 0.19393,
+      "grad_norm": 1.0917208194732666,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 19393
+    },
+    {
+      "epoch": 0.19394,
+      "grad_norm": 0.85653156042099,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 19394
+    },
+    {
+      "epoch": 0.19395,
+      "grad_norm": 0.8118630051612854,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 19395
+    },
+    {
+      "epoch": 0.19396,
+      "grad_norm": 0.6265334486961365,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 19396
+    },
+    {
+      "epoch": 0.19397,
+      "grad_norm": 0.6211968660354614,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 19397
+    },
+    {
+      "epoch": 0.19398,
+      "grad_norm": 0.5773246884346008,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 19398
+    },
+    {
+      "epoch": 0.19399,
+      "grad_norm": 0.6495667695999146,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 19399
+    },
+    {
+      "epoch": 0.194,
+      "grad_norm": 0.7733378410339355,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 19400
+    },
+    {
+      "epoch": 0.19401,
+      "grad_norm": 0.988730251789093,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 19401
+    },
+    {
+      "epoch": 0.19402,
+      "grad_norm": 1.137104868888855,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 19402
+    },
+    {
+      "epoch": 0.19403,
+      "grad_norm": 0.720878005027771,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 19403
+    },
+    {
+      "epoch": 0.19404,
+      "grad_norm": 0.5655572414398193,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 19404
+    },
+    {
+      "epoch": 0.19405,
+      "grad_norm": 0.6101178526878357,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 19405
+    },
+    {
+      "epoch": 0.19406,
+      "grad_norm": 0.6160560846328735,
+      "learning_rate": 0.003,
+      "loss": 3.9617,
+      "step": 19406
+    },
+    {
+      "epoch": 0.19407,
+      "grad_norm": 0.6651063561439514,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 19407
+    },
+    {
+      "epoch": 0.19408,
+      "grad_norm": 0.7105965614318848,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 19408
+    },
+    {
+      "epoch": 0.19409,
+      "grad_norm": 0.7422831654548645,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19409
+    },
+    {
+      "epoch": 0.1941,
+      "grad_norm": 0.6924363970756531,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 19410
+    },
+    {
+      "epoch": 0.19411,
+      "grad_norm": 0.7285566329956055,
+      "learning_rate": 0.003,
+      "loss": 4.0154,
+      "step": 19411
+    },
+    {
+      "epoch": 0.19412,
+      "grad_norm": 0.5908989906311035,
+      "learning_rate": 0.003,
+      "loss": 3.9756,
+      "step": 19412
+    },
+    {
+      "epoch": 0.19413,
+      "grad_norm": 0.5184929370880127,
+      "learning_rate": 0.003,
+      "loss": 3.9918,
+      "step": 19413
+    },
+    {
+      "epoch": 0.19414,
+      "grad_norm": 0.5630707740783691,
+      "learning_rate": 0.003,
+      "loss": 3.9719,
+      "step": 19414
+    },
+    {
+      "epoch": 0.19415,
+      "grad_norm": 0.6402706503868103,
+      "learning_rate": 0.003,
+      "loss": 3.9527,
+      "step": 19415
+    },
+    {
+      "epoch": 0.19416,
+      "grad_norm": 0.8083771467208862,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 19416
+    },
+    {
+      "epoch": 0.19417,
+      "grad_norm": 1.0702193975448608,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 19417
+    },
+    {
+      "epoch": 0.19418,
+      "grad_norm": 1.3197736740112305,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 19418
+    },
+    {
+      "epoch": 0.19419,
+      "grad_norm": 0.9700169563293457,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 19419
+    },
+    {
+      "epoch": 0.1942,
+      "grad_norm": 0.9081416130065918,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 19420
+    },
+    {
+      "epoch": 0.19421,
+      "grad_norm": 0.9122806191444397,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 19421
+    },
+    {
+      "epoch": 0.19422,
+      "grad_norm": 1.0107626914978027,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 19422
+    },
+    {
+      "epoch": 0.19423,
+      "grad_norm": 1.0141414403915405,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 19423
+    },
+    {
+      "epoch": 0.19424,
+      "grad_norm": 0.9829725623130798,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 19424
+    },
+    {
+      "epoch": 0.19425,
+      "grad_norm": 0.8267939686775208,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 19425
+    },
+    {
+      "epoch": 0.19426,
+      "grad_norm": 0.8093584775924683,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 19426
+    },
+    {
+      "epoch": 0.19427,
+      "grad_norm": 1.0037287473678589,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 19427
+    },
+    {
+      "epoch": 0.19428,
+      "grad_norm": 1.0828919410705566,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 19428
+    },
+    {
+      "epoch": 0.19429,
+      "grad_norm": 0.894026517868042,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 19429
+    },
+    {
+      "epoch": 0.1943,
+      "grad_norm": 0.9650946855545044,
+      "learning_rate": 0.003,
+      "loss": 4.0328,
+      "step": 19430
+    },
+    {
+      "epoch": 0.19431,
+      "grad_norm": 1.0324418544769287,
+      "learning_rate": 0.003,
+      "loss": 3.9798,
+      "step": 19431
+    },
+    {
+      "epoch": 0.19432,
+      "grad_norm": 1.05412757396698,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 19432
+    },
+    {
+      "epoch": 0.19433,
+      "grad_norm": 0.862998366355896,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 19433
+    },
+    {
+      "epoch": 0.19434,
+      "grad_norm": 1.014135479927063,
+      "learning_rate": 0.003,
+      "loss": 4.0268,
+      "step": 19434
+    },
+    {
+      "epoch": 0.19435,
+      "grad_norm": 1.3630157709121704,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 19435
+    },
+    {
+      "epoch": 0.19436,
+      "grad_norm": 0.762340247631073,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 19436
+    },
+    {
+      "epoch": 0.19437,
+      "grad_norm": 0.8234502673149109,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 19437
+    },
+    {
+      "epoch": 0.19438,
+      "grad_norm": 0.8225938677787781,
+      "learning_rate": 0.003,
+      "loss": 4.027,
+      "step": 19438
+    },
+    {
+      "epoch": 0.19439,
+      "grad_norm": 0.9233136177062988,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 19439
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.7754296660423279,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 19440
+    },
+    {
+      "epoch": 0.19441,
+      "grad_norm": 0.7464416027069092,
+      "learning_rate": 0.003,
+      "loss": 4.0371,
+      "step": 19441
+    },
+    {
+      "epoch": 0.19442,
+      "grad_norm": 0.7560956478118896,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 19442
+    },
+    {
+      "epoch": 0.19443,
+      "grad_norm": 0.765295147895813,
+      "learning_rate": 0.003,
+      "loss": 4.031,
+      "step": 19443
+    },
+    {
+      "epoch": 0.19444,
+      "grad_norm": 0.694036066532135,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 19444
+    },
+    {
+      "epoch": 0.19445,
+      "grad_norm": 0.7585387825965881,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 19445
+    },
+    {
+      "epoch": 0.19446,
+      "grad_norm": 1.08774733543396,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19446
+    },
+    {
+      "epoch": 0.19447,
+      "grad_norm": 1.089281678199768,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 19447
+    },
+    {
+      "epoch": 0.19448,
+      "grad_norm": 1.04354727268219,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 19448
+    },
+    {
+      "epoch": 0.19449,
+      "grad_norm": 1.0154160261154175,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 19449
+    },
+    {
+      "epoch": 0.1945,
+      "grad_norm": 0.9952034950256348,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 19450
+    },
+    {
+      "epoch": 0.19451,
+      "grad_norm": 0.9826450347900391,
+      "learning_rate": 0.003,
+      "loss": 4.0124,
+      "step": 19451
+    },
+    {
+      "epoch": 0.19452,
+      "grad_norm": 0.8546826839447021,
+      "learning_rate": 0.003,
+      "loss": 3.9843,
+      "step": 19452
+    },
+    {
+      "epoch": 0.19453,
+      "grad_norm": 0.9039189219474792,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 19453
+    },
+    {
+      "epoch": 0.19454,
+      "grad_norm": 0.8569737076759338,
+      "learning_rate": 0.003,
+      "loss": 4.0293,
+      "step": 19454
+    },
+    {
+      "epoch": 0.19455,
+      "grad_norm": 0.8439573645591736,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 19455
+    },
+    {
+      "epoch": 0.19456,
+      "grad_norm": 0.8585202693939209,
+      "learning_rate": 0.003,
+      "loss": 4.0379,
+      "step": 19456
+    },
+    {
+      "epoch": 0.19457,
+      "grad_norm": 0.7985356450080872,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 19457
+    },
+    {
+      "epoch": 0.19458,
+      "grad_norm": 0.8447664380073547,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 19458
+    },
+    {
+      "epoch": 0.19459,
+      "grad_norm": 0.948081374168396,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 19459
+    },
+    {
+      "epoch": 0.1946,
+      "grad_norm": 1.1912204027175903,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 19460
+    },
+    {
+      "epoch": 0.19461,
+      "grad_norm": 1.2770426273345947,
+      "learning_rate": 0.003,
+      "loss": 4.0609,
+      "step": 19461
+    },
+    {
+      "epoch": 0.19462,
+      "grad_norm": 0.6283977627754211,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 19462
+    },
+    {
+      "epoch": 0.19463,
+      "grad_norm": 0.6860957741737366,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 19463
+    },
+    {
+      "epoch": 0.19464,
+      "grad_norm": 0.8585407137870789,
+      "learning_rate": 0.003,
+      "loss": 3.9715,
+      "step": 19464
+    },
+    {
+      "epoch": 0.19465,
+      "grad_norm": 1.158394455909729,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 19465
+    },
+    {
+      "epoch": 0.19466,
+      "grad_norm": 1.021871566772461,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 19466
+    },
+    {
+      "epoch": 0.19467,
+      "grad_norm": 0.7340610027313232,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 19467
+    },
+    {
+      "epoch": 0.19468,
+      "grad_norm": 0.6287662982940674,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 19468
+    },
+    {
+      "epoch": 0.19469,
+      "grad_norm": 0.7421482801437378,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 19469
+    },
+    {
+      "epoch": 0.1947,
+      "grad_norm": 0.8674723505973816,
+      "learning_rate": 0.003,
+      "loss": 4.0118,
+      "step": 19470
+    },
+    {
+      "epoch": 0.19471,
+      "grad_norm": 0.93992018699646,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 19471
+    },
+    {
+      "epoch": 0.19472,
+      "grad_norm": 0.7212894558906555,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 19472
+    },
+    {
+      "epoch": 0.19473,
+      "grad_norm": 0.6208124756813049,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 19473
+    },
+    {
+      "epoch": 0.19474,
+      "grad_norm": 0.7051081657409668,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 19474
+    },
+    {
+      "epoch": 0.19475,
+      "grad_norm": 0.7773585319519043,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 19475
+    },
+    {
+      "epoch": 0.19476,
+      "grad_norm": 0.9117393493652344,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 19476
+    },
+    {
+      "epoch": 0.19477,
+      "grad_norm": 0.8664668798446655,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 19477
+    },
+    {
+      "epoch": 0.19478,
+      "grad_norm": 0.7039976716041565,
+      "learning_rate": 0.003,
+      "loss": 3.9738,
+      "step": 19478
+    },
+    {
+      "epoch": 0.19479,
+      "grad_norm": 0.6012693047523499,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 19479
+    },
+    {
+      "epoch": 0.1948,
+      "grad_norm": 0.6584872007369995,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 19480
+    },
+    {
+      "epoch": 0.19481,
+      "grad_norm": 0.7361558675765991,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 19481
+    },
+    {
+      "epoch": 0.19482,
+      "grad_norm": 0.7228919267654419,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 19482
+    },
+    {
+      "epoch": 0.19483,
+      "grad_norm": 0.6640205383300781,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 19483
+    },
+    {
+      "epoch": 0.19484,
+      "grad_norm": 0.6353579759597778,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 19484
+    },
+    {
+      "epoch": 0.19485,
+      "grad_norm": 0.5491459965705872,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19485
+    },
+    {
+      "epoch": 0.19486,
+      "grad_norm": 0.5839203596115112,
+      "learning_rate": 0.003,
+      "loss": 4.0027,
+      "step": 19486
+    },
+    {
+      "epoch": 0.19487,
+      "grad_norm": 0.7307979464530945,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 19487
+    },
+    {
+      "epoch": 0.19488,
+      "grad_norm": 0.8992578983306885,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 19488
+    },
+    {
+      "epoch": 0.19489,
+      "grad_norm": 1.2707570791244507,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 19489
+    },
+    {
+      "epoch": 0.1949,
+      "grad_norm": 0.8503918051719666,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 19490
+    },
+    {
+      "epoch": 0.19491,
+      "grad_norm": 0.6091768741607666,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 19491
+    },
+    {
+      "epoch": 0.19492,
+      "grad_norm": 0.5840878486633301,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 19492
+    },
+    {
+      "epoch": 0.19493,
+      "grad_norm": 0.6856347918510437,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 19493
+    },
+    {
+      "epoch": 0.19494,
+      "grad_norm": 0.9025638699531555,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 19494
+    },
+    {
+      "epoch": 0.19495,
+      "grad_norm": 1.0733050107955933,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 19495
+    },
+    {
+      "epoch": 0.19496,
+      "grad_norm": 1.009167194366455,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 19496
+    },
+    {
+      "epoch": 0.19497,
+      "grad_norm": 1.1027225255966187,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 19497
+    },
+    {
+      "epoch": 0.19498,
+      "grad_norm": 0.8099426031112671,
+      "learning_rate": 0.003,
+      "loss": 3.9754,
+      "step": 19498
+    },
+    {
+      "epoch": 0.19499,
+      "grad_norm": 0.7486651539802551,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 19499
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.7175552845001221,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 19500
+    },
+    {
+      "epoch": 0.19501,
+      "grad_norm": 0.592884361743927,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 19501
+    },
+    {
+      "epoch": 0.19502,
+      "grad_norm": 0.5978587865829468,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 19502
+    },
+    {
+      "epoch": 0.19503,
+      "grad_norm": 0.7668148875236511,
+      "learning_rate": 0.003,
+      "loss": 3.9749,
+      "step": 19503
+    },
+    {
+      "epoch": 0.19504,
+      "grad_norm": 0.8724342584609985,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 19504
+    },
+    {
+      "epoch": 0.19505,
+      "grad_norm": 1.0184301137924194,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 19505
+    },
+    {
+      "epoch": 0.19506,
+      "grad_norm": 1.0616868734359741,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 19506
+    },
+    {
+      "epoch": 0.19507,
+      "grad_norm": 0.9919269680976868,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 19507
+    },
+    {
+      "epoch": 0.19508,
+      "grad_norm": 0.9941419363021851,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 19508
+    },
+    {
+      "epoch": 0.19509,
+      "grad_norm": 1.1667042970657349,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 19509
+    },
+    {
+      "epoch": 0.1951,
+      "grad_norm": 0.8843926787376404,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 19510
+    },
+    {
+      "epoch": 0.19511,
+      "grad_norm": 0.8106421828269958,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 19511
+    },
+    {
+      "epoch": 0.19512,
+      "grad_norm": 0.7645822167396545,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 19512
+    },
+    {
+      "epoch": 0.19513,
+      "grad_norm": 0.808374285697937,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 19513
+    },
+    {
+      "epoch": 0.19514,
+      "grad_norm": 0.8262529373168945,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 19514
+    },
+    {
+      "epoch": 0.19515,
+      "grad_norm": 0.9069766998291016,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 19515
+    },
+    {
+      "epoch": 0.19516,
+      "grad_norm": 0.9706578254699707,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 19516
+    },
+    {
+      "epoch": 0.19517,
+      "grad_norm": 0.9760490655899048,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 19517
+    },
+    {
+      "epoch": 0.19518,
+      "grad_norm": 0.8847740888595581,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 19518
+    },
+    {
+      "epoch": 0.19519,
+      "grad_norm": 1.038620948791504,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 19519
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.942561149597168,
+      "learning_rate": 0.003,
+      "loss": 4.0329,
+      "step": 19520
+    },
+    {
+      "epoch": 0.19521,
+      "grad_norm": 0.871234655380249,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 19521
+    },
+    {
+      "epoch": 0.19522,
+      "grad_norm": 0.8343075513839722,
+      "learning_rate": 0.003,
+      "loss": 4.0336,
+      "step": 19522
+    },
+    {
+      "epoch": 0.19523,
+      "grad_norm": 0.8660777807235718,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 19523
+    },
+    {
+      "epoch": 0.19524,
+      "grad_norm": 0.9518958926200867,
+      "learning_rate": 0.003,
+      "loss": 4.0313,
+      "step": 19524
+    },
+    {
+      "epoch": 0.19525,
+      "grad_norm": 0.9712399840354919,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 19525
+    },
+    {
+      "epoch": 0.19526,
+      "grad_norm": 1.0904133319854736,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 19526
+    },
+    {
+      "epoch": 0.19527,
+      "grad_norm": 0.7620598077774048,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 19527
+    },
+    {
+      "epoch": 0.19528,
+      "grad_norm": 0.7478907704353333,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 19528
+    },
+    {
+      "epoch": 0.19529,
+      "grad_norm": 0.8900734186172485,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 19529
+    },
+    {
+      "epoch": 0.1953,
+      "grad_norm": 1.03106689453125,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 19530
+    },
+    {
+      "epoch": 0.19531,
+      "grad_norm": 1.0545998811721802,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 19531
+    },
+    {
+      "epoch": 0.19532,
+      "grad_norm": 0.9456791877746582,
+      "learning_rate": 0.003,
+      "loss": 4.0465,
+      "step": 19532
+    },
+    {
+      "epoch": 0.19533,
+      "grad_norm": 1.0635806322097778,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 19533
+    },
+    {
+      "epoch": 0.19534,
+      "grad_norm": 1.0025690793991089,
+      "learning_rate": 0.003,
+      "loss": 4.0364,
+      "step": 19534
+    },
+    {
+      "epoch": 0.19535,
+      "grad_norm": 0.9176450371742249,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 19535
+    },
+    {
+      "epoch": 0.19536,
+      "grad_norm": 0.7869552969932556,
+      "learning_rate": 0.003,
+      "loss": 3.9737,
+      "step": 19536
+    },
+    {
+      "epoch": 0.19537,
+      "grad_norm": 0.7948800325393677,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 19537
+    },
+    {
+      "epoch": 0.19538,
+      "grad_norm": 0.7708257436752319,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 19538
+    },
+    {
+      "epoch": 0.19539,
+      "grad_norm": 0.8740930557250977,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 19539
+    },
+    {
+      "epoch": 0.1954,
+      "grad_norm": 0.950183093547821,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 19540
+    },
+    {
+      "epoch": 0.19541,
+      "grad_norm": 0.9351487159729004,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 19541
+    },
+    {
+      "epoch": 0.19542,
+      "grad_norm": 1.0730854272842407,
+      "learning_rate": 0.003,
+      "loss": 4.0327,
+      "step": 19542
+    },
+    {
+      "epoch": 0.19543,
+      "grad_norm": 0.9327528476715088,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 19543
+    },
+    {
+      "epoch": 0.19544,
+      "grad_norm": 0.771660566329956,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 19544
+    },
+    {
+      "epoch": 0.19545,
+      "grad_norm": 0.7307870388031006,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 19545
+    },
+    {
+      "epoch": 0.19546,
+      "grad_norm": 0.8321375846862793,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 19546
+    },
+    {
+      "epoch": 0.19547,
+      "grad_norm": 0.8160489201545715,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 19547
+    },
+    {
+      "epoch": 0.19548,
+      "grad_norm": 0.9429007172584534,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 19548
+    },
+    {
+      "epoch": 0.19549,
+      "grad_norm": 0.9813238382339478,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 19549
+    },
+    {
+      "epoch": 0.1955,
+      "grad_norm": 0.9885159730911255,
+      "learning_rate": 0.003,
+      "loss": 3.9796,
+      "step": 19550
+    },
+    {
+      "epoch": 0.19551,
+      "grad_norm": 0.8191841244697571,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 19551
+    },
+    {
+      "epoch": 0.19552,
+      "grad_norm": 0.7239416241645813,
+      "learning_rate": 0.003,
+      "loss": 3.9792,
+      "step": 19552
+    },
+    {
+      "epoch": 0.19553,
+      "grad_norm": 0.7224628329277039,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 19553
+    },
+    {
+      "epoch": 0.19554,
+      "grad_norm": 0.7405498623847961,
+      "learning_rate": 0.003,
+      "loss": 3.9621,
+      "step": 19554
+    },
+    {
+      "epoch": 0.19555,
+      "grad_norm": 0.6783916354179382,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 19555
+    },
+    {
+      "epoch": 0.19556,
+      "grad_norm": 0.7338733673095703,
+      "learning_rate": 0.003,
+      "loss": 3.9762,
+      "step": 19556
+    },
+    {
+      "epoch": 0.19557,
+      "grad_norm": 0.690729558467865,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 19557
+    },
+    {
+      "epoch": 0.19558,
+      "grad_norm": 0.6429121494293213,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 19558
+    },
+    {
+      "epoch": 0.19559,
+      "grad_norm": 0.706646203994751,
+      "learning_rate": 0.003,
+      "loss": 3.9916,
+      "step": 19559
+    },
+    {
+      "epoch": 0.1956,
+      "grad_norm": 0.8352551460266113,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 19560
+    },
+    {
+      "epoch": 0.19561,
+      "grad_norm": 0.8661195039749146,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 19561
+    },
+    {
+      "epoch": 0.19562,
+      "grad_norm": 0.8343972563743591,
+      "learning_rate": 0.003,
+      "loss": 3.9714,
+      "step": 19562
+    },
+    {
+      "epoch": 0.19563,
+      "grad_norm": 0.7886388897895813,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 19563
+    },
+    {
+      "epoch": 0.19564,
+      "grad_norm": 0.7899730205535889,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 19564
+    },
+    {
+      "epoch": 0.19565,
+      "grad_norm": 0.7374923229217529,
+      "learning_rate": 0.003,
+      "loss": 3.9674,
+      "step": 19565
+    },
+    {
+      "epoch": 0.19566,
+      "grad_norm": 0.6665852069854736,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 19566
+    },
+    {
+      "epoch": 0.19567,
+      "grad_norm": 0.6971161365509033,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 19567
+    },
+    {
+      "epoch": 0.19568,
+      "grad_norm": 1.001599907875061,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 19568
+    },
+    {
+      "epoch": 0.19569,
+      "grad_norm": 1.2593739032745361,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 19569
+    },
+    {
+      "epoch": 0.1957,
+      "grad_norm": 0.7797518968582153,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 19570
+    },
+    {
+      "epoch": 0.19571,
+      "grad_norm": 0.6890308260917664,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 19571
+    },
+    {
+      "epoch": 0.19572,
+      "grad_norm": 0.6744365692138672,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 19572
+    },
+    {
+      "epoch": 0.19573,
+      "grad_norm": 0.6337965130805969,
+      "learning_rate": 0.003,
+      "loss": 3.9828,
+      "step": 19573
+    },
+    {
+      "epoch": 0.19574,
+      "grad_norm": 0.6815075278282166,
+      "learning_rate": 0.003,
+      "loss": 3.9929,
+      "step": 19574
+    },
+    {
+      "epoch": 0.19575,
+      "grad_norm": 0.7398589849472046,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 19575
+    },
+    {
+      "epoch": 0.19576,
+      "grad_norm": 0.808785080909729,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 19576
+    },
+    {
+      "epoch": 0.19577,
+      "grad_norm": 0.9979984760284424,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 19577
+    },
+    {
+      "epoch": 0.19578,
+      "grad_norm": 1.3587226867675781,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 19578
+    },
+    {
+      "epoch": 0.19579,
+      "grad_norm": 0.7701467871665955,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 19579
+    },
+    {
+      "epoch": 0.1958,
+      "grad_norm": 0.8579768538475037,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 19580
+    },
+    {
+      "epoch": 0.19581,
+      "grad_norm": 0.8206691145896912,
+      "learning_rate": 0.003,
+      "loss": 3.9595,
+      "step": 19581
+    },
+    {
+      "epoch": 0.19582,
+      "grad_norm": 0.825986921787262,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 19582
+    },
+    {
+      "epoch": 0.19583,
+      "grad_norm": 1.0285770893096924,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 19583
+    },
+    {
+      "epoch": 0.19584,
+      "grad_norm": 0.9314975738525391,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 19584
+    },
+    {
+      "epoch": 0.19585,
+      "grad_norm": 0.8427761197090149,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 19585
+    },
+    {
+      "epoch": 0.19586,
+      "grad_norm": 0.8255916237831116,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 19586
+    },
+    {
+      "epoch": 0.19587,
+      "grad_norm": 0.9761923551559448,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 19587
+    },
+    {
+      "epoch": 0.19588,
+      "grad_norm": 0.9684008955955505,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 19588
+    },
+    {
+      "epoch": 0.19589,
+      "grad_norm": 0.8309191465377808,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 19589
+    },
+    {
+      "epoch": 0.1959,
+      "grad_norm": 0.9165033102035522,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 19590
+    },
+    {
+      "epoch": 0.19591,
+      "grad_norm": 1.048823595046997,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 19591
+    },
+    {
+      "epoch": 0.19592,
+      "grad_norm": 0.855726957321167,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 19592
+    },
+    {
+      "epoch": 0.19593,
+      "grad_norm": 0.7062085866928101,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 19593
+    },
+    {
+      "epoch": 0.19594,
+      "grad_norm": 0.6082330942153931,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 19594
+    },
+    {
+      "epoch": 0.19595,
+      "grad_norm": 0.6740702986717224,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 19595
+    },
+    {
+      "epoch": 0.19596,
+      "grad_norm": 0.700225830078125,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 19596
+    },
+    {
+      "epoch": 0.19597,
+      "grad_norm": 0.7771080732345581,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 19597
+    },
+    {
+      "epoch": 0.19598,
+      "grad_norm": 0.8797142505645752,
+      "learning_rate": 0.003,
+      "loss": 3.985,
+      "step": 19598
+    },
+    {
+      "epoch": 0.19599,
+      "grad_norm": 0.9389474987983704,
+      "learning_rate": 0.003,
+      "loss": 3.9791,
+      "step": 19599
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 1.2271173000335693,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 19600
+    },
+    {
+      "epoch": 0.19601,
+      "grad_norm": 0.9719414710998535,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 19601
+    },
+    {
+      "epoch": 0.19602,
+      "grad_norm": 0.7353074550628662,
+      "learning_rate": 0.003,
+      "loss": 3.9753,
+      "step": 19602
+    },
+    {
+      "epoch": 0.19603,
+      "grad_norm": 0.7225217819213867,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 19603
+    },
+    {
+      "epoch": 0.19604,
+      "grad_norm": 0.8830018043518066,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 19604
+    },
+    {
+      "epoch": 0.19605,
+      "grad_norm": 0.9781860113143921,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 19605
+    },
+    {
+      "epoch": 0.19606,
+      "grad_norm": 1.1129882335662842,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 19606
+    },
+    {
+      "epoch": 0.19607,
+      "grad_norm": 0.8997833728790283,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 19607
+    },
+    {
+      "epoch": 0.19608,
+      "grad_norm": 0.8203573822975159,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 19608
+    },
+    {
+      "epoch": 0.19609,
+      "grad_norm": 0.8649213314056396,
+      "learning_rate": 0.003,
+      "loss": 4.014,
+      "step": 19609
+    },
+    {
+      "epoch": 0.1961,
+      "grad_norm": 0.8213027715682983,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 19610
+    },
+    {
+      "epoch": 0.19611,
+      "grad_norm": 0.7620607614517212,
+      "learning_rate": 0.003,
+      "loss": 4.0039,
+      "step": 19611
+    },
+    {
+      "epoch": 0.19612,
+      "grad_norm": 0.650044322013855,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 19612
+    },
+    {
+      "epoch": 0.19613,
+      "grad_norm": 0.7654820680618286,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 19613
+    },
+    {
+      "epoch": 0.19614,
+      "grad_norm": 0.8626854419708252,
+      "learning_rate": 0.003,
+      "loss": 3.9721,
+      "step": 19614
+    },
+    {
+      "epoch": 0.19615,
+      "grad_norm": 0.9784452319145203,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 19615
+    },
+    {
+      "epoch": 0.19616,
+      "grad_norm": 1.17198646068573,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 19616
+    },
+    {
+      "epoch": 0.19617,
+      "grad_norm": 1.013183832168579,
+      "learning_rate": 0.003,
+      "loss": 4.0365,
+      "step": 19617
+    },
+    {
+      "epoch": 0.19618,
+      "grad_norm": 1.0921857357025146,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 19618
+    },
+    {
+      "epoch": 0.19619,
+      "grad_norm": 0.7567834258079529,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 19619
+    },
+    {
+      "epoch": 0.1962,
+      "grad_norm": 0.6237739324569702,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 19620
+    },
+    {
+      "epoch": 0.19621,
+      "grad_norm": 0.6849679350852966,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 19621
+    },
+    {
+      "epoch": 0.19622,
+      "grad_norm": 0.9135372638702393,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 19622
+    },
+    {
+      "epoch": 0.19623,
+      "grad_norm": 1.1048378944396973,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 19623
+    },
+    {
+      "epoch": 0.19624,
+      "grad_norm": 0.7356999516487122,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 19624
+    },
+    {
+      "epoch": 0.19625,
+      "grad_norm": 0.5960215330123901,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 19625
+    },
+    {
+      "epoch": 0.19626,
+      "grad_norm": 0.6349772214889526,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 19626
+    },
+    {
+      "epoch": 0.19627,
+      "grad_norm": 0.6433651447296143,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 19627
+    },
+    {
+      "epoch": 0.19628,
+      "grad_norm": 0.5735981464385986,
+      "learning_rate": 0.003,
+      "loss": 3.9748,
+      "step": 19628
+    },
+    {
+      "epoch": 0.19629,
+      "grad_norm": 0.5863465666770935,
+      "learning_rate": 0.003,
+      "loss": 3.9826,
+      "step": 19629
+    },
+    {
+      "epoch": 0.1963,
+      "grad_norm": 0.6383113861083984,
+      "learning_rate": 0.003,
+      "loss": 3.9621,
+      "step": 19630
+    },
+    {
+      "epoch": 0.19631,
+      "grad_norm": 0.7619834542274475,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 19631
+    },
+    {
+      "epoch": 0.19632,
+      "grad_norm": 0.9621742963790894,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 19632
+    },
+    {
+      "epoch": 0.19633,
+      "grad_norm": 1.2753996849060059,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 19633
+    },
+    {
+      "epoch": 0.19634,
+      "grad_norm": 0.7475082278251648,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 19634
+    },
+    {
+      "epoch": 0.19635,
+      "grad_norm": 0.6645981669425964,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 19635
+    },
+    {
+      "epoch": 0.19636,
+      "grad_norm": 0.6509314179420471,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 19636
+    },
+    {
+      "epoch": 0.19637,
+      "grad_norm": 0.6968465447425842,
+      "learning_rate": 0.003,
+      "loss": 3.9703,
+      "step": 19637
+    },
+    {
+      "epoch": 0.19638,
+      "grad_norm": 0.695035457611084,
+      "learning_rate": 0.003,
+      "loss": 3.9423,
+      "step": 19638
+    },
+    {
+      "epoch": 0.19639,
+      "grad_norm": 0.6628406047821045,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 19639
+    },
+    {
+      "epoch": 0.1964,
+      "grad_norm": 0.7157939076423645,
+      "learning_rate": 0.003,
+      "loss": 3.9744,
+      "step": 19640
+    },
+    {
+      "epoch": 0.19641,
+      "grad_norm": 0.707119345664978,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 19641
+    },
+    {
+      "epoch": 0.19642,
+      "grad_norm": 0.7395365834236145,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 19642
+    },
+    {
+      "epoch": 0.19643,
+      "grad_norm": 0.8493033647537231,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 19643
+    },
+    {
+      "epoch": 0.19644,
+      "grad_norm": 1.1865185499191284,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 19644
+    },
+    {
+      "epoch": 0.19645,
+      "grad_norm": 0.9355396032333374,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 19645
+    },
+    {
+      "epoch": 0.19646,
+      "grad_norm": 0.8840484023094177,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 19646
+    },
+    {
+      "epoch": 0.19647,
+      "grad_norm": 1.0170649290084839,
+      "learning_rate": 0.003,
+      "loss": 3.9985,
+      "step": 19647
+    },
+    {
+      "epoch": 0.19648,
+      "grad_norm": 1.1426259279251099,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 19648
+    },
+    {
+      "epoch": 0.19649,
+      "grad_norm": 0.8664590120315552,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 19649
+    },
+    {
+      "epoch": 0.1965,
+      "grad_norm": 0.8678274154663086,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 19650
+    },
+    {
+      "epoch": 0.19651,
+      "grad_norm": 0.9149540662765503,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 19651
+    },
+    {
+      "epoch": 0.19652,
+      "grad_norm": 1.1072866916656494,
+      "learning_rate": 0.003,
+      "loss": 4.0252,
+      "step": 19652
+    },
+    {
+      "epoch": 0.19653,
+      "grad_norm": 1.0925770998001099,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 19653
+    },
+    {
+      "epoch": 0.19654,
+      "grad_norm": 1.0328413248062134,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 19654
+    },
+    {
+      "epoch": 0.19655,
+      "grad_norm": 0.7894022464752197,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 19655
+    },
+    {
+      "epoch": 0.19656,
+      "grad_norm": 0.765109658241272,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 19656
+    },
+    {
+      "epoch": 0.19657,
+      "grad_norm": 0.7951400876045227,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 19657
+    },
+    {
+      "epoch": 0.19658,
+      "grad_norm": 0.7783117890357971,
+      "learning_rate": 0.003,
+      "loss": 4.0367,
+      "step": 19658
+    },
+    {
+      "epoch": 0.19659,
+      "grad_norm": 0.7544420957565308,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 19659
+    },
+    {
+      "epoch": 0.1966,
+      "grad_norm": 0.7228208184242249,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 19660
+    },
+    {
+      "epoch": 0.19661,
+      "grad_norm": 0.8361641764640808,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 19661
+    },
+    {
+      "epoch": 0.19662,
+      "grad_norm": 0.9005764722824097,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 19662
+    },
+    {
+      "epoch": 0.19663,
+      "grad_norm": 0.8779737949371338,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 19663
+    },
+    {
+      "epoch": 0.19664,
+      "grad_norm": 0.9157434105873108,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 19664
+    },
+    {
+      "epoch": 0.19665,
+      "grad_norm": 0.8780549168586731,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 19665
+    },
+    {
+      "epoch": 0.19666,
+      "grad_norm": 0.7676054835319519,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 19666
+    },
+    {
+      "epoch": 0.19667,
+      "grad_norm": 0.8035011887550354,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 19667
+    },
+    {
+      "epoch": 0.19668,
+      "grad_norm": 1.096777319908142,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 19668
+    },
+    {
+      "epoch": 0.19669,
+      "grad_norm": 1.2337048053741455,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 19669
+    },
+    {
+      "epoch": 0.1967,
+      "grad_norm": 0.8524037599563599,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 19670
+    },
+    {
+      "epoch": 0.19671,
+      "grad_norm": 0.8955839276313782,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 19671
+    },
+    {
+      "epoch": 0.19672,
+      "grad_norm": 0.8179341554641724,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 19672
+    },
+    {
+      "epoch": 0.19673,
+      "grad_norm": 0.805479884147644,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 19673
+    },
+    {
+      "epoch": 0.19674,
+      "grad_norm": 0.9087765216827393,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 19674
+    },
+    {
+      "epoch": 0.19675,
+      "grad_norm": 0.9388098120689392,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 19675
+    },
+    {
+      "epoch": 0.19676,
+      "grad_norm": 0.7843668460845947,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 19676
+    },
+    {
+      "epoch": 0.19677,
+      "grad_norm": 0.8374168872833252,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 19677
+    },
+    {
+      "epoch": 0.19678,
+      "grad_norm": 0.8288348913192749,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 19678
+    },
+    {
+      "epoch": 0.19679,
+      "grad_norm": 0.9719083905220032,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 19679
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 1.1349914073944092,
+      "learning_rate": 0.003,
+      "loss": 3.9937,
+      "step": 19680
+    },
+    {
+      "epoch": 0.19681,
+      "grad_norm": 0.9622350931167603,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 19681
+    },
+    {
+      "epoch": 0.19682,
+      "grad_norm": 0.9577870965003967,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 19682
+    },
+    {
+      "epoch": 0.19683,
+      "grad_norm": 0.9903905391693115,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 19683
+    },
+    {
+      "epoch": 0.19684,
+      "grad_norm": 0.9436802268028259,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 19684
+    },
+    {
+      "epoch": 0.19685,
+      "grad_norm": 0.8398043513298035,
+      "learning_rate": 0.003,
+      "loss": 3.9659,
+      "step": 19685
+    },
+    {
+      "epoch": 0.19686,
+      "grad_norm": 0.7675806283950806,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19686
+    },
+    {
+      "epoch": 0.19687,
+      "grad_norm": 0.7498754262924194,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 19687
+    },
+    {
+      "epoch": 0.19688,
+      "grad_norm": 0.7731102108955383,
+      "learning_rate": 0.003,
+      "loss": 4.0223,
+      "step": 19688
+    },
+    {
+      "epoch": 0.19689,
+      "grad_norm": 0.8190988302230835,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 19689
+    },
+    {
+      "epoch": 0.1969,
+      "grad_norm": 0.9088810682296753,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 19690
+    },
+    {
+      "epoch": 0.19691,
+      "grad_norm": 0.9039297699928284,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 19691
+    },
+    {
+      "epoch": 0.19692,
+      "grad_norm": 0.9410934448242188,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 19692
+    },
+    {
+      "epoch": 0.19693,
+      "grad_norm": 0.8614537715911865,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 19693
+    },
+    {
+      "epoch": 0.19694,
+      "grad_norm": 0.8556104302406311,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 19694
+    },
+    {
+      "epoch": 0.19695,
+      "grad_norm": 1.0040862560272217,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 19695
+    },
+    {
+      "epoch": 0.19696,
+      "grad_norm": 1.0933338403701782,
+      "learning_rate": 0.003,
+      "loss": 4.0215,
+      "step": 19696
+    },
+    {
+      "epoch": 0.19697,
+      "grad_norm": 1.2151308059692383,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 19697
+    },
+    {
+      "epoch": 0.19698,
+      "grad_norm": 0.6764607429504395,
+      "learning_rate": 0.003,
+      "loss": 4.0344,
+      "step": 19698
+    },
+    {
+      "epoch": 0.19699,
+      "grad_norm": 0.6174522638320923,
+      "learning_rate": 0.003,
+      "loss": 3.9764,
+      "step": 19699
+    },
+    {
+      "epoch": 0.197,
+      "grad_norm": 0.6862599849700928,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 19700
+    },
+    {
+      "epoch": 0.19701,
+      "grad_norm": 0.7461639642715454,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 19701
+    },
+    {
+      "epoch": 0.19702,
+      "grad_norm": 0.8803563714027405,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 19702
+    },
+    {
+      "epoch": 0.19703,
+      "grad_norm": 0.9222602248191833,
+      "learning_rate": 0.003,
+      "loss": 4.0311,
+      "step": 19703
+    },
+    {
+      "epoch": 0.19704,
+      "grad_norm": 0.8738635778427124,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 19704
+    },
+    {
+      "epoch": 0.19705,
+      "grad_norm": 0.8284536004066467,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 19705
+    },
+    {
+      "epoch": 0.19706,
+      "grad_norm": 0.9120162725448608,
+      "learning_rate": 0.003,
+      "loss": 3.9733,
+      "step": 19706
+    },
+    {
+      "epoch": 0.19707,
+      "grad_norm": 0.8574915528297424,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 19707
+    },
+    {
+      "epoch": 0.19708,
+      "grad_norm": 1.0177977085113525,
+      "learning_rate": 0.003,
+      "loss": 4.021,
+      "step": 19708
+    },
+    {
+      "epoch": 0.19709,
+      "grad_norm": 1.028807520866394,
+      "learning_rate": 0.003,
+      "loss": 4.0435,
+      "step": 19709
+    },
+    {
+      "epoch": 0.1971,
+      "grad_norm": 1.077039361000061,
+      "learning_rate": 0.003,
+      "loss": 4.0237,
+      "step": 19710
+    },
+    {
+      "epoch": 0.19711,
+      "grad_norm": 0.7538310885429382,
+      "learning_rate": 0.003,
+      "loss": 4.0182,
+      "step": 19711
+    },
+    {
+      "epoch": 0.19712,
+      "grad_norm": 0.7804000973701477,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 19712
+    },
+    {
+      "epoch": 0.19713,
+      "grad_norm": 0.7949774265289307,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 19713
+    },
+    {
+      "epoch": 0.19714,
+      "grad_norm": 0.7602480053901672,
+      "learning_rate": 0.003,
+      "loss": 3.9726,
+      "step": 19714
+    },
+    {
+      "epoch": 0.19715,
+      "grad_norm": 0.7396253943443298,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 19715
+    },
+    {
+      "epoch": 0.19716,
+      "grad_norm": 0.789199948310852,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 19716
+    },
+    {
+      "epoch": 0.19717,
+      "grad_norm": 0.9171184301376343,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 19717
+    },
+    {
+      "epoch": 0.19718,
+      "grad_norm": 1.1561167240142822,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 19718
+    },
+    {
+      "epoch": 0.19719,
+      "grad_norm": 0.8584945201873779,
+      "learning_rate": 0.003,
+      "loss": 3.9582,
+      "step": 19719
+    },
+    {
+      "epoch": 0.1972,
+      "grad_norm": 0.7851689457893372,
+      "learning_rate": 0.003,
+      "loss": 3.967,
+      "step": 19720
+    },
+    {
+      "epoch": 0.19721,
+      "grad_norm": 0.8622748851776123,
+      "learning_rate": 0.003,
+      "loss": 3.9936,
+      "step": 19721
+    },
+    {
+      "epoch": 0.19722,
+      "grad_norm": 0.9780141711235046,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 19722
+    },
+    {
+      "epoch": 0.19723,
+      "grad_norm": 1.0162153244018555,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 19723
+    },
+    {
+      "epoch": 0.19724,
+      "grad_norm": 0.8792099356651306,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 19724
+    },
+    {
+      "epoch": 0.19725,
+      "grad_norm": 0.7323285341262817,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 19725
+    },
+    {
+      "epoch": 0.19726,
+      "grad_norm": 0.5505766868591309,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 19726
+    },
+    {
+      "epoch": 0.19727,
+      "grad_norm": 0.5450384020805359,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 19727
+    },
+    {
+      "epoch": 0.19728,
+      "grad_norm": 0.6860783696174622,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 19728
+    },
+    {
+      "epoch": 0.19729,
+      "grad_norm": 0.8005902171134949,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 19729
+    },
+    {
+      "epoch": 0.1973,
+      "grad_norm": 0.8072187304496765,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 19730
+    },
+    {
+      "epoch": 0.19731,
+      "grad_norm": 0.7468485236167908,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 19731
+    },
+    {
+      "epoch": 0.19732,
+      "grad_norm": 0.6146760582923889,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 19732
+    },
+    {
+      "epoch": 0.19733,
+      "grad_norm": 0.6144158840179443,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 19733
+    },
+    {
+      "epoch": 0.19734,
+      "grad_norm": 0.6798016428947449,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 19734
+    },
+    {
+      "epoch": 0.19735,
+      "grad_norm": 0.6656534671783447,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 19735
+    },
+    {
+      "epoch": 0.19736,
+      "grad_norm": 0.7358046174049377,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 19736
+    },
+    {
+      "epoch": 0.19737,
+      "grad_norm": 0.8249279856681824,
+      "learning_rate": 0.003,
+      "loss": 3.9698,
+      "step": 19737
+    },
+    {
+      "epoch": 0.19738,
+      "grad_norm": 0.8836650848388672,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 19738
+    },
+    {
+      "epoch": 0.19739,
+      "grad_norm": 0.847974419593811,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 19739
+    },
+    {
+      "epoch": 0.1974,
+      "grad_norm": 0.8489623069763184,
+      "learning_rate": 0.003,
+      "loss": 3.9789,
+      "step": 19740
+    },
+    {
+      "epoch": 0.19741,
+      "grad_norm": 0.9032725691795349,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 19741
+    },
+    {
+      "epoch": 0.19742,
+      "grad_norm": 0.8884512186050415,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 19742
+    },
+    {
+      "epoch": 0.19743,
+      "grad_norm": 0.8677976727485657,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 19743
+    },
+    {
+      "epoch": 0.19744,
+      "grad_norm": 0.897802472114563,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 19744
+    },
+    {
+      "epoch": 0.19745,
+      "grad_norm": 0.899142324924469,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 19745
+    },
+    {
+      "epoch": 0.19746,
+      "grad_norm": 0.9474034905433655,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 19746
+    },
+    {
+      "epoch": 0.19747,
+      "grad_norm": 0.9039626121520996,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 19747
+    },
+    {
+      "epoch": 0.19748,
+      "grad_norm": 0.9217604398727417,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 19748
+    },
+    {
+      "epoch": 0.19749,
+      "grad_norm": 1.0116978883743286,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 19749
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.9110078811645508,
+      "learning_rate": 0.003,
+      "loss": 4.0172,
+      "step": 19750
+    },
+    {
+      "epoch": 0.19751,
+      "grad_norm": 1.0594103336334229,
+      "learning_rate": 0.003,
+      "loss": 4.0309,
+      "step": 19751
+    },
+    {
+      "epoch": 0.19752,
+      "grad_norm": 1.0353708267211914,
+      "learning_rate": 0.003,
+      "loss": 4.0262,
+      "step": 19752
+    },
+    {
+      "epoch": 0.19753,
+      "grad_norm": 0.9697558879852295,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 19753
+    },
+    {
+      "epoch": 0.19754,
+      "grad_norm": 0.8715497255325317,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 19754
+    },
+    {
+      "epoch": 0.19755,
+      "grad_norm": 0.9651036262512207,
+      "learning_rate": 0.003,
+      "loss": 4.0322,
+      "step": 19755
+    },
+    {
+      "epoch": 0.19756,
+      "grad_norm": 1.427270531654358,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 19756
+    },
+    {
+      "epoch": 0.19757,
+      "grad_norm": 0.6546532511711121,
+      "learning_rate": 0.003,
+      "loss": 3.9921,
+      "step": 19757
+    },
+    {
+      "epoch": 0.19758,
+      "grad_norm": 0.7043947577476501,
+      "learning_rate": 0.003,
+      "loss": 4.0447,
+      "step": 19758
+    },
+    {
+      "epoch": 0.19759,
+      "grad_norm": 0.8468017578125,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 19759
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.7900123596191406,
+      "learning_rate": 0.003,
+      "loss": 4.0498,
+      "step": 19760
+    },
+    {
+      "epoch": 0.19761,
+      "grad_norm": 0.6871782541275024,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 19761
+    },
+    {
+      "epoch": 0.19762,
+      "grad_norm": 0.7135297656059265,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 19762
+    },
+    {
+      "epoch": 0.19763,
+      "grad_norm": 0.8189486265182495,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 19763
+    },
+    {
+      "epoch": 0.19764,
+      "grad_norm": 0.9873164892196655,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 19764
+    },
+    {
+      "epoch": 0.19765,
+      "grad_norm": 1.213226079940796,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 19765
+    },
+    {
+      "epoch": 0.19766,
+      "grad_norm": 0.634507417678833,
+      "learning_rate": 0.003,
+      "loss": 4.0286,
+      "step": 19766
+    },
+    {
+      "epoch": 0.19767,
+      "grad_norm": 0.5030357837677002,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 19767
+    },
+    {
+      "epoch": 0.19768,
+      "grad_norm": 0.5763629674911499,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 19768
+    },
+    {
+      "epoch": 0.19769,
+      "grad_norm": 0.6826533079147339,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 19769
+    },
+    {
+      "epoch": 0.1977,
+      "grad_norm": 0.7585141062736511,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 19770
+    },
+    {
+      "epoch": 0.19771,
+      "grad_norm": 0.7298708558082581,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 19771
+    },
+    {
+      "epoch": 0.19772,
+      "grad_norm": 0.7219603657722473,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 19772
+    },
+    {
+      "epoch": 0.19773,
+      "grad_norm": 0.7906659245491028,
+      "learning_rate": 0.003,
+      "loss": 3.968,
+      "step": 19773
+    },
+    {
+      "epoch": 0.19774,
+      "grad_norm": 0.9170296788215637,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 19774
+    },
+    {
+      "epoch": 0.19775,
+      "grad_norm": 0.9445464015007019,
+      "learning_rate": 0.003,
+      "loss": 3.986,
+      "step": 19775
+    },
+    {
+      "epoch": 0.19776,
+      "grad_norm": 0.8738341927528381,
+      "learning_rate": 0.003,
+      "loss": 3.9833,
+      "step": 19776
+    },
+    {
+      "epoch": 0.19777,
+      "grad_norm": 0.8931269645690918,
+      "learning_rate": 0.003,
+      "loss": 3.9706,
+      "step": 19777
+    },
+    {
+      "epoch": 0.19778,
+      "grad_norm": 1.0102183818817139,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 19778
+    },
+    {
+      "epoch": 0.19779,
+      "grad_norm": 1.0455142259597778,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 19779
+    },
+    {
+      "epoch": 0.1978,
+      "grad_norm": 1.0050582885742188,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 19780
+    },
+    {
+      "epoch": 0.19781,
+      "grad_norm": 0.9604713320732117,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 19781
+    },
+    {
+      "epoch": 0.19782,
+      "grad_norm": 0.8043559789657593,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 19782
+    },
+    {
+      "epoch": 0.19783,
+      "grad_norm": 0.7254209518432617,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 19783
+    },
+    {
+      "epoch": 0.19784,
+      "grad_norm": 0.8698140978813171,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 19784
+    },
+    {
+      "epoch": 0.19785,
+      "grad_norm": 1.16221284866333,
+      "learning_rate": 0.003,
+      "loss": 4.0184,
+      "step": 19785
+    },
+    {
+      "epoch": 0.19786,
+      "grad_norm": 0.8535551428794861,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 19786
+    },
+    {
+      "epoch": 0.19787,
+      "grad_norm": 0.6688913702964783,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 19787
+    },
+    {
+      "epoch": 0.19788,
+      "grad_norm": 0.6375434398651123,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 19788
+    },
+    {
+      "epoch": 0.19789,
+      "grad_norm": 0.6849889755249023,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 19789
+    },
+    {
+      "epoch": 0.1979,
+      "grad_norm": 0.8710851669311523,
+      "learning_rate": 0.003,
+      "loss": 3.968,
+      "step": 19790
+    },
+    {
+      "epoch": 0.19791,
+      "grad_norm": 1.0272157192230225,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 19791
+    },
+    {
+      "epoch": 0.19792,
+      "grad_norm": 0.9329808354377747,
+      "learning_rate": 0.003,
+      "loss": 4.0294,
+      "step": 19792
+    },
+    {
+      "epoch": 0.19793,
+      "grad_norm": 0.9324360489845276,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 19793
+    },
+    {
+      "epoch": 0.19794,
+      "grad_norm": 1.0681499242782593,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 19794
+    },
+    {
+      "epoch": 0.19795,
+      "grad_norm": 1.0618821382522583,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 19795
+    },
+    {
+      "epoch": 0.19796,
+      "grad_norm": 0.8624000549316406,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 19796
+    },
+    {
+      "epoch": 0.19797,
+      "grad_norm": 0.7481711506843567,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 19797
+    },
+    {
+      "epoch": 0.19798,
+      "grad_norm": 0.7045367360115051,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 19798
+    },
+    {
+      "epoch": 0.19799,
+      "grad_norm": 0.7832326889038086,
+      "learning_rate": 0.003,
+      "loss": 3.9547,
+      "step": 19799
+    },
+    {
+      "epoch": 0.198,
+      "grad_norm": 0.9828478693962097,
+      "learning_rate": 0.003,
+      "loss": 4.0317,
+      "step": 19800
+    },
+    {
+      "epoch": 0.19801,
+      "grad_norm": 1.1521586179733276,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 19801
+    },
+    {
+      "epoch": 0.19802,
+      "grad_norm": 0.6974064111709595,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 19802
+    },
+    {
+      "epoch": 0.19803,
+      "grad_norm": 0.6272710561752319,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 19803
+    },
+    {
+      "epoch": 0.19804,
+      "grad_norm": 0.7898540496826172,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 19804
+    },
+    {
+      "epoch": 0.19805,
+      "grad_norm": 0.8566932678222656,
+      "learning_rate": 0.003,
+      "loss": 3.9731,
+      "step": 19805
+    },
+    {
+      "epoch": 0.19806,
+      "grad_norm": 0.9286311268806458,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 19806
+    },
+    {
+      "epoch": 0.19807,
+      "grad_norm": 0.8343588709831238,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 19807
+    },
+    {
+      "epoch": 0.19808,
+      "grad_norm": 0.787165641784668,
+      "learning_rate": 0.003,
+      "loss": 4.0472,
+      "step": 19808
+    },
+    {
+      "epoch": 0.19809,
+      "grad_norm": 0.7548251152038574,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 19809
+    },
+    {
+      "epoch": 0.1981,
+      "grad_norm": 0.7300617098808289,
+      "learning_rate": 0.003,
+      "loss": 4.0019,
+      "step": 19810
+    },
+    {
+      "epoch": 0.19811,
+      "grad_norm": 0.7945849299430847,
+      "learning_rate": 0.003,
+      "loss": 3.9765,
+      "step": 19811
+    },
+    {
+      "epoch": 0.19812,
+      "grad_norm": 0.8813292384147644,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 19812
+    },
+    {
+      "epoch": 0.19813,
+      "grad_norm": 0.781726062297821,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 19813
+    },
+    {
+      "epoch": 0.19814,
+      "grad_norm": 0.6853597164154053,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 19814
+    },
+    {
+      "epoch": 0.19815,
+      "grad_norm": 0.7190750241279602,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 19815
+    },
+    {
+      "epoch": 0.19816,
+      "grad_norm": 0.7978911399841309,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 19816
+    },
+    {
+      "epoch": 0.19817,
+      "grad_norm": 0.9732509851455688,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 19817
+    },
+    {
+      "epoch": 0.19818,
+      "grad_norm": 1.2398778200149536,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 19818
+    },
+    {
+      "epoch": 0.19819,
+      "grad_norm": 0.8849130272865295,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 19819
+    },
+    {
+      "epoch": 0.1982,
+      "grad_norm": 0.979040265083313,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 19820
+    },
+    {
+      "epoch": 0.19821,
+      "grad_norm": 1.0033127069473267,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 19821
+    },
+    {
+      "epoch": 0.19822,
+      "grad_norm": 0.8031641840934753,
+      "learning_rate": 0.003,
+      "loss": 3.9765,
+      "step": 19822
+    },
+    {
+      "epoch": 0.19823,
+      "grad_norm": 0.700145959854126,
+      "learning_rate": 0.003,
+      "loss": 3.9662,
+      "step": 19823
+    },
+    {
+      "epoch": 0.19824,
+      "grad_norm": 0.7709698677062988,
+      "learning_rate": 0.003,
+      "loss": 4.0125,
+      "step": 19824
+    },
+    {
+      "epoch": 0.19825,
+      "grad_norm": 0.9196158051490784,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 19825
+    },
+    {
+      "epoch": 0.19826,
+      "grad_norm": 1.0031273365020752,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 19826
+    },
+    {
+      "epoch": 0.19827,
+      "grad_norm": 1.04696524143219,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 19827
+    },
+    {
+      "epoch": 0.19828,
+      "grad_norm": 0.8323779106140137,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 19828
+    },
+    {
+      "epoch": 0.19829,
+      "grad_norm": 0.6003943681716919,
+      "learning_rate": 0.003,
+      "loss": 3.9709,
+      "step": 19829
+    },
+    {
+      "epoch": 0.1983,
+      "grad_norm": 0.587047815322876,
+      "learning_rate": 0.003,
+      "loss": 3.9793,
+      "step": 19830
+    },
+    {
+      "epoch": 0.19831,
+      "grad_norm": 0.5091107487678528,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 19831
+    },
+    {
+      "epoch": 0.19832,
+      "grad_norm": 0.550421953201294,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 19832
+    },
+    {
+      "epoch": 0.19833,
+      "grad_norm": 0.633222758769989,
+      "learning_rate": 0.003,
+      "loss": 3.9783,
+      "step": 19833
+    },
+    {
+      "epoch": 0.19834,
+      "grad_norm": 0.6866979598999023,
+      "learning_rate": 0.003,
+      "loss": 3.9633,
+      "step": 19834
+    },
+    {
+      "epoch": 0.19835,
+      "grad_norm": 0.8001576066017151,
+      "learning_rate": 0.003,
+      "loss": 3.9891,
+      "step": 19835
+    },
+    {
+      "epoch": 0.19836,
+      "grad_norm": 0.9836876392364502,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 19836
+    },
+    {
+      "epoch": 0.19837,
+      "grad_norm": 0.9438751339912415,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 19837
+    },
+    {
+      "epoch": 0.19838,
+      "grad_norm": 0.6752098798751831,
+      "learning_rate": 0.003,
+      "loss": 3.9845,
+      "step": 19838
+    },
+    {
+      "epoch": 0.19839,
+      "grad_norm": 0.7575652003288269,
+      "learning_rate": 0.003,
+      "loss": 3.9718,
+      "step": 19839
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.8691379427909851,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 19840
+    },
+    {
+      "epoch": 0.19841,
+      "grad_norm": 1.0416834354400635,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 19841
+    },
+    {
+      "epoch": 0.19842,
+      "grad_norm": 1.0642367601394653,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 19842
+    },
+    {
+      "epoch": 0.19843,
+      "grad_norm": 1.018958330154419,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 19843
+    },
+    {
+      "epoch": 0.19844,
+      "grad_norm": 0.9024566411972046,
+      "learning_rate": 0.003,
+      "loss": 3.9962,
+      "step": 19844
+    },
+    {
+      "epoch": 0.19845,
+      "grad_norm": 0.8511251211166382,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 19845
+    },
+    {
+      "epoch": 0.19846,
+      "grad_norm": 0.9189441204071045,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 19846
+    },
+    {
+      "epoch": 0.19847,
+      "grad_norm": 1.092054009437561,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 19847
+    },
+    {
+      "epoch": 0.19848,
+      "grad_norm": 0.8711313605308533,
+      "learning_rate": 0.003,
+      "loss": 3.9754,
+      "step": 19848
+    },
+    {
+      "epoch": 0.19849,
+      "grad_norm": 0.8742891550064087,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 19849
+    },
+    {
+      "epoch": 0.1985,
+      "grad_norm": 0.9483180046081543,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 19850
+    },
+    {
+      "epoch": 0.19851,
+      "grad_norm": 1.1380302906036377,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 19851
+    },
+    {
+      "epoch": 0.19852,
+      "grad_norm": 0.8698943257331848,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 19852
+    },
+    {
+      "epoch": 0.19853,
+      "grad_norm": 0.8115482926368713,
+      "learning_rate": 0.003,
+      "loss": 3.9817,
+      "step": 19853
+    },
+    {
+      "epoch": 0.19854,
+      "grad_norm": 0.7584234476089478,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 19854
+    },
+    {
+      "epoch": 0.19855,
+      "grad_norm": 0.8206432461738586,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 19855
+    },
+    {
+      "epoch": 0.19856,
+      "grad_norm": 0.8196293115615845,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 19856
+    },
+    {
+      "epoch": 0.19857,
+      "grad_norm": 0.8289163708686829,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 19857
+    },
+    {
+      "epoch": 0.19858,
+      "grad_norm": 0.8808925747871399,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 19858
+    },
+    {
+      "epoch": 0.19859,
+      "grad_norm": 1.131120204925537,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 19859
+    },
+    {
+      "epoch": 0.1986,
+      "grad_norm": 1.0879793167114258,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 19860
+    },
+    {
+      "epoch": 0.19861,
+      "grad_norm": 0.780910074710846,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 19861
+    },
+    {
+      "epoch": 0.19862,
+      "grad_norm": 0.6187844276428223,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 19862
+    },
+    {
+      "epoch": 0.19863,
+      "grad_norm": 0.665023922920227,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 19863
+    },
+    {
+      "epoch": 0.19864,
+      "grad_norm": 0.9973242878913879,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 19864
+    },
+    {
+      "epoch": 0.19865,
+      "grad_norm": 1.1035375595092773,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 19865
+    },
+    {
+      "epoch": 0.19866,
+      "grad_norm": 0.7924714684486389,
+      "learning_rate": 0.003,
+      "loss": 3.9583,
+      "step": 19866
+    },
+    {
+      "epoch": 0.19867,
+      "grad_norm": 0.7919667363166809,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 19867
+    },
+    {
+      "epoch": 0.19868,
+      "grad_norm": 0.8689961433410645,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 19868
+    },
+    {
+      "epoch": 0.19869,
+      "grad_norm": 0.7589074373245239,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 19869
+    },
+    {
+      "epoch": 0.1987,
+      "grad_norm": 0.6872889995574951,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 19870
+    },
+    {
+      "epoch": 0.19871,
+      "grad_norm": 0.7166623473167419,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 19871
+    },
+    {
+      "epoch": 0.19872,
+      "grad_norm": 0.758948564529419,
+      "learning_rate": 0.003,
+      "loss": 3.9817,
+      "step": 19872
+    },
+    {
+      "epoch": 0.19873,
+      "grad_norm": 0.8943865299224854,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 19873
+    },
+    {
+      "epoch": 0.19874,
+      "grad_norm": 1.0847712755203247,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19874
+    },
+    {
+      "epoch": 0.19875,
+      "grad_norm": 1.0528215169906616,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 19875
+    },
+    {
+      "epoch": 0.19876,
+      "grad_norm": 0.9561887979507446,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 19876
+    },
+    {
+      "epoch": 0.19877,
+      "grad_norm": 0.9552778005599976,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 19877
+    },
+    {
+      "epoch": 0.19878,
+      "grad_norm": 0.7830400466918945,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 19878
+    },
+    {
+      "epoch": 0.19879,
+      "grad_norm": 0.7615962028503418,
+      "learning_rate": 0.003,
+      "loss": 3.9609,
+      "step": 19879
+    },
+    {
+      "epoch": 0.1988,
+      "grad_norm": 0.7598347067832947,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 19880
+    },
+    {
+      "epoch": 0.19881,
+      "grad_norm": 0.7968227863311768,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 19881
+    },
+    {
+      "epoch": 0.19882,
+      "grad_norm": 0.7083049416542053,
+      "learning_rate": 0.003,
+      "loss": 4.0198,
+      "step": 19882
+    },
+    {
+      "epoch": 0.19883,
+      "grad_norm": 0.7206639051437378,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 19883
+    },
+    {
+      "epoch": 0.19884,
+      "grad_norm": 0.678696870803833,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 19884
+    },
+    {
+      "epoch": 0.19885,
+      "grad_norm": 0.7773205637931824,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 19885
+    },
+    {
+      "epoch": 0.19886,
+      "grad_norm": 0.8615788221359253,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 19886
+    },
+    {
+      "epoch": 0.19887,
+      "grad_norm": 0.9369819760322571,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 19887
+    },
+    {
+      "epoch": 0.19888,
+      "grad_norm": 1.0702393054962158,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 19888
+    },
+    {
+      "epoch": 0.19889,
+      "grad_norm": 1.0090556144714355,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 19889
+    },
+    {
+      "epoch": 0.1989,
+      "grad_norm": 0.9635480046272278,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 19890
+    },
+    {
+      "epoch": 0.19891,
+      "grad_norm": 0.9786340594291687,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 19891
+    },
+    {
+      "epoch": 0.19892,
+      "grad_norm": 1.0961414575576782,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 19892
+    },
+    {
+      "epoch": 0.19893,
+      "grad_norm": 0.8336204290390015,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 19893
+    },
+    {
+      "epoch": 0.19894,
+      "grad_norm": 0.8045467138290405,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 19894
+    },
+    {
+      "epoch": 0.19895,
+      "grad_norm": 0.7639846205711365,
+      "learning_rate": 0.003,
+      "loss": 4.0255,
+      "step": 19895
+    },
+    {
+      "epoch": 0.19896,
+      "grad_norm": 0.711005449295044,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 19896
+    },
+    {
+      "epoch": 0.19897,
+      "grad_norm": 0.7749508619308472,
+      "learning_rate": 0.003,
+      "loss": 3.9761,
+      "step": 19897
+    },
+    {
+      "epoch": 0.19898,
+      "grad_norm": 0.8815631866455078,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 19898
+    },
+    {
+      "epoch": 0.19899,
+      "grad_norm": 1.0683717727661133,
+      "learning_rate": 0.003,
+      "loss": 3.9769,
+      "step": 19899
+    },
+    {
+      "epoch": 0.199,
+      "grad_norm": 1.163893461227417,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 19900
+    },
+    {
+      "epoch": 0.19901,
+      "grad_norm": 0.9210971593856812,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 19901
+    },
+    {
+      "epoch": 0.19902,
+      "grad_norm": 0.881843090057373,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 19902
+    },
+    {
+      "epoch": 0.19903,
+      "grad_norm": 0.7368955016136169,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 19903
+    },
+    {
+      "epoch": 0.19904,
+      "grad_norm": 0.710391640663147,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 19904
+    },
+    {
+      "epoch": 0.19905,
+      "grad_norm": 0.7400331497192383,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 19905
+    },
+    {
+      "epoch": 0.19906,
+      "grad_norm": 0.7363407015800476,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 19906
+    },
+    {
+      "epoch": 0.19907,
+      "grad_norm": 0.7485768795013428,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 19907
+    },
+    {
+      "epoch": 0.19908,
+      "grad_norm": 0.8457195162773132,
+      "learning_rate": 0.003,
+      "loss": 3.9628,
+      "step": 19908
+    },
+    {
+      "epoch": 0.19909,
+      "grad_norm": 0.8688400387763977,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 19909
+    },
+    {
+      "epoch": 0.1991,
+      "grad_norm": 0.8740235567092896,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 19910
+    },
+    {
+      "epoch": 0.19911,
+      "grad_norm": 0.7607549428939819,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 19911
+    },
+    {
+      "epoch": 0.19912,
+      "grad_norm": 0.8285241723060608,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 19912
+    },
+    {
+      "epoch": 0.19913,
+      "grad_norm": 0.8437578678131104,
+      "learning_rate": 0.003,
+      "loss": 4.0132,
+      "step": 19913
+    },
+    {
+      "epoch": 0.19914,
+      "grad_norm": 1.0094083547592163,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 19914
+    },
+    {
+      "epoch": 0.19915,
+      "grad_norm": 1.1903234720230103,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 19915
+    },
+    {
+      "epoch": 0.19916,
+      "grad_norm": 0.8025692701339722,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 19916
+    },
+    {
+      "epoch": 0.19917,
+      "grad_norm": 0.6294260621070862,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 19917
+    },
+    {
+      "epoch": 0.19918,
+      "grad_norm": 0.7292620539665222,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 19918
+    },
+    {
+      "epoch": 0.19919,
+      "grad_norm": 0.7832669615745544,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 19919
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.7429418563842773,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 19920
+    },
+    {
+      "epoch": 0.19921,
+      "grad_norm": 0.7528523206710815,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 19921
+    },
+    {
+      "epoch": 0.19922,
+      "grad_norm": 0.9220634698867798,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 19922
+    },
+    {
+      "epoch": 0.19923,
+      "grad_norm": 1.0650054216384888,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 19923
+    },
+    {
+      "epoch": 0.19924,
+      "grad_norm": 0.9716998934745789,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 19924
+    },
+    {
+      "epoch": 0.19925,
+      "grad_norm": 1.0343821048736572,
+      "learning_rate": 0.003,
+      "loss": 4.0287,
+      "step": 19925
+    },
+    {
+      "epoch": 0.19926,
+      "grad_norm": 1.0323013067245483,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 19926
+    },
+    {
+      "epoch": 0.19927,
+      "grad_norm": 1.1438926458358765,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 19927
+    },
+    {
+      "epoch": 0.19928,
+      "grad_norm": 0.9947730898857117,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 19928
+    },
+    {
+      "epoch": 0.19929,
+      "grad_norm": 0.9139689803123474,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 19929
+    },
+    {
+      "epoch": 0.1993,
+      "grad_norm": 0.7781880497932434,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 19930
+    },
+    {
+      "epoch": 0.19931,
+      "grad_norm": 0.7079624533653259,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 19931
+    },
+    {
+      "epoch": 0.19932,
+      "grad_norm": 0.6574318408966064,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 19932
+    },
+    {
+      "epoch": 0.19933,
+      "grad_norm": 0.5009096264839172,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 19933
+    },
+    {
+      "epoch": 0.19934,
+      "grad_norm": 0.570099949836731,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 19934
+    },
+    {
+      "epoch": 0.19935,
+      "grad_norm": 0.740693211555481,
+      "learning_rate": 0.003,
+      "loss": 3.9825,
+      "step": 19935
+    },
+    {
+      "epoch": 0.19936,
+      "grad_norm": 0.9218637943267822,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 19936
+    },
+    {
+      "epoch": 0.19937,
+      "grad_norm": 1.1842974424362183,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 19937
+    },
+    {
+      "epoch": 0.19938,
+      "grad_norm": 0.8443614840507507,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 19938
+    },
+    {
+      "epoch": 0.19939,
+      "grad_norm": 0.6035322546958923,
+      "learning_rate": 0.003,
+      "loss": 3.9886,
+      "step": 19939
+    },
+    {
+      "epoch": 0.1994,
+      "grad_norm": 0.6982221007347107,
+      "learning_rate": 0.003,
+      "loss": 3.9707,
+      "step": 19940
+    },
+    {
+      "epoch": 0.19941,
+      "grad_norm": 0.9117821455001831,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 19941
+    },
+    {
+      "epoch": 0.19942,
+      "grad_norm": 1.0192880630493164,
+      "learning_rate": 0.003,
+      "loss": 3.9796,
+      "step": 19942
+    },
+    {
+      "epoch": 0.19943,
+      "grad_norm": 1.0437871217727661,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 19943
+    },
+    {
+      "epoch": 0.19944,
+      "grad_norm": 0.7494208216667175,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 19944
+    },
+    {
+      "epoch": 0.19945,
+      "grad_norm": 0.6562110781669617,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 19945
+    },
+    {
+      "epoch": 0.19946,
+      "grad_norm": 0.9326704144477844,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 19946
+    },
+    {
+      "epoch": 0.19947,
+      "grad_norm": 0.9937777519226074,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 19947
+    },
+    {
+      "epoch": 0.19948,
+      "grad_norm": 1.0566006898880005,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 19948
+    },
+    {
+      "epoch": 0.19949,
+      "grad_norm": 0.9444541931152344,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 19949
+    },
+    {
+      "epoch": 0.1995,
+      "grad_norm": 0.7949107885360718,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 19950
+    },
+    {
+      "epoch": 0.19951,
+      "grad_norm": 0.8902096748352051,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 19951
+    },
+    {
+      "epoch": 0.19952,
+      "grad_norm": 0.859947919845581,
+      "learning_rate": 0.003,
+      "loss": 4.0295,
+      "step": 19952
+    },
+    {
+      "epoch": 0.19953,
+      "grad_norm": 0.6775383353233337,
+      "learning_rate": 0.003,
+      "loss": 3.9828,
+      "step": 19953
+    },
+    {
+      "epoch": 0.19954,
+      "grad_norm": 0.6701100468635559,
+      "learning_rate": 0.003,
+      "loss": 3.9791,
+      "step": 19954
+    },
+    {
+      "epoch": 0.19955,
+      "grad_norm": 0.6915163993835449,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 19955
+    },
+    {
+      "epoch": 0.19956,
+      "grad_norm": 0.7247843742370605,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 19956
+    },
+    {
+      "epoch": 0.19957,
+      "grad_norm": 0.7546032667160034,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 19957
+    },
+    {
+      "epoch": 0.19958,
+      "grad_norm": 0.8902223706245422,
+      "learning_rate": 0.003,
+      "loss": 4.0195,
+      "step": 19958
+    },
+    {
+      "epoch": 0.19959,
+      "grad_norm": 1.1033283472061157,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 19959
+    },
+    {
+      "epoch": 0.1996,
+      "grad_norm": 1.0015894174575806,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 19960
+    },
+    {
+      "epoch": 0.19961,
+      "grad_norm": 0.8465616106987,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 19961
+    },
+    {
+      "epoch": 0.19962,
+      "grad_norm": 0.771872878074646,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 19962
+    },
+    {
+      "epoch": 0.19963,
+      "grad_norm": 0.8222938776016235,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 19963
+    },
+    {
+      "epoch": 0.19964,
+      "grad_norm": 0.7800220251083374,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 19964
+    },
+    {
+      "epoch": 0.19965,
+      "grad_norm": 0.8346208333969116,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 19965
+    },
+    {
+      "epoch": 0.19966,
+      "grad_norm": 0.8404595851898193,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 19966
+    },
+    {
+      "epoch": 0.19967,
+      "grad_norm": 0.7356424927711487,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 19967
+    },
+    {
+      "epoch": 0.19968,
+      "grad_norm": 0.7217704653739929,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 19968
+    },
+    {
+      "epoch": 0.19969,
+      "grad_norm": 0.7107825875282288,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 19969
+    },
+    {
+      "epoch": 0.1997,
+      "grad_norm": 0.773546040058136,
+      "learning_rate": 0.003,
+      "loss": 3.9889,
+      "step": 19970
+    },
+    {
+      "epoch": 0.19971,
+      "grad_norm": 0.8497675061225891,
+      "learning_rate": 0.003,
+      "loss": 3.9671,
+      "step": 19971
+    },
+    {
+      "epoch": 0.19972,
+      "grad_norm": 0.9392867088317871,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 19972
+    },
+    {
+      "epoch": 0.19973,
+      "grad_norm": 0.9622828364372253,
+      "learning_rate": 0.003,
+      "loss": 4.0202,
+      "step": 19973
+    },
+    {
+      "epoch": 0.19974,
+      "grad_norm": 1.161346435546875,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 19974
+    },
+    {
+      "epoch": 0.19975,
+      "grad_norm": 0.9697285294532776,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 19975
+    },
+    {
+      "epoch": 0.19976,
+      "grad_norm": 0.9470873475074768,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 19976
+    },
+    {
+      "epoch": 0.19977,
+      "grad_norm": 0.9103334546089172,
+      "learning_rate": 0.003,
+      "loss": 3.968,
+      "step": 19977
+    },
+    {
+      "epoch": 0.19978,
+      "grad_norm": 1.0092731714248657,
+      "learning_rate": 0.003,
+      "loss": 4.0352,
+      "step": 19978
+    },
+    {
+      "epoch": 0.19979,
+      "grad_norm": 1.1104042530059814,
+      "learning_rate": 0.003,
+      "loss": 4.018,
+      "step": 19979
+    },
+    {
+      "epoch": 0.1998,
+      "grad_norm": 0.9779863357543945,
+      "learning_rate": 0.003,
+      "loss": 4.0032,
+      "step": 19980
+    },
+    {
+      "epoch": 0.19981,
+      "grad_norm": 1.0093814134597778,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 19981
+    },
+    {
+      "epoch": 0.19982,
+      "grad_norm": 1.0757005214691162,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 19982
+    },
+    {
+      "epoch": 0.19983,
+      "grad_norm": 0.9751392006874084,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 19983
+    },
+    {
+      "epoch": 0.19984,
+      "grad_norm": 0.9791036248207092,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 19984
+    },
+    {
+      "epoch": 0.19985,
+      "grad_norm": 0.9474849700927734,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 19985
+    },
+    {
+      "epoch": 0.19986,
+      "grad_norm": 1.0115426778793335,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 19986
+    },
+    {
+      "epoch": 0.19987,
+      "grad_norm": 0.925239086151123,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 19987
+    },
+    {
+      "epoch": 0.19988,
+      "grad_norm": 0.8519060611724854,
+      "learning_rate": 0.003,
+      "loss": 3.9612,
+      "step": 19988
+    },
+    {
+      "epoch": 0.19989,
+      "grad_norm": 0.986173689365387,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 19989
+    },
+    {
+      "epoch": 0.1999,
+      "grad_norm": 1.378746509552002,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 19990
+    },
+    {
+      "epoch": 0.19991,
+      "grad_norm": 0.5946156978607178,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 19991
+    },
+    {
+      "epoch": 0.19992,
+      "grad_norm": 0.6783791780471802,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 19992
+    },
+    {
+      "epoch": 0.19993,
+      "grad_norm": 0.8321949243545532,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 19993
+    },
+    {
+      "epoch": 0.19994,
+      "grad_norm": 0.7344717383384705,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 19994
+    },
+    {
+      "epoch": 0.19995,
+      "grad_norm": 0.7066532969474792,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 19995
+    },
+    {
+      "epoch": 0.19996,
+      "grad_norm": 0.7640671133995056,
+      "learning_rate": 0.003,
+      "loss": 3.9742,
+      "step": 19996
+    },
+    {
+      "epoch": 0.19997,
+      "grad_norm": 0.7876488566398621,
+      "learning_rate": 0.003,
+      "loss": 3.9798,
+      "step": 19997
+    },
+    {
+      "epoch": 0.19998,
+      "grad_norm": 0.7732638716697693,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 19998
+    },
+    {
+      "epoch": 0.19999,
+      "grad_norm": 0.9254476428031921,
+      "learning_rate": 0.003,
+      "loss": 3.9811,
+      "step": 19999
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.0530250072479248,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 20000
+    },
+    {
+      "epoch": 0.20001,
+      "grad_norm": 0.9983343482017517,
+      "learning_rate": 0.003,
+      "loss": 3.9752,
+      "step": 20001
+    },
+    {
+      "epoch": 0.20002,
+      "grad_norm": 0.9355091452598572,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20002
+    },
+    {
+      "epoch": 0.20003,
+      "grad_norm": 0.7862496972084045,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 20003
+    },
+    {
+      "epoch": 0.20004,
+      "grad_norm": 0.7292976379394531,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 20004
+    },
+    {
+      "epoch": 0.20005,
+      "grad_norm": 0.6682198643684387,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 20005
+    },
+    {
+      "epoch": 0.20006,
+      "grad_norm": 0.5940819978713989,
+      "learning_rate": 0.003,
+      "loss": 3.9578,
+      "step": 20006
+    },
+    {
+      "epoch": 0.20007,
+      "grad_norm": 0.65861576795578,
+      "learning_rate": 0.003,
+      "loss": 3.9673,
+      "step": 20007
+    },
+    {
+      "epoch": 0.20008,
+      "grad_norm": 0.7443798184394836,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 20008
+    },
+    {
+      "epoch": 0.20009,
+      "grad_norm": 0.8729384541511536,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 20009
+    },
+    {
+      "epoch": 0.2001,
+      "grad_norm": 0.9064518809318542,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 20010
+    },
+    {
+      "epoch": 0.20011,
+      "grad_norm": 0.7785030007362366,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 20011
+    },
+    {
+      "epoch": 0.20012,
+      "grad_norm": 0.6019753217697144,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 20012
+    },
+    {
+      "epoch": 0.20013,
+      "grad_norm": 0.6211569309234619,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 20013
+    },
+    {
+      "epoch": 0.20014,
+      "grad_norm": 0.7714516520500183,
+      "learning_rate": 0.003,
+      "loss": 3.997,
+      "step": 20014
+    },
+    {
+      "epoch": 0.20015,
+      "grad_norm": 0.9249976873397827,
+      "learning_rate": 0.003,
+      "loss": 3.9868,
+      "step": 20015
+    },
+    {
+      "epoch": 0.20016,
+      "grad_norm": 0.8645272850990295,
+      "learning_rate": 0.003,
+      "loss": 4.0054,
+      "step": 20016
+    },
+    {
+      "epoch": 0.20017,
+      "grad_norm": 0.7886145114898682,
+      "learning_rate": 0.003,
+      "loss": 3.9641,
+      "step": 20017
+    },
+    {
+      "epoch": 0.20018,
+      "grad_norm": 0.7523903250694275,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 20018
+    },
+    {
+      "epoch": 0.20019,
+      "grad_norm": 0.7617239356040955,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 20019
+    },
+    {
+      "epoch": 0.2002,
+      "grad_norm": 0.762170135974884,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 20020
+    },
+    {
+      "epoch": 0.20021,
+      "grad_norm": 0.7641904354095459,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 20021
+    },
+    {
+      "epoch": 0.20022,
+      "grad_norm": 0.7135266661643982,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 20022
+    },
+    {
+      "epoch": 0.20023,
+      "grad_norm": 0.6172149777412415,
+      "learning_rate": 0.003,
+      "loss": 3.9896,
+      "step": 20023
+    },
+    {
+      "epoch": 0.20024,
+      "grad_norm": 0.620384693145752,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 20024
+    },
+    {
+      "epoch": 0.20025,
+      "grad_norm": 0.68131422996521,
+      "learning_rate": 0.003,
+      "loss": 3.9688,
+      "step": 20025
+    },
+    {
+      "epoch": 0.20026,
+      "grad_norm": 0.8223885297775269,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 20026
+    },
+    {
+      "epoch": 0.20027,
+      "grad_norm": 0.9747222661972046,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 20027
+    },
+    {
+      "epoch": 0.20028,
+      "grad_norm": 1.0941600799560547,
+      "learning_rate": 0.003,
+      "loss": 3.9816,
+      "step": 20028
+    },
+    {
+      "epoch": 0.20029,
+      "grad_norm": 0.9600298404693604,
+      "learning_rate": 0.003,
+      "loss": 3.9748,
+      "step": 20029
+    },
+    {
+      "epoch": 0.2003,
+      "grad_norm": 0.9398216009140015,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 20030
+    },
+    {
+      "epoch": 0.20031,
+      "grad_norm": 0.8767874240875244,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 20031
+    },
+    {
+      "epoch": 0.20032,
+      "grad_norm": 0.8055996894836426,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 20032
+    },
+    {
+      "epoch": 0.20033,
+      "grad_norm": 0.9127088189125061,
+      "learning_rate": 0.003,
+      "loss": 4.0003,
+      "step": 20033
+    },
+    {
+      "epoch": 0.20034,
+      "grad_norm": 1.1113648414611816,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 20034
+    },
+    {
+      "epoch": 0.20035,
+      "grad_norm": 1.2661253213882446,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 20035
+    },
+    {
+      "epoch": 0.20036,
+      "grad_norm": 0.8963114619255066,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 20036
+    },
+    {
+      "epoch": 0.20037,
+      "grad_norm": 0.907471776008606,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 20037
+    },
+    {
+      "epoch": 0.20038,
+      "grad_norm": 0.8981186151504517,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 20038
+    },
+    {
+      "epoch": 0.20039,
+      "grad_norm": 0.9879725575447083,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 20039
+    },
+    {
+      "epoch": 0.2004,
+      "grad_norm": 1.06181800365448,
+      "learning_rate": 0.003,
+      "loss": 4.0318,
+      "step": 20040
+    },
+    {
+      "epoch": 0.20041,
+      "grad_norm": 0.9043117761611938,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 20041
+    },
+    {
+      "epoch": 0.20042,
+      "grad_norm": 0.9589005708694458,
+      "learning_rate": 0.003,
+      "loss": 4.0524,
+      "step": 20042
+    },
+    {
+      "epoch": 0.20043,
+      "grad_norm": 1.015989065170288,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 20043
+    },
+    {
+      "epoch": 0.20044,
+      "grad_norm": 1.1127653121948242,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 20044
+    },
+    {
+      "epoch": 0.20045,
+      "grad_norm": 0.867679238319397,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 20045
+    },
+    {
+      "epoch": 0.20046,
+      "grad_norm": 0.6572129726409912,
+      "learning_rate": 0.003,
+      "loss": 3.9799,
+      "step": 20046
+    },
+    {
+      "epoch": 0.20047,
+      "grad_norm": 0.7014610767364502,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 20047
+    },
+    {
+      "epoch": 0.20048,
+      "grad_norm": 0.5642576217651367,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 20048
+    },
+    {
+      "epoch": 0.20049,
+      "grad_norm": 0.5769412517547607,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 20049
+    },
+    {
+      "epoch": 0.2005,
+      "grad_norm": 0.5566190481185913,
+      "learning_rate": 0.003,
+      "loss": 3.9874,
+      "step": 20050
+    },
+    {
+      "epoch": 0.20051,
+      "grad_norm": 0.538754940032959,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 20051
+    },
+    {
+      "epoch": 0.20052,
+      "grad_norm": 0.49352458119392395,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 20052
+    },
+    {
+      "epoch": 0.20053,
+      "grad_norm": 0.5705218315124512,
+      "learning_rate": 0.003,
+      "loss": 3.982,
+      "step": 20053
+    },
+    {
+      "epoch": 0.20054,
+      "grad_norm": 0.7290521860122681,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 20054
+    },
+    {
+      "epoch": 0.20055,
+      "grad_norm": 0.956404447555542,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 20055
+    },
+    {
+      "epoch": 0.20056,
+      "grad_norm": 1.1518845558166504,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 20056
+    },
+    {
+      "epoch": 0.20057,
+      "grad_norm": 0.8350433707237244,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 20057
+    },
+    {
+      "epoch": 0.20058,
+      "grad_norm": 0.6605261564254761,
+      "learning_rate": 0.003,
+      "loss": 3.9983,
+      "step": 20058
+    },
+    {
+      "epoch": 0.20059,
+      "grad_norm": 0.5880551934242249,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 20059
+    },
+    {
+      "epoch": 0.2006,
+      "grad_norm": 0.5347115993499756,
+      "learning_rate": 0.003,
+      "loss": 3.9627,
+      "step": 20060
+    },
+    {
+      "epoch": 0.20061,
+      "grad_norm": 0.5903977751731873,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 20061
+    },
+    {
+      "epoch": 0.20062,
+      "grad_norm": 0.6846911311149597,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 20062
+    },
+    {
+      "epoch": 0.20063,
+      "grad_norm": 0.7614838480949402,
+      "learning_rate": 0.003,
+      "loss": 4.0269,
+      "step": 20063
+    },
+    {
+      "epoch": 0.20064,
+      "grad_norm": 0.9693281650543213,
+      "learning_rate": 0.003,
+      "loss": 4.0302,
+      "step": 20064
+    },
+    {
+      "epoch": 0.20065,
+      "grad_norm": 1.1056090593338013,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 20065
+    },
+    {
+      "epoch": 0.20066,
+      "grad_norm": 0.9413695931434631,
+      "learning_rate": 0.003,
+      "loss": 3.9688,
+      "step": 20066
+    },
+    {
+      "epoch": 0.20067,
+      "grad_norm": 1.2127419710159302,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 20067
+    },
+    {
+      "epoch": 0.20068,
+      "grad_norm": 0.9101815223693848,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 20068
+    },
+    {
+      "epoch": 0.20069,
+      "grad_norm": 0.911109447479248,
+      "learning_rate": 0.003,
+      "loss": 4.02,
+      "step": 20069
+    },
+    {
+      "epoch": 0.2007,
+      "grad_norm": 0.8823415637016296,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 20070
+    },
+    {
+      "epoch": 0.20071,
+      "grad_norm": 0.761504590511322,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 20071
+    },
+    {
+      "epoch": 0.20072,
+      "grad_norm": 0.6968681812286377,
+      "learning_rate": 0.003,
+      "loss": 3.9554,
+      "step": 20072
+    },
+    {
+      "epoch": 0.20073,
+      "grad_norm": 0.7271966338157654,
+      "learning_rate": 0.003,
+      "loss": 3.9618,
+      "step": 20073
+    },
+    {
+      "epoch": 0.20074,
+      "grad_norm": 0.8523584008216858,
+      "learning_rate": 0.003,
+      "loss": 3.9894,
+      "step": 20074
+    },
+    {
+      "epoch": 0.20075,
+      "grad_norm": 1.0339120626449585,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20075
+    },
+    {
+      "epoch": 0.20076,
+      "grad_norm": 1.1442428827285767,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 20076
+    },
+    {
+      "epoch": 0.20077,
+      "grad_norm": 0.8703557848930359,
+      "learning_rate": 0.003,
+      "loss": 3.9687,
+      "step": 20077
+    },
+    {
+      "epoch": 0.20078,
+      "grad_norm": 0.8325510621070862,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 20078
+    },
+    {
+      "epoch": 0.20079,
+      "grad_norm": 0.9348880052566528,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 20079
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 1.0314216613769531,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 20080
+    },
+    {
+      "epoch": 0.20081,
+      "grad_norm": 1.0552459955215454,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 20081
+    },
+    {
+      "epoch": 0.20082,
+      "grad_norm": 0.9345569014549255,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 20082
+    },
+    {
+      "epoch": 0.20083,
+      "grad_norm": 0.8296226859092712,
+      "learning_rate": 0.003,
+      "loss": 4.0088,
+      "step": 20083
+    },
+    {
+      "epoch": 0.20084,
+      "grad_norm": 0.7488117218017578,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 20084
+    },
+    {
+      "epoch": 0.20085,
+      "grad_norm": 0.9322945475578308,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 20085
+    },
+    {
+      "epoch": 0.20086,
+      "grad_norm": 1.038449764251709,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 20086
+    },
+    {
+      "epoch": 0.20087,
+      "grad_norm": 1.1149946451187134,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 20087
+    },
+    {
+      "epoch": 0.20088,
+      "grad_norm": 0.9755486249923706,
+      "learning_rate": 0.003,
+      "loss": 3.9843,
+      "step": 20088
+    },
+    {
+      "epoch": 0.20089,
+      "grad_norm": 0.8299311995506287,
+      "learning_rate": 0.003,
+      "loss": 4.0038,
+      "step": 20089
+    },
+    {
+      "epoch": 0.2009,
+      "grad_norm": 0.7166337370872498,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 20090
+    },
+    {
+      "epoch": 0.20091,
+      "grad_norm": 0.6959244012832642,
+      "learning_rate": 0.003,
+      "loss": 4.0067,
+      "step": 20091
+    },
+    {
+      "epoch": 0.20092,
+      "grad_norm": 0.7742272615432739,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 20092
+    },
+    {
+      "epoch": 0.20093,
+      "grad_norm": 0.8766480684280396,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 20093
+    },
+    {
+      "epoch": 0.20094,
+      "grad_norm": 0.7357550859451294,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 20094
+    },
+    {
+      "epoch": 0.20095,
+      "grad_norm": 0.962944507598877,
+      "learning_rate": 0.003,
+      "loss": 4.0292,
+      "step": 20095
+    },
+    {
+      "epoch": 0.20096,
+      "grad_norm": 1.0833114385604858,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 20096
+    },
+    {
+      "epoch": 0.20097,
+      "grad_norm": 1.0890945196151733,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 20097
+    },
+    {
+      "epoch": 0.20098,
+      "grad_norm": 0.8043980598449707,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 20098
+    },
+    {
+      "epoch": 0.20099,
+      "grad_norm": 0.6822202801704407,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 20099
+    },
+    {
+      "epoch": 0.201,
+      "grad_norm": 0.7577584385871887,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 20100
+    },
+    {
+      "epoch": 0.20101,
+      "grad_norm": 0.8958604335784912,
+      "learning_rate": 0.003,
+      "loss": 3.9805,
+      "step": 20101
+    },
+    {
+      "epoch": 0.20102,
+      "grad_norm": 0.9075361490249634,
+      "learning_rate": 0.003,
+      "loss": 4.0251,
+      "step": 20102
+    },
+    {
+      "epoch": 0.20103,
+      "grad_norm": 0.8133460879325867,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 20103
+    },
+    {
+      "epoch": 0.20104,
+      "grad_norm": 0.8210949897766113,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 20104
+    },
+    {
+      "epoch": 0.20105,
+      "grad_norm": 0.8700422048568726,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 20105
+    },
+    {
+      "epoch": 0.20106,
+      "grad_norm": 0.9867281317710876,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 20106
+    },
+    {
+      "epoch": 0.20107,
+      "grad_norm": 1.0752992630004883,
+      "learning_rate": 0.003,
+      "loss": 4.0082,
+      "step": 20107
+    },
+    {
+      "epoch": 0.20108,
+      "grad_norm": 0.8385269045829773,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 20108
+    },
+    {
+      "epoch": 0.20109,
+      "grad_norm": 0.689854621887207,
+      "learning_rate": 0.003,
+      "loss": 4.0109,
+      "step": 20109
+    },
+    {
+      "epoch": 0.2011,
+      "grad_norm": 0.7017920017242432,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 20110
+    },
+    {
+      "epoch": 0.20111,
+      "grad_norm": 0.7412782311439514,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 20111
+    },
+    {
+      "epoch": 0.20112,
+      "grad_norm": 0.703317403793335,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 20112
+    },
+    {
+      "epoch": 0.20113,
+      "grad_norm": 0.655783474445343,
+      "learning_rate": 0.003,
+      "loss": 4.0085,
+      "step": 20113
+    },
+    {
+      "epoch": 0.20114,
+      "grad_norm": 0.6221177577972412,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 20114
+    },
+    {
+      "epoch": 0.20115,
+      "grad_norm": 0.6611085534095764,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 20115
+    },
+    {
+      "epoch": 0.20116,
+      "grad_norm": 0.7219901084899902,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 20116
+    },
+    {
+      "epoch": 0.20117,
+      "grad_norm": 1.033886432647705,
+      "learning_rate": 0.003,
+      "loss": 3.9872,
+      "step": 20117
+    },
+    {
+      "epoch": 0.20118,
+      "grad_norm": 1.2467701435089111,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 20118
+    },
+    {
+      "epoch": 0.20119,
+      "grad_norm": 0.7234686613082886,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 20119
+    },
+    {
+      "epoch": 0.2012,
+      "grad_norm": 0.6081365942955017,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 20120
+    },
+    {
+      "epoch": 0.20121,
+      "grad_norm": 0.6261841058731079,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 20121
+    },
+    {
+      "epoch": 0.20122,
+      "grad_norm": 0.7835150361061096,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 20122
+    },
+    {
+      "epoch": 0.20123,
+      "grad_norm": 0.8453176021575928,
+      "learning_rate": 0.003,
+      "loss": 4.0179,
+      "step": 20123
+    },
+    {
+      "epoch": 0.20124,
+      "grad_norm": 0.9139364957809448,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 20124
+    },
+    {
+      "epoch": 0.20125,
+      "grad_norm": 1.1292014122009277,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 20125
+    },
+    {
+      "epoch": 0.20126,
+      "grad_norm": 1.171099066734314,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 20126
+    },
+    {
+      "epoch": 0.20127,
+      "grad_norm": 0.7680856585502625,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 20127
+    },
+    {
+      "epoch": 0.20128,
+      "grad_norm": 0.6378629803657532,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 20128
+    },
+    {
+      "epoch": 0.20129,
+      "grad_norm": 0.7174150943756104,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 20129
+    },
+    {
+      "epoch": 0.2013,
+      "grad_norm": 0.91487056016922,
+      "learning_rate": 0.003,
+      "loss": 4.0246,
+      "step": 20130
+    },
+    {
+      "epoch": 0.20131,
+      "grad_norm": 1.1376959085464478,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 20131
+    },
+    {
+      "epoch": 0.20132,
+      "grad_norm": 0.8521286845207214,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 20132
+    },
+    {
+      "epoch": 0.20133,
+      "grad_norm": 0.7289743423461914,
+      "learning_rate": 0.003,
+      "loss": 3.9973,
+      "step": 20133
+    },
+    {
+      "epoch": 0.20134,
+      "grad_norm": 0.7509118318557739,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 20134
+    },
+    {
+      "epoch": 0.20135,
+      "grad_norm": 1.0695552825927734,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 20135
+    },
+    {
+      "epoch": 0.20136,
+      "grad_norm": 1.0982223749160767,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 20136
+    },
+    {
+      "epoch": 0.20137,
+      "grad_norm": 0.9279093146324158,
+      "learning_rate": 0.003,
+      "loss": 3.9738,
+      "step": 20137
+    },
+    {
+      "epoch": 0.20138,
+      "grad_norm": 0.8908368349075317,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 20138
+    },
+    {
+      "epoch": 0.20139,
+      "grad_norm": 0.762957751750946,
+      "learning_rate": 0.003,
+      "loss": 3.995,
+      "step": 20139
+    },
+    {
+      "epoch": 0.2014,
+      "grad_norm": 0.6879292130470276,
+      "learning_rate": 0.003,
+      "loss": 3.9787,
+      "step": 20140
+    },
+    {
+      "epoch": 0.20141,
+      "grad_norm": 0.7439958453178406,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 20141
+    },
+    {
+      "epoch": 0.20142,
+      "grad_norm": 0.8566961884498596,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 20142
+    },
+    {
+      "epoch": 0.20143,
+      "grad_norm": 0.8951508402824402,
+      "learning_rate": 0.003,
+      "loss": 3.9863,
+      "step": 20143
+    },
+    {
+      "epoch": 0.20144,
+      "grad_norm": 1.1969650983810425,
+      "learning_rate": 0.003,
+      "loss": 3.9858,
+      "step": 20144
+    },
+    {
+      "epoch": 0.20145,
+      "grad_norm": 0.8651864528656006,
+      "learning_rate": 0.003,
+      "loss": 4.005,
+      "step": 20145
+    },
+    {
+      "epoch": 0.20146,
+      "grad_norm": 0.7943025827407837,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 20146
+    },
+    {
+      "epoch": 0.20147,
+      "grad_norm": 0.8146642446517944,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 20147
+    },
+    {
+      "epoch": 0.20148,
+      "grad_norm": 1.0197103023529053,
+      "learning_rate": 0.003,
+      "loss": 4.011,
+      "step": 20148
+    },
+    {
+      "epoch": 0.20149,
+      "grad_norm": 1.257275104522705,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 20149
+    },
+    {
+      "epoch": 0.2015,
+      "grad_norm": 0.7094889283180237,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 20150
+    },
+    {
+      "epoch": 0.20151,
+      "grad_norm": 0.660546064376831,
+      "learning_rate": 0.003,
+      "loss": 3.9946,
+      "step": 20151
+    },
+    {
+      "epoch": 0.20152,
+      "grad_norm": 0.6721454858779907,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 20152
+    },
+    {
+      "epoch": 0.20153,
+      "grad_norm": 0.7225807905197144,
+      "learning_rate": 0.003,
+      "loss": 3.9787,
+      "step": 20153
+    },
+    {
+      "epoch": 0.20154,
+      "grad_norm": 0.9250208139419556,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 20154
+    },
+    {
+      "epoch": 0.20155,
+      "grad_norm": 0.9858287572860718,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20155
+    },
+    {
+      "epoch": 0.20156,
+      "grad_norm": 0.8439344763755798,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 20156
+    },
+    {
+      "epoch": 0.20157,
+      "grad_norm": 0.7250839471817017,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 20157
+    },
+    {
+      "epoch": 0.20158,
+      "grad_norm": 0.7875615954399109,
+      "learning_rate": 0.003,
+      "loss": 3.9781,
+      "step": 20158
+    },
+    {
+      "epoch": 0.20159,
+      "grad_norm": 0.9504771828651428,
+      "learning_rate": 0.003,
+      "loss": 3.9682,
+      "step": 20159
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 1.1765533685684204,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 20160
+    },
+    {
+      "epoch": 0.20161,
+      "grad_norm": 1.008900761604309,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 20161
+    },
+    {
+      "epoch": 0.20162,
+      "grad_norm": 0.9553276300430298,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 20162
+    },
+    {
+      "epoch": 0.20163,
+      "grad_norm": 0.8626341819763184,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 20163
+    },
+    {
+      "epoch": 0.20164,
+      "grad_norm": 0.7288162708282471,
+      "learning_rate": 0.003,
+      "loss": 4.0325,
+      "step": 20164
+    },
+    {
+      "epoch": 0.20165,
+      "grad_norm": 0.6690157651901245,
+      "learning_rate": 0.003,
+      "loss": 3.9721,
+      "step": 20165
+    },
+    {
+      "epoch": 0.20166,
+      "grad_norm": 0.6751003265380859,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 20166
+    },
+    {
+      "epoch": 0.20167,
+      "grad_norm": 0.7453999519348145,
+      "learning_rate": 0.003,
+      "loss": 3.9716,
+      "step": 20167
+    },
+    {
+      "epoch": 0.20168,
+      "grad_norm": 0.8011784553527832,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 20168
+    },
+    {
+      "epoch": 0.20169,
+      "grad_norm": 0.901778519153595,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 20169
+    },
+    {
+      "epoch": 0.2017,
+      "grad_norm": 0.9532800316810608,
+      "learning_rate": 0.003,
+      "loss": 3.9994,
+      "step": 20170
+    },
+    {
+      "epoch": 0.20171,
+      "grad_norm": 1.0813367366790771,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 20171
+    },
+    {
+      "epoch": 0.20172,
+      "grad_norm": 0.961879312992096,
+      "learning_rate": 0.003,
+      "loss": 4.0174,
+      "step": 20172
+    },
+    {
+      "epoch": 0.20173,
+      "grad_norm": 0.7712802290916443,
+      "learning_rate": 0.003,
+      "loss": 3.9568,
+      "step": 20173
+    },
+    {
+      "epoch": 0.20174,
+      "grad_norm": 0.7062988877296448,
+      "learning_rate": 0.003,
+      "loss": 3.9745,
+      "step": 20174
+    },
+    {
+      "epoch": 0.20175,
+      "grad_norm": 0.7877758741378784,
+      "learning_rate": 0.003,
+      "loss": 4.0048,
+      "step": 20175
+    },
+    {
+      "epoch": 0.20176,
+      "grad_norm": 0.9011003375053406,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 20176
+    },
+    {
+      "epoch": 0.20177,
+      "grad_norm": 1.0270823240280151,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 20177
+    },
+    {
+      "epoch": 0.20178,
+      "grad_norm": 0.9703652262687683,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 20178
+    },
+    {
+      "epoch": 0.20179,
+      "grad_norm": 0.9204882979393005,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 20179
+    },
+    {
+      "epoch": 0.2018,
+      "grad_norm": 0.9217516779899597,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 20180
+    },
+    {
+      "epoch": 0.20181,
+      "grad_norm": 0.811480700969696,
+      "learning_rate": 0.003,
+      "loss": 3.984,
+      "step": 20181
+    },
+    {
+      "epoch": 0.20182,
+      "grad_norm": 0.953285813331604,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 20182
+    },
+    {
+      "epoch": 0.20183,
+      "grad_norm": 1.112351894378662,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 20183
+    },
+    {
+      "epoch": 0.20184,
+      "grad_norm": 1.1086965799331665,
+      "learning_rate": 0.003,
+      "loss": 4.0461,
+      "step": 20184
+    },
+    {
+      "epoch": 0.20185,
+      "grad_norm": 0.8778813481330872,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 20185
+    },
+    {
+      "epoch": 0.20186,
+      "grad_norm": 0.8389912843704224,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 20186
+    },
+    {
+      "epoch": 0.20187,
+      "grad_norm": 0.9361748695373535,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 20187
+    },
+    {
+      "epoch": 0.20188,
+      "grad_norm": 0.8386715054512024,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 20188
+    },
+    {
+      "epoch": 0.20189,
+      "grad_norm": 0.7701196670532227,
+      "learning_rate": 0.003,
+      "loss": 3.9817,
+      "step": 20189
+    },
+    {
+      "epoch": 0.2019,
+      "grad_norm": 0.6367450952529907,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 20190
+    },
+    {
+      "epoch": 0.20191,
+      "grad_norm": 0.5951288938522339,
+      "learning_rate": 0.003,
+      "loss": 4.0211,
+      "step": 20191
+    },
+    {
+      "epoch": 0.20192,
+      "grad_norm": 0.7320143580436707,
+      "learning_rate": 0.003,
+      "loss": 4.0068,
+      "step": 20192
+    },
+    {
+      "epoch": 0.20193,
+      "grad_norm": 0.7183142304420471,
+      "learning_rate": 0.003,
+      "loss": 3.9803,
+      "step": 20193
+    },
+    {
+      "epoch": 0.20194,
+      "grad_norm": 0.7571724057197571,
+      "learning_rate": 0.003,
+      "loss": 4.0099,
+      "step": 20194
+    },
+    {
+      "epoch": 0.20195,
+      "grad_norm": 0.6767758727073669,
+      "learning_rate": 0.003,
+      "loss": 4.0511,
+      "step": 20195
+    },
+    {
+      "epoch": 0.20196,
+      "grad_norm": 0.7127629518508911,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 20196
+    },
+    {
+      "epoch": 0.20197,
+      "grad_norm": 0.7226316928863525,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20197
+    },
+    {
+      "epoch": 0.20198,
+      "grad_norm": 0.9264624118804932,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 20198
+    },
+    {
+      "epoch": 0.20199,
+      "grad_norm": 1.3039424419403076,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 20199
+    },
+    {
+      "epoch": 0.202,
+      "grad_norm": 0.7310620546340942,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 20200
+    },
+    {
+      "epoch": 0.20201,
+      "grad_norm": 0.6267219185829163,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 20201
+    },
+    {
+      "epoch": 0.20202,
+      "grad_norm": 0.5859816074371338,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 20202
+    },
+    {
+      "epoch": 0.20203,
+      "grad_norm": 0.6759727001190186,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 20203
+    },
+    {
+      "epoch": 0.20204,
+      "grad_norm": 0.6308323740959167,
+      "learning_rate": 0.003,
+      "loss": 4.0147,
+      "step": 20204
+    },
+    {
+      "epoch": 0.20205,
+      "grad_norm": 0.714514970779419,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 20205
+    },
+    {
+      "epoch": 0.20206,
+      "grad_norm": 0.9549436569213867,
+      "learning_rate": 0.003,
+      "loss": 3.9517,
+      "step": 20206
+    },
+    {
+      "epoch": 0.20207,
+      "grad_norm": 1.2413829565048218,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 20207
+    },
+    {
+      "epoch": 0.20208,
+      "grad_norm": 0.9563367962837219,
+      "learning_rate": 0.003,
+      "loss": 3.9729,
+      "step": 20208
+    },
+    {
+      "epoch": 0.20209,
+      "grad_norm": 0.8096930980682373,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 20209
+    },
+    {
+      "epoch": 0.2021,
+      "grad_norm": 0.6866564750671387,
+      "learning_rate": 0.003,
+      "loss": 3.9923,
+      "step": 20210
+    },
+    {
+      "epoch": 0.20211,
+      "grad_norm": 0.7624746561050415,
+      "learning_rate": 0.003,
+      "loss": 4.0218,
+      "step": 20211
+    },
+    {
+      "epoch": 0.20212,
+      "grad_norm": 0.8786949515342712,
+      "learning_rate": 0.003,
+      "loss": 3.9945,
+      "step": 20212
+    },
+    {
+      "epoch": 0.20213,
+      "grad_norm": 0.8951416611671448,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 20213
+    },
+    {
+      "epoch": 0.20214,
+      "grad_norm": 0.9443157911300659,
+      "learning_rate": 0.003,
+      "loss": 3.9603,
+      "step": 20214
+    },
+    {
+      "epoch": 0.20215,
+      "grad_norm": 0.933879554271698,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 20215
+    },
+    {
+      "epoch": 0.20216,
+      "grad_norm": 0.8607663512229919,
+      "learning_rate": 0.003,
+      "loss": 4.0299,
+      "step": 20216
+    },
+    {
+      "epoch": 0.20217,
+      "grad_norm": 0.9158996939659119,
+      "learning_rate": 0.003,
+      "loss": 3.9843,
+      "step": 20217
+    },
+    {
+      "epoch": 0.20218,
+      "grad_norm": 0.8870666027069092,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 20218
+    },
+    {
+      "epoch": 0.20219,
+      "grad_norm": 0.8453297019004822,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 20219
+    },
+    {
+      "epoch": 0.2022,
+      "grad_norm": 0.9020869731903076,
+      "learning_rate": 0.003,
+      "loss": 3.9731,
+      "step": 20220
+    },
+    {
+      "epoch": 0.20221,
+      "grad_norm": 0.9760621190071106,
+      "learning_rate": 0.003,
+      "loss": 4.0119,
+      "step": 20221
+    },
+    {
+      "epoch": 0.20222,
+      "grad_norm": 1.164143443107605,
+      "learning_rate": 0.003,
+      "loss": 4.0111,
+      "step": 20222
+    },
+    {
+      "epoch": 0.20223,
+      "grad_norm": 0.7390578985214233,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 20223
+    },
+    {
+      "epoch": 0.20224,
+      "grad_norm": 0.7947418689727783,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 20224
+    },
+    {
+      "epoch": 0.20225,
+      "grad_norm": 0.769934892654419,
+      "learning_rate": 0.003,
+      "loss": 3.9831,
+      "step": 20225
+    },
+    {
+      "epoch": 0.20226,
+      "grad_norm": 0.8830723166465759,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 20226
+    },
+    {
+      "epoch": 0.20227,
+      "grad_norm": 1.1716639995574951,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 20227
+    },
+    {
+      "epoch": 0.20228,
+      "grad_norm": 0.7477881908416748,
+      "learning_rate": 0.003,
+      "loss": 3.974,
+      "step": 20228
+    },
+    {
+      "epoch": 0.20229,
+      "grad_norm": 0.6853918433189392,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 20229
+    },
+    {
+      "epoch": 0.2023,
+      "grad_norm": 0.6665154099464417,
+      "learning_rate": 0.003,
+      "loss": 3.9835,
+      "step": 20230
+    },
+    {
+      "epoch": 0.20231,
+      "grad_norm": 0.5964629054069519,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 20231
+    },
+    {
+      "epoch": 0.20232,
+      "grad_norm": 0.6783205270767212,
+      "learning_rate": 0.003,
+      "loss": 3.9627,
+      "step": 20232
+    },
+    {
+      "epoch": 0.20233,
+      "grad_norm": 0.7677664756774902,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 20233
+    },
+    {
+      "epoch": 0.20234,
+      "grad_norm": 0.7461740374565125,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 20234
+    },
+    {
+      "epoch": 0.20235,
+      "grad_norm": 0.6478622555732727,
+      "learning_rate": 0.003,
+      "loss": 3.9809,
+      "step": 20235
+    },
+    {
+      "epoch": 0.20236,
+      "grad_norm": 0.7441017627716064,
+      "learning_rate": 0.003,
+      "loss": 3.9952,
+      "step": 20236
+    },
+    {
+      "epoch": 0.20237,
+      "grad_norm": 0.771283745765686,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 20237
+    },
+    {
+      "epoch": 0.20238,
+      "grad_norm": 0.8057474493980408,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 20238
+    },
+    {
+      "epoch": 0.20239,
+      "grad_norm": 0.7607077360153198,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 20239
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.7734676599502563,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 20240
+    },
+    {
+      "epoch": 0.20241,
+      "grad_norm": 1.048604965209961,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 20241
+    },
+    {
+      "epoch": 0.20242,
+      "grad_norm": 1.2987990379333496,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 20242
+    },
+    {
+      "epoch": 0.20243,
+      "grad_norm": 1.064478874206543,
+      "learning_rate": 0.003,
+      "loss": 3.9619,
+      "step": 20243
+    },
+    {
+      "epoch": 0.20244,
+      "grad_norm": 1.1175570487976074,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 20244
+    },
+    {
+      "epoch": 0.20245,
+      "grad_norm": 0.850923478603363,
+      "learning_rate": 0.003,
+      "loss": 3.9824,
+      "step": 20245
+    },
+    {
+      "epoch": 0.20246,
+      "grad_norm": 0.7859277725219727,
+      "learning_rate": 0.003,
+      "loss": 3.9843,
+      "step": 20246
+    },
+    {
+      "epoch": 0.20247,
+      "grad_norm": 0.7892622351646423,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 20247
+    },
+    {
+      "epoch": 0.20248,
+      "grad_norm": 0.7633906006813049,
+      "learning_rate": 0.003,
+      "loss": 3.983,
+      "step": 20248
+    },
+    {
+      "epoch": 0.20249,
+      "grad_norm": 0.6468591690063477,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 20249
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.6362141370773315,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 20250
+    },
+    {
+      "epoch": 0.20251,
+      "grad_norm": 0.7041432857513428,
+      "learning_rate": 0.003,
+      "loss": 3.9743,
+      "step": 20251
+    },
+    {
+      "epoch": 0.20252,
+      "grad_norm": 0.8176714777946472,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 20252
+    },
+    {
+      "epoch": 0.20253,
+      "grad_norm": 0.97947096824646,
+      "learning_rate": 0.003,
+      "loss": 3.985,
+      "step": 20253
+    },
+    {
+      "epoch": 0.20254,
+      "grad_norm": 1.2270678281784058,
+      "learning_rate": 0.003,
+      "loss": 4.045,
+      "step": 20254
+    },
+    {
+      "epoch": 0.20255,
+      "grad_norm": 0.7587008476257324,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 20255
+    },
+    {
+      "epoch": 0.20256,
+      "grad_norm": 0.7780997157096863,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 20256
+    },
+    {
+      "epoch": 0.20257,
+      "grad_norm": 0.8538438677787781,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 20257
+    },
+    {
+      "epoch": 0.20258,
+      "grad_norm": 0.9861294627189636,
+      "learning_rate": 0.003,
+      "loss": 3.9704,
+      "step": 20258
+    },
+    {
+      "epoch": 0.20259,
+      "grad_norm": 0.9398753643035889,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 20259
+    },
+    {
+      "epoch": 0.2026,
+      "grad_norm": 1.0167715549468994,
+      "learning_rate": 0.003,
+      "loss": 4.0074,
+      "step": 20260
+    },
+    {
+      "epoch": 0.20261,
+      "grad_norm": 1.0032497644424438,
+      "learning_rate": 0.003,
+      "loss": 4.0362,
+      "step": 20261
+    },
+    {
+      "epoch": 0.20262,
+      "grad_norm": 0.9985441565513611,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 20262
+    },
+    {
+      "epoch": 0.20263,
+      "grad_norm": 1.136631727218628,
+      "learning_rate": 0.003,
+      "loss": 4.0098,
+      "step": 20263
+    },
+    {
+      "epoch": 0.20264,
+      "grad_norm": 1.066340446472168,
+      "learning_rate": 0.003,
+      "loss": 4.0212,
+      "step": 20264
+    },
+    {
+      "epoch": 0.20265,
+      "grad_norm": 0.9510641694068909,
+      "learning_rate": 0.003,
+      "loss": 3.9807,
+      "step": 20265
+    },
+    {
+      "epoch": 0.20266,
+      "grad_norm": 0.969109833240509,
+      "learning_rate": 0.003,
+      "loss": 3.9926,
+      "step": 20266
+    },
+    {
+      "epoch": 0.20267,
+      "grad_norm": 0.8656512498855591,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 20267
+    },
+    {
+      "epoch": 0.20268,
+      "grad_norm": 0.8168076276779175,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 20268
+    },
+    {
+      "epoch": 0.20269,
+      "grad_norm": 0.8075709939002991,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 20269
+    },
+    {
+      "epoch": 0.2027,
+      "grad_norm": 0.7317737936973572,
+      "learning_rate": 0.003,
+      "loss": 4.006,
+      "step": 20270
+    },
+    {
+      "epoch": 0.20271,
+      "grad_norm": 0.7332939505577087,
+      "learning_rate": 0.003,
+      "loss": 4.0225,
+      "step": 20271
+    },
+    {
+      "epoch": 0.20272,
+      "grad_norm": 0.7304376363754272,
+      "learning_rate": 0.003,
+      "loss": 4.003,
+      "step": 20272
+    },
+    {
+      "epoch": 0.20273,
+      "grad_norm": 0.8709371089935303,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 20273
+    },
+    {
+      "epoch": 0.20274,
+      "grad_norm": 1.1287729740142822,
+      "learning_rate": 0.003,
+      "loss": 4.0263,
+      "step": 20274
+    },
+    {
+      "epoch": 0.20275,
+      "grad_norm": 0.9655367136001587,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 20275
+    },
+    {
+      "epoch": 0.20276,
+      "grad_norm": 1.0948092937469482,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 20276
+    },
+    {
+      "epoch": 0.20277,
+      "grad_norm": 0.9808290600776672,
+      "learning_rate": 0.003,
+      "loss": 4.0258,
+      "step": 20277
+    },
+    {
+      "epoch": 0.20278,
+      "grad_norm": 0.841752290725708,
+      "learning_rate": 0.003,
+      "loss": 3.974,
+      "step": 20278
+    },
+    {
+      "epoch": 0.20279,
+      "grad_norm": 0.7497544884681702,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 20279
+    },
+    {
+      "epoch": 0.2028,
+      "grad_norm": 0.6757055521011353,
+      "learning_rate": 0.003,
+      "loss": 3.9577,
+      "step": 20280
+    },
+    {
+      "epoch": 0.20281,
+      "grad_norm": 0.6535053253173828,
+      "learning_rate": 0.003,
+      "loss": 3.9744,
+      "step": 20281
+    },
+    {
+      "epoch": 0.20282,
+      "grad_norm": 0.7720223665237427,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20282
+    },
+    {
+      "epoch": 0.20283,
+      "grad_norm": 1.0587536096572876,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 20283
+    },
+    {
+      "epoch": 0.20284,
+      "grad_norm": 1.1201404333114624,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 20284
+    },
+    {
+      "epoch": 0.20285,
+      "grad_norm": 0.8129140138626099,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 20285
+    },
+    {
+      "epoch": 0.20286,
+      "grad_norm": 0.6837013363838196,
+      "learning_rate": 0.003,
+      "loss": 3.9873,
+      "step": 20286
+    },
+    {
+      "epoch": 0.20287,
+      "grad_norm": 0.6548781991004944,
+      "learning_rate": 0.003,
+      "loss": 3.9668,
+      "step": 20287
+    },
+    {
+      "epoch": 0.20288,
+      "grad_norm": 0.8084627389907837,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 20288
+    },
+    {
+      "epoch": 0.20289,
+      "grad_norm": 1.0328177213668823,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 20289
+    },
+    {
+      "epoch": 0.2029,
+      "grad_norm": 1.1478947401046753,
+      "learning_rate": 0.003,
+      "loss": 4.0188,
+      "step": 20290
+    },
+    {
+      "epoch": 0.20291,
+      "grad_norm": 0.7494494318962097,
+      "learning_rate": 0.003,
+      "loss": 3.9796,
+      "step": 20291
+    },
+    {
+      "epoch": 0.20292,
+      "grad_norm": 0.679472029209137,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 20292
+    },
+    {
+      "epoch": 0.20293,
+      "grad_norm": 0.6625007390975952,
+      "learning_rate": 0.003,
+      "loss": 4.0059,
+      "step": 20293
+    },
+    {
+      "epoch": 0.20294,
+      "grad_norm": 0.6561849117279053,
+      "learning_rate": 0.003,
+      "loss": 3.9684,
+      "step": 20294
+    },
+    {
+      "epoch": 0.20295,
+      "grad_norm": 0.7200389504432678,
+      "learning_rate": 0.003,
+      "loss": 4.0005,
+      "step": 20295
+    },
+    {
+      "epoch": 0.20296,
+      "grad_norm": 0.7838565111160278,
+      "learning_rate": 0.003,
+      "loss": 3.9743,
+      "step": 20296
+    },
+    {
+      "epoch": 0.20297,
+      "grad_norm": 0.9034474492073059,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 20297
+    },
+    {
+      "epoch": 0.20298,
+      "grad_norm": 1.0330101251602173,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 20298
+    },
+    {
+      "epoch": 0.20299,
+      "grad_norm": 0.9066010117530823,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 20299
+    },
+    {
+      "epoch": 0.203,
+      "grad_norm": 0.9214524626731873,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 20300
+    },
+    {
+      "epoch": 0.20301,
+      "grad_norm": 1.0282068252563477,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 20301
+    },
+    {
+      "epoch": 0.20302,
+      "grad_norm": 1.0103950500488281,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 20302
+    },
+    {
+      "epoch": 0.20303,
+      "grad_norm": 0.9253962635993958,
+      "learning_rate": 0.003,
+      "loss": 4.0207,
+      "step": 20303
+    },
+    {
+      "epoch": 0.20304,
+      "grad_norm": 1.0147264003753662,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 20304
+    },
+    {
+      "epoch": 0.20305,
+      "grad_norm": 0.978193461894989,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 20305
+    },
+    {
+      "epoch": 0.20306,
+      "grad_norm": 1.0849967002868652,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 20306
+    },
+    {
+      "epoch": 0.20307,
+      "grad_norm": 1.0655522346496582,
+      "learning_rate": 0.003,
+      "loss": 4.0164,
+      "step": 20307
+    },
+    {
+      "epoch": 0.20308,
+      "grad_norm": 0.8298773765563965,
+      "learning_rate": 0.003,
+      "loss": 4.0015,
+      "step": 20308
+    },
+    {
+      "epoch": 0.20309,
+      "grad_norm": 0.7420493960380554,
+      "learning_rate": 0.003,
+      "loss": 3.9969,
+      "step": 20309
+    },
+    {
+      "epoch": 0.2031,
+      "grad_norm": 0.6811785101890564,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 20310
+    },
+    {
+      "epoch": 0.20311,
+      "grad_norm": 0.6202403903007507,
+      "learning_rate": 0.003,
+      "loss": 3.9928,
+      "step": 20311
+    },
+    {
+      "epoch": 0.20312,
+      "grad_norm": 0.6893637180328369,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 20312
+    },
+    {
+      "epoch": 0.20313,
+      "grad_norm": 0.7760577201843262,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 20313
+    },
+    {
+      "epoch": 0.20314,
+      "grad_norm": 0.8740729689598083,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 20314
+    },
+    {
+      "epoch": 0.20315,
+      "grad_norm": 1.0910762548446655,
+      "learning_rate": 0.003,
+      "loss": 4.0166,
+      "step": 20315
+    },
+    {
+      "epoch": 0.20316,
+      "grad_norm": 0.9553356766700745,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 20316
+    },
+    {
+      "epoch": 0.20317,
+      "grad_norm": 0.8944961428642273,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 20317
+    },
+    {
+      "epoch": 0.20318,
+      "grad_norm": 1.0081169605255127,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 20318
+    },
+    {
+      "epoch": 0.20319,
+      "grad_norm": 0.8911204934120178,
+      "learning_rate": 0.003,
+      "loss": 4.0129,
+      "step": 20319
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.8690194487571716,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 20320
+    },
+    {
+      "epoch": 0.20321,
+      "grad_norm": 0.7861948013305664,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 20321
+    },
+    {
+      "epoch": 0.20322,
+      "grad_norm": 0.7284369468688965,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 20322
+    },
+    {
+      "epoch": 0.20323,
+      "grad_norm": 0.7329832315444946,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 20323
+    },
+    {
+      "epoch": 0.20324,
+      "grad_norm": 0.5863949060440063,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 20324
+    },
+    {
+      "epoch": 0.20325,
+      "grad_norm": 0.6903998851776123,
+      "learning_rate": 0.003,
+      "loss": 3.9905,
+      "step": 20325
+    },
+    {
+      "epoch": 0.20326,
+      "grad_norm": 0.7285104990005493,
+      "learning_rate": 0.003,
+      "loss": 3.9558,
+      "step": 20326
+    },
+    {
+      "epoch": 0.20327,
+      "grad_norm": 1.0294723510742188,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 20327
+    },
+    {
+      "epoch": 0.20328,
+      "grad_norm": 1.311887264251709,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 20328
+    },
+    {
+      "epoch": 0.20329,
+      "grad_norm": 0.5420759320259094,
+      "learning_rate": 0.003,
+      "loss": 3.9723,
+      "step": 20329
+    },
+    {
+      "epoch": 0.2033,
+      "grad_norm": 0.7763385772705078,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 20330
+    },
+    {
+      "epoch": 0.20331,
+      "grad_norm": 0.9855422377586365,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 20331
+    },
+    {
+      "epoch": 0.20332,
+      "grad_norm": 1.1434701681137085,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 20332
+    },
+    {
+      "epoch": 0.20333,
+      "grad_norm": 0.7839611768722534,
+      "learning_rate": 0.003,
+      "loss": 4.0115,
+      "step": 20333
+    },
+    {
+      "epoch": 0.20334,
+      "grad_norm": 0.6856549978256226,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 20334
+    },
+    {
+      "epoch": 0.20335,
+      "grad_norm": 0.7158058881759644,
+      "learning_rate": 0.003,
+      "loss": 3.9657,
+      "step": 20335
+    },
+    {
+      "epoch": 0.20336,
+      "grad_norm": 0.7355321049690247,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 20336
+    },
+    {
+      "epoch": 0.20337,
+      "grad_norm": 0.8203396797180176,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 20337
+    },
+    {
+      "epoch": 0.20338,
+      "grad_norm": 0.8416463136672974,
+      "learning_rate": 0.003,
+      "loss": 4.0011,
+      "step": 20338
+    },
+    {
+      "epoch": 0.20339,
+      "grad_norm": 0.8109787702560425,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 20339
+    },
+    {
+      "epoch": 0.2034,
+      "grad_norm": 0.7734821438789368,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 20340
+    },
+    {
+      "epoch": 0.20341,
+      "grad_norm": 0.8469385504722595,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 20341
+    },
+    {
+      "epoch": 0.20342,
+      "grad_norm": 1.1418490409851074,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 20342
+    },
+    {
+      "epoch": 0.20343,
+      "grad_norm": 0.8195468783378601,
+      "learning_rate": 0.003,
+      "loss": 4.0158,
+      "step": 20343
+    },
+    {
+      "epoch": 0.20344,
+      "grad_norm": 0.7609240412712097,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 20344
+    },
+    {
+      "epoch": 0.20345,
+      "grad_norm": 0.8220378160476685,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 20345
+    },
+    {
+      "epoch": 0.20346,
+      "grad_norm": 0.9295966625213623,
+      "learning_rate": 0.003,
+      "loss": 3.9661,
+      "step": 20346
+    },
+    {
+      "epoch": 0.20347,
+      "grad_norm": 0.8783484101295471,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 20347
+    },
+    {
+      "epoch": 0.20348,
+      "grad_norm": 0.8919439315795898,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 20348
+    },
+    {
+      "epoch": 0.20349,
+      "grad_norm": 0.7911661863327026,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 20349
+    },
+    {
+      "epoch": 0.2035,
+      "grad_norm": 0.7926628589630127,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 20350
+    },
+    {
+      "epoch": 0.20351,
+      "grad_norm": 0.7087711095809937,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 20351
+    },
+    {
+      "epoch": 0.20352,
+      "grad_norm": 0.7259397506713867,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 20352
+    },
+    {
+      "epoch": 0.20353,
+      "grad_norm": 0.6317782998085022,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20353
+    },
+    {
+      "epoch": 0.20354,
+      "grad_norm": 0.7174623608589172,
+      "learning_rate": 0.003,
+      "loss": 3.9779,
+      "step": 20354
+    },
+    {
+      "epoch": 0.20355,
+      "grad_norm": 0.6670953035354614,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 20355
+    },
+    {
+      "epoch": 0.20356,
+      "grad_norm": 0.6677290797233582,
+      "learning_rate": 0.003,
+      "loss": 3.9773,
+      "step": 20356
+    },
+    {
+      "epoch": 0.20357,
+      "grad_norm": 0.6100268363952637,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 20357
+    },
+    {
+      "epoch": 0.20358,
+      "grad_norm": 0.7424579858779907,
+      "learning_rate": 0.003,
+      "loss": 4.0055,
+      "step": 20358
+    },
+    {
+      "epoch": 0.20359,
+      "grad_norm": 0.956942081451416,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 20359
+    },
+    {
+      "epoch": 0.2036,
+      "grad_norm": 1.204334020614624,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 20360
+    },
+    {
+      "epoch": 0.20361,
+      "grad_norm": 0.9877221584320068,
+      "learning_rate": 0.003,
+      "loss": 3.9897,
+      "step": 20361
+    },
+    {
+      "epoch": 0.20362,
+      "grad_norm": 1.0937947034835815,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 20362
+    },
+    {
+      "epoch": 0.20363,
+      "grad_norm": 0.8922629952430725,
+      "learning_rate": 0.003,
+      "loss": 3.9579,
+      "step": 20363
+    },
+    {
+      "epoch": 0.20364,
+      "grad_norm": 0.8023450970649719,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 20364
+    },
+    {
+      "epoch": 0.20365,
+      "grad_norm": 0.7399235367774963,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 20365
+    },
+    {
+      "epoch": 0.20366,
+      "grad_norm": 0.9052486419677734,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 20366
+    },
+    {
+      "epoch": 0.20367,
+      "grad_norm": 1.0596771240234375,
+      "learning_rate": 0.003,
+      "loss": 4.0108,
+      "step": 20367
+    },
+    {
+      "epoch": 0.20368,
+      "grad_norm": 0.8454614281654358,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 20368
+    },
+    {
+      "epoch": 0.20369,
+      "grad_norm": 0.7641255855560303,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 20369
+    },
+    {
+      "epoch": 0.2037,
+      "grad_norm": 0.7677737474441528,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 20370
+    },
+    {
+      "epoch": 0.20371,
+      "grad_norm": 0.7251455187797546,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 20371
+    },
+    {
+      "epoch": 0.20372,
+      "grad_norm": 0.6663448214530945,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 20372
+    },
+    {
+      "epoch": 0.20373,
+      "grad_norm": 0.6636404395103455,
+      "learning_rate": 0.003,
+      "loss": 4.0107,
+      "step": 20373
+    },
+    {
+      "epoch": 0.20374,
+      "grad_norm": 0.6882753968238831,
+      "learning_rate": 0.003,
+      "loss": 4.0035,
+      "step": 20374
+    },
+    {
+      "epoch": 0.20375,
+      "grad_norm": 0.7860545516014099,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 20375
+    },
+    {
+      "epoch": 0.20376,
+      "grad_norm": 1.0693397521972656,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 20376
+    },
+    {
+      "epoch": 0.20377,
+      "grad_norm": 1.2248581647872925,
+      "learning_rate": 0.003,
+      "loss": 4.0221,
+      "step": 20377
+    },
+    {
+      "epoch": 0.20378,
+      "grad_norm": 0.8462511301040649,
+      "learning_rate": 0.003,
+      "loss": 3.994,
+      "step": 20378
+    },
+    {
+      "epoch": 0.20379,
+      "grad_norm": 0.8828621506690979,
+      "learning_rate": 0.003,
+      "loss": 4.0254,
+      "step": 20379
+    },
+    {
+      "epoch": 0.2038,
+      "grad_norm": 0.9216285943984985,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 20380
+    },
+    {
+      "epoch": 0.20381,
+      "grad_norm": 0.9714720249176025,
+      "learning_rate": 0.003,
+      "loss": 3.9681,
+      "step": 20381
+    },
+    {
+      "epoch": 0.20382,
+      "grad_norm": 0.9852808713912964,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 20382
+    },
+    {
+      "epoch": 0.20383,
+      "grad_norm": 0.8930702209472656,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 20383
+    },
+    {
+      "epoch": 0.20384,
+      "grad_norm": 1.0646729469299316,
+      "learning_rate": 0.003,
+      "loss": 4.0359,
+      "step": 20384
+    },
+    {
+      "epoch": 0.20385,
+      "grad_norm": 1.072921633720398,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 20385
+    },
+    {
+      "epoch": 0.20386,
+      "grad_norm": 1.1347222328186035,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 20386
+    },
+    {
+      "epoch": 0.20387,
+      "grad_norm": 0.7293988466262817,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 20387
+    },
+    {
+      "epoch": 0.20388,
+      "grad_norm": 0.6437301635742188,
+      "learning_rate": 0.003,
+      "loss": 3.9799,
+      "step": 20388
+    },
+    {
+      "epoch": 0.20389,
+      "grad_norm": 0.6251480579376221,
+      "learning_rate": 0.003,
+      "loss": 3.971,
+      "step": 20389
+    },
+    {
+      "epoch": 0.2039,
+      "grad_norm": 0.6301602721214294,
+      "learning_rate": 0.003,
+      "loss": 4.0149,
+      "step": 20390
+    },
+    {
+      "epoch": 0.20391,
+      "grad_norm": 0.6458478569984436,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 20391
+    },
+    {
+      "epoch": 0.20392,
+      "grad_norm": 0.788723886013031,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 20392
+    },
+    {
+      "epoch": 0.20393,
+      "grad_norm": 0.9218687415122986,
+      "learning_rate": 0.003,
+      "loss": 4.0151,
+      "step": 20393
+    },
+    {
+      "epoch": 0.20394,
+      "grad_norm": 1.1501364707946777,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 20394
+    },
+    {
+      "epoch": 0.20395,
+      "grad_norm": 1.05363130569458,
+      "learning_rate": 0.003,
+      "loss": 4.0224,
+      "step": 20395
+    },
+    {
+      "epoch": 0.20396,
+      "grad_norm": 0.8909705877304077,
+      "learning_rate": 0.003,
+      "loss": 3.968,
+      "step": 20396
+    },
+    {
+      "epoch": 0.20397,
+      "grad_norm": 0.7462480664253235,
+      "learning_rate": 0.003,
+      "loss": 4.0227,
+      "step": 20397
+    },
+    {
+      "epoch": 0.20398,
+      "grad_norm": 0.694731593132019,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 20398
+    },
+    {
+      "epoch": 0.20399,
+      "grad_norm": 0.667031466960907,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 20399
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.6090689897537231,
+      "learning_rate": 0.003,
+      "loss": 4.0233,
+      "step": 20400
+    },
+    {
+      "epoch": 0.20401,
+      "grad_norm": 0.627703070640564,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 20401
+    },
+    {
+      "epoch": 0.20402,
+      "grad_norm": 0.6416513323783875,
+      "learning_rate": 0.003,
+      "loss": 3.9859,
+      "step": 20402
+    },
+    {
+      "epoch": 0.20403,
+      "grad_norm": 0.7873333692550659,
+      "learning_rate": 0.003,
+      "loss": 3.9766,
+      "step": 20403
+    },
+    {
+      "epoch": 0.20404,
+      "grad_norm": 1.1909332275390625,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 20404
+    },
+    {
+      "epoch": 0.20405,
+      "grad_norm": 0.9008625745773315,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 20405
+    },
+    {
+      "epoch": 0.20406,
+      "grad_norm": 0.7145691514015198,
+      "learning_rate": 0.003,
+      "loss": 3.993,
+      "step": 20406
+    },
+    {
+      "epoch": 0.20407,
+      "grad_norm": 0.7303884029388428,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 20407
+    },
+    {
+      "epoch": 0.20408,
+      "grad_norm": 0.681465744972229,
+      "learning_rate": 0.003,
+      "loss": 3.9968,
+      "step": 20408
+    },
+    {
+      "epoch": 0.20409,
+      "grad_norm": 0.7974075078964233,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 20409
+    },
+    {
+      "epoch": 0.2041,
+      "grad_norm": 0.9380106329917908,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 20410
+    },
+    {
+      "epoch": 0.20411,
+      "grad_norm": 1.085464358329773,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 20411
+    },
+    {
+      "epoch": 0.20412,
+      "grad_norm": 0.9400927424430847,
+      "learning_rate": 0.003,
+      "loss": 3.9652,
+      "step": 20412
+    },
+    {
+      "epoch": 0.20413,
+      "grad_norm": 0.886445939540863,
+      "learning_rate": 0.003,
+      "loss": 3.996,
+      "step": 20413
+    },
+    {
+      "epoch": 0.20414,
+      "grad_norm": 0.9608079791069031,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 20414
+    },
+    {
+      "epoch": 0.20415,
+      "grad_norm": 1.0392273664474487,
+      "learning_rate": 0.003,
+      "loss": 3.9828,
+      "step": 20415
+    },
+    {
+      "epoch": 0.20416,
+      "grad_norm": 0.870465099811554,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 20416
+    },
+    {
+      "epoch": 0.20417,
+      "grad_norm": 0.8516085147857666,
+      "learning_rate": 0.003,
+      "loss": 3.9944,
+      "step": 20417
+    },
+    {
+      "epoch": 0.20418,
+      "grad_norm": 1.0546752214431763,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 20418
+    },
+    {
+      "epoch": 0.20419,
+      "grad_norm": 1.0786575078964233,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 20419
+    },
+    {
+      "epoch": 0.2042,
+      "grad_norm": 0.9115407466888428,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 20420
+    },
+    {
+      "epoch": 0.20421,
+      "grad_norm": 0.8256513476371765,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 20421
+    },
+    {
+      "epoch": 0.20422,
+      "grad_norm": 0.907086968421936,
+      "learning_rate": 0.003,
+      "loss": 4.0177,
+      "step": 20422
+    },
+    {
+      "epoch": 0.20423,
+      "grad_norm": 1.1340298652648926,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 20423
+    },
+    {
+      "epoch": 0.20424,
+      "grad_norm": 0.8989710211753845,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 20424
+    },
+    {
+      "epoch": 0.20425,
+      "grad_norm": 0.8972444534301758,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 20425
+    },
+    {
+      "epoch": 0.20426,
+      "grad_norm": 1.0549273490905762,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 20426
+    },
+    {
+      "epoch": 0.20427,
+      "grad_norm": 1.1332916021347046,
+      "learning_rate": 0.003,
+      "loss": 4.0189,
+      "step": 20427
+    },
+    {
+      "epoch": 0.20428,
+      "grad_norm": 0.774750828742981,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 20428
+    },
+    {
+      "epoch": 0.20429,
+      "grad_norm": 0.6685149073600769,
+      "learning_rate": 0.003,
+      "loss": 3.9698,
+      "step": 20429
+    },
+    {
+      "epoch": 0.2043,
+      "grad_norm": 0.6455409526824951,
+      "learning_rate": 0.003,
+      "loss": 4.0285,
+      "step": 20430
+    },
+    {
+      "epoch": 0.20431,
+      "grad_norm": 0.6255378723144531,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 20431
+    },
+    {
+      "epoch": 0.20432,
+      "grad_norm": 0.7297254204750061,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20432
+    },
+    {
+      "epoch": 0.20433,
+      "grad_norm": 0.8061620593070984,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 20433
+    },
+    {
+      "epoch": 0.20434,
+      "grad_norm": 0.9574712514877319,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 20434
+    },
+    {
+      "epoch": 0.20435,
+      "grad_norm": 0.9712481498718262,
+      "learning_rate": 0.003,
+      "loss": 4.0171,
+      "step": 20435
+    },
+    {
+      "epoch": 0.20436,
+      "grad_norm": 1.0066169500350952,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 20436
+    },
+    {
+      "epoch": 0.20437,
+      "grad_norm": 1.0666333436965942,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 20437
+    },
+    {
+      "epoch": 0.20438,
+      "grad_norm": 0.9452031254768372,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 20438
+    },
+    {
+      "epoch": 0.20439,
+      "grad_norm": 1.003495693206787,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 20439
+    },
+    {
+      "epoch": 0.2044,
+      "grad_norm": 0.910893440246582,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 20440
+    },
+    {
+      "epoch": 0.20441,
+      "grad_norm": 0.8472775816917419,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 20441
+    },
+    {
+      "epoch": 0.20442,
+      "grad_norm": 0.682340681552887,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 20442
+    },
+    {
+      "epoch": 0.20443,
+      "grad_norm": 0.7600615620613098,
+      "learning_rate": 0.003,
+      "loss": 4.0123,
+      "step": 20443
+    },
+    {
+      "epoch": 0.20444,
+      "grad_norm": 0.9301922917366028,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 20444
+    },
+    {
+      "epoch": 0.20445,
+      "grad_norm": 1.0311607122421265,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 20445
+    },
+    {
+      "epoch": 0.20446,
+      "grad_norm": 0.96709144115448,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 20446
+    },
+    {
+      "epoch": 0.20447,
+      "grad_norm": 1.1222336292266846,
+      "learning_rate": 0.003,
+      "loss": 4.0368,
+      "step": 20447
+    },
+    {
+      "epoch": 0.20448,
+      "grad_norm": 0.8827340006828308,
+      "learning_rate": 0.003,
+      "loss": 4.0104,
+      "step": 20448
+    },
+    {
+      "epoch": 0.20449,
+      "grad_norm": 0.7806417346000671,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 20449
+    },
+    {
+      "epoch": 0.2045,
+      "grad_norm": 0.8268476724624634,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 20450
+    },
+    {
+      "epoch": 0.20451,
+      "grad_norm": 0.8784385919570923,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 20451
+    },
+    {
+      "epoch": 0.20452,
+      "grad_norm": 0.8739992380142212,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 20452
+    },
+    {
+      "epoch": 0.20453,
+      "grad_norm": 0.9219098687171936,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20453
+    },
+    {
+      "epoch": 0.20454,
+      "grad_norm": 1.051000952720642,
+      "learning_rate": 0.003,
+      "loss": 4.0278,
+      "step": 20454
+    },
+    {
+      "epoch": 0.20455,
+      "grad_norm": 1.1686969995498657,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 20455
+    },
+    {
+      "epoch": 0.20456,
+      "grad_norm": 0.7915661931037903,
+      "learning_rate": 0.003,
+      "loss": 3.9725,
+      "step": 20456
+    },
+    {
+      "epoch": 0.20457,
+      "grad_norm": 0.7893812656402588,
+      "learning_rate": 0.003,
+      "loss": 3.9862,
+      "step": 20457
+    },
+    {
+      "epoch": 0.20458,
+      "grad_norm": 0.9086239337921143,
+      "learning_rate": 0.003,
+      "loss": 3.9904,
+      "step": 20458
+    },
+    {
+      "epoch": 0.20459,
+      "grad_norm": 1.0326051712036133,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 20459
+    },
+    {
+      "epoch": 0.2046,
+      "grad_norm": 1.0527185201644897,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 20460
+    },
+    {
+      "epoch": 0.20461,
+      "grad_norm": 0.8683894276618958,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 20461
+    },
+    {
+      "epoch": 0.20462,
+      "grad_norm": 0.636903703212738,
+      "learning_rate": 0.003,
+      "loss": 3.9955,
+      "step": 20462
+    },
+    {
+      "epoch": 0.20463,
+      "grad_norm": 0.6364380717277527,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 20463
+    },
+    {
+      "epoch": 0.20464,
+      "grad_norm": 0.6431009769439697,
+      "learning_rate": 0.003,
+      "loss": 4.0259,
+      "step": 20464
+    },
+    {
+      "epoch": 0.20465,
+      "grad_norm": 0.7139235734939575,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 20465
+    },
+    {
+      "epoch": 0.20466,
+      "grad_norm": 0.6889108419418335,
+      "learning_rate": 0.003,
+      "loss": 4.022,
+      "step": 20466
+    },
+    {
+      "epoch": 0.20467,
+      "grad_norm": 0.668367326259613,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 20467
+    },
+    {
+      "epoch": 0.20468,
+      "grad_norm": 0.6962161660194397,
+      "learning_rate": 0.003,
+      "loss": 4.0069,
+      "step": 20468
+    },
+    {
+      "epoch": 0.20469,
+      "grad_norm": 0.7134402990341187,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 20469
+    },
+    {
+      "epoch": 0.2047,
+      "grad_norm": 0.7592452764511108,
+      "learning_rate": 0.003,
+      "loss": 3.9768,
+      "step": 20470
+    },
+    {
+      "epoch": 0.20471,
+      "grad_norm": 0.8259060978889465,
+      "learning_rate": 0.003,
+      "loss": 3.9824,
+      "step": 20471
+    },
+    {
+      "epoch": 0.20472,
+      "grad_norm": 0.7741798758506775,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 20472
+    },
+    {
+      "epoch": 0.20473,
+      "grad_norm": 0.7275305986404419,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 20473
+    },
+    {
+      "epoch": 0.20474,
+      "grad_norm": 0.8968299627304077,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 20474
+    },
+    {
+      "epoch": 0.20475,
+      "grad_norm": 1.1683197021484375,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 20475
+    },
+    {
+      "epoch": 0.20476,
+      "grad_norm": 1.089591383934021,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 20476
+    },
+    {
+      "epoch": 0.20477,
+      "grad_norm": 0.8169175982475281,
+      "learning_rate": 0.003,
+      "loss": 3.9735,
+      "step": 20477
+    },
+    {
+      "epoch": 0.20478,
+      "grad_norm": 0.625155508518219,
+      "learning_rate": 0.003,
+      "loss": 3.9815,
+      "step": 20478
+    },
+    {
+      "epoch": 0.20479,
+      "grad_norm": 0.6189466714859009,
+      "learning_rate": 0.003,
+      "loss": 4.0228,
+      "step": 20479
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.741943359375,
+      "learning_rate": 0.003,
+      "loss": 4.0042,
+      "step": 20480
+    },
+    {
+      "epoch": 0.20481,
+      "grad_norm": 0.8120701909065247,
+      "learning_rate": 0.003,
+      "loss": 3.9481,
+      "step": 20481
+    },
+    {
+      "epoch": 0.20482,
+      "grad_norm": 0.8646007180213928,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 20482
+    },
+    {
+      "epoch": 0.20483,
+      "grad_norm": 0.9611397385597229,
+      "learning_rate": 0.003,
+      "loss": 4.0036,
+      "step": 20483
+    },
+    {
+      "epoch": 0.20484,
+      "grad_norm": 0.9605675935745239,
+      "learning_rate": 0.003,
+      "loss": 4.0155,
+      "step": 20484
+    },
+    {
+      "epoch": 0.20485,
+      "grad_norm": 0.7831942439079285,
+      "learning_rate": 0.003,
+      "loss": 4.0244,
+      "step": 20485
+    },
+    {
+      "epoch": 0.20486,
+      "grad_norm": 0.6748376488685608,
+      "learning_rate": 0.003,
+      "loss": 3.9838,
+      "step": 20486
+    },
+    {
+      "epoch": 0.20487,
+      "grad_norm": 0.7112252116203308,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 20487
+    },
+    {
+      "epoch": 0.20488,
+      "grad_norm": 0.8787015080451965,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 20488
+    },
+    {
+      "epoch": 0.20489,
+      "grad_norm": 1.1302354335784912,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 20489
+    },
+    {
+      "epoch": 0.2049,
+      "grad_norm": 1.0216718912124634,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 20490
+    },
+    {
+      "epoch": 0.20491,
+      "grad_norm": 0.9042589664459229,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 20491
+    },
+    {
+      "epoch": 0.20492,
+      "grad_norm": 0.8357610702514648,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 20492
+    },
+    {
+      "epoch": 0.20493,
+      "grad_norm": 0.8341541290283203,
+      "learning_rate": 0.003,
+      "loss": 4.0062,
+      "step": 20493
+    },
+    {
+      "epoch": 0.20494,
+      "grad_norm": 0.9072319865226746,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 20494
+    },
+    {
+      "epoch": 0.20495,
+      "grad_norm": 0.8294616341590881,
+      "learning_rate": 0.003,
+      "loss": 3.9762,
+      "step": 20495
+    },
+    {
+      "epoch": 0.20496,
+      "grad_norm": 0.8708383440971375,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 20496
+    },
+    {
+      "epoch": 0.20497,
+      "grad_norm": 0.918099582195282,
+      "learning_rate": 0.003,
+      "loss": 4.0026,
+      "step": 20497
+    },
+    {
+      "epoch": 0.20498,
+      "grad_norm": 0.979347288608551,
+      "learning_rate": 0.003,
+      "loss": 4.0081,
+      "step": 20498
+    },
+    {
+      "epoch": 0.20499,
+      "grad_norm": 0.9320781230926514,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 20499
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.7862676978111267,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 20500
+    },
+    {
+      "epoch": 0.20501,
+      "grad_norm": 0.8444691300392151,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 20501
+    },
+    {
+      "epoch": 0.20502,
+      "grad_norm": 1.0602794885635376,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 20502
+    },
+    {
+      "epoch": 0.20503,
+      "grad_norm": 0.9569676518440247,
+      "learning_rate": 0.003,
+      "loss": 4.0078,
+      "step": 20503
+    },
+    {
+      "epoch": 0.20504,
+      "grad_norm": 1.1059049367904663,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 20504
+    },
+    {
+      "epoch": 0.20505,
+      "grad_norm": 1.0884469747543335,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20505
+    },
+    {
+      "epoch": 0.20506,
+      "grad_norm": 0.8607928156852722,
+      "learning_rate": 0.003,
+      "loss": 4.0642,
+      "step": 20506
+    },
+    {
+      "epoch": 0.20507,
+      "grad_norm": 0.8740077614784241,
+      "learning_rate": 0.003,
+      "loss": 4.0253,
+      "step": 20507
+    },
+    {
+      "epoch": 0.20508,
+      "grad_norm": 0.8393558859825134,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 20508
+    },
+    {
+      "epoch": 0.20509,
+      "grad_norm": 0.8504719734191895,
+      "learning_rate": 0.003,
+      "loss": 4.026,
+      "step": 20509
+    },
+    {
+      "epoch": 0.2051,
+      "grad_norm": 0.8674973249435425,
+      "learning_rate": 0.003,
+      "loss": 3.9974,
+      "step": 20510
+    },
+    {
+      "epoch": 0.20511,
+      "grad_norm": 0.8348132967948914,
+      "learning_rate": 0.003,
+      "loss": 4.0018,
+      "step": 20511
+    },
+    {
+      "epoch": 0.20512,
+      "grad_norm": 0.7985794544219971,
+      "learning_rate": 0.003,
+      "loss": 3.9739,
+      "step": 20512
+    },
+    {
+      "epoch": 0.20513,
+      "grad_norm": 0.771939218044281,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 20513
+    },
+    {
+      "epoch": 0.20514,
+      "grad_norm": 0.7467824220657349,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 20514
+    },
+    {
+      "epoch": 0.20515,
+      "grad_norm": 0.7036437392234802,
+      "learning_rate": 0.003,
+      "loss": 3.9977,
+      "step": 20515
+    },
+    {
+      "epoch": 0.20516,
+      "grad_norm": 0.688878059387207,
+      "learning_rate": 0.003,
+      "loss": 3.9669,
+      "step": 20516
+    },
+    {
+      "epoch": 0.20517,
+      "grad_norm": 0.6202580332756042,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 20517
+    },
+    {
+      "epoch": 0.20518,
+      "grad_norm": 0.6291422843933105,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 20518
+    },
+    {
+      "epoch": 0.20519,
+      "grad_norm": 0.6769140362739563,
+      "learning_rate": 0.003,
+      "loss": 4.0022,
+      "step": 20519
+    },
+    {
+      "epoch": 0.2052,
+      "grad_norm": 0.7055646777153015,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 20520
+    },
+    {
+      "epoch": 0.20521,
+      "grad_norm": 0.7506475448608398,
+      "learning_rate": 0.003,
+      "loss": 3.969,
+      "step": 20521
+    },
+    {
+      "epoch": 0.20522,
+      "grad_norm": 0.7492371797561646,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20522
+    },
+    {
+      "epoch": 0.20523,
+      "grad_norm": 0.7688263654708862,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 20523
+    },
+    {
+      "epoch": 0.20524,
+      "grad_norm": 0.7825839519500732,
+      "learning_rate": 0.003,
+      "loss": 3.9912,
+      "step": 20524
+    },
+    {
+      "epoch": 0.20525,
+      "grad_norm": 0.9201285243034363,
+      "learning_rate": 0.003,
+      "loss": 3.9961,
+      "step": 20525
+    },
+    {
+      "epoch": 0.20526,
+      "grad_norm": 1.0699529647827148,
+      "learning_rate": 0.003,
+      "loss": 3.9953,
+      "step": 20526
+    },
+    {
+      "epoch": 0.20527,
+      "grad_norm": 0.8841052651405334,
+      "learning_rate": 0.003,
+      "loss": 3.9987,
+      "step": 20527
+    },
+    {
+      "epoch": 0.20528,
+      "grad_norm": 0.7189367413520813,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 20528
+    },
+    {
+      "epoch": 0.20529,
+      "grad_norm": 0.689725935459137,
+      "learning_rate": 0.003,
+      "loss": 4.0127,
+      "step": 20529
+    },
+    {
+      "epoch": 0.2053,
+      "grad_norm": 0.7628370523452759,
+      "learning_rate": 0.003,
+      "loss": 3.9956,
+      "step": 20530
+    },
+    {
+      "epoch": 0.20531,
+      "grad_norm": 0.8612552285194397,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 20531
+    },
+    {
+      "epoch": 0.20532,
+      "grad_norm": 1.033647894859314,
+      "learning_rate": 0.003,
+      "loss": 3.9658,
+      "step": 20532
+    },
+    {
+      "epoch": 0.20533,
+      "grad_norm": 1.1336464881896973,
+      "learning_rate": 0.003,
+      "loss": 4.004,
+      "step": 20533
+    },
+    {
+      "epoch": 0.20534,
+      "grad_norm": 0.7695611715316772,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 20534
+    },
+    {
+      "epoch": 0.20535,
+      "grad_norm": 0.6660147309303284,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 20535
+    },
+    {
+      "epoch": 0.20536,
+      "grad_norm": 0.6618660688400269,
+      "learning_rate": 0.003,
+      "loss": 3.9853,
+      "step": 20536
+    },
+    {
+      "epoch": 0.20537,
+      "grad_norm": 0.5880175828933716,
+      "learning_rate": 0.003,
+      "loss": 3.9791,
+      "step": 20537
+    },
+    {
+      "epoch": 0.20538,
+      "grad_norm": 0.7051225900650024,
+      "learning_rate": 0.003,
+      "loss": 3.9744,
+      "step": 20538
+    },
+    {
+      "epoch": 0.20539,
+      "grad_norm": 0.7596197724342346,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 20539
+    },
+    {
+      "epoch": 0.2054,
+      "grad_norm": 0.8910565972328186,
+      "learning_rate": 0.003,
+      "loss": 4.007,
+      "step": 20540
+    },
+    {
+      "epoch": 0.20541,
+      "grad_norm": 1.1327186822891235,
+      "learning_rate": 0.003,
+      "loss": 3.9927,
+      "step": 20541
+    },
+    {
+      "epoch": 0.20542,
+      "grad_norm": 1.0431007146835327,
+      "learning_rate": 0.003,
+      "loss": 4.0518,
+      "step": 20542
+    },
+    {
+      "epoch": 0.20543,
+      "grad_norm": 1.0606658458709717,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20543
+    },
+    {
+      "epoch": 0.20544,
+      "grad_norm": 1.0663015842437744,
+      "learning_rate": 0.003,
+      "loss": 3.9964,
+      "step": 20544
+    },
+    {
+      "epoch": 0.20545,
+      "grad_norm": 0.9315351247787476,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 20545
+    },
+    {
+      "epoch": 0.20546,
+      "grad_norm": 0.8413115739822388,
+      "learning_rate": 0.003,
+      "loss": 3.9648,
+      "step": 20546
+    },
+    {
+      "epoch": 0.20547,
+      "grad_norm": 0.8238143920898438,
+      "learning_rate": 0.003,
+      "loss": 3.9847,
+      "step": 20547
+    },
+    {
+      "epoch": 0.20548,
+      "grad_norm": 0.9945659041404724,
+      "learning_rate": 0.003,
+      "loss": 4.0338,
+      "step": 20548
+    },
+    {
+      "epoch": 0.20549,
+      "grad_norm": 1.103825569152832,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 20549
+    },
+    {
+      "epoch": 0.2055,
+      "grad_norm": 0.9757719039916992,
+      "learning_rate": 0.003,
+      "loss": 4.0201,
+      "step": 20550
+    },
+    {
+      "epoch": 0.20551,
+      "grad_norm": 1.0624744892120361,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 20551
+    },
+    {
+      "epoch": 0.20552,
+      "grad_norm": 0.9679355621337891,
+      "learning_rate": 0.003,
+      "loss": 4.0024,
+      "step": 20552
+    },
+    {
+      "epoch": 0.20553,
+      "grad_norm": 1.1138170957565308,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 20553
+    },
+    {
+      "epoch": 0.20554,
+      "grad_norm": 1.2719899415969849,
+      "learning_rate": 0.003,
+      "loss": 4.0187,
+      "step": 20554
+    },
+    {
+      "epoch": 0.20555,
+      "grad_norm": 0.926088809967041,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 20555
+    },
+    {
+      "epoch": 0.20556,
+      "grad_norm": 0.9676206111907959,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20556
+    },
+    {
+      "epoch": 0.20557,
+      "grad_norm": 0.8924266695976257,
+      "learning_rate": 0.003,
+      "loss": 4.0096,
+      "step": 20557
+    },
+    {
+      "epoch": 0.20558,
+      "grad_norm": 0.6859793066978455,
+      "learning_rate": 0.003,
+      "loss": 4.0264,
+      "step": 20558
+    },
+    {
+      "epoch": 0.20559,
+      "grad_norm": 0.7555159330368042,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 20559
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.7331491112709045,
+      "learning_rate": 0.003,
+      "loss": 4.0128,
+      "step": 20560
+    },
+    {
+      "epoch": 0.20561,
+      "grad_norm": 0.7226747870445251,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20561
+    },
+    {
+      "epoch": 0.20562,
+      "grad_norm": 0.8749589323997498,
+      "learning_rate": 0.003,
+      "loss": 4.0064,
+      "step": 20562
+    },
+    {
+      "epoch": 0.20563,
+      "grad_norm": 1.0173287391662598,
+      "learning_rate": 0.003,
+      "loss": 4.0203,
+      "step": 20563
+    },
+    {
+      "epoch": 0.20564,
+      "grad_norm": 0.9920432567596436,
+      "learning_rate": 0.003,
+      "loss": 3.9898,
+      "step": 20564
+    },
+    {
+      "epoch": 0.20565,
+      "grad_norm": 0.8987650871276855,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 20565
+    },
+    {
+      "epoch": 0.20566,
+      "grad_norm": 0.9268227219581604,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 20566
+    },
+    {
+      "epoch": 0.20567,
+      "grad_norm": 0.8564572930335999,
+      "learning_rate": 0.003,
+      "loss": 3.9759,
+      "step": 20567
+    },
+    {
+      "epoch": 0.20568,
+      "grad_norm": 0.6740842461585999,
+      "learning_rate": 0.003,
+      "loss": 3.971,
+      "step": 20568
+    },
+    {
+      "epoch": 0.20569,
+      "grad_norm": 0.5516361594200134,
+      "learning_rate": 0.003,
+      "loss": 3.9737,
+      "step": 20569
+    },
+    {
+      "epoch": 0.2057,
+      "grad_norm": 0.5278864502906799,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 20570
+    },
+    {
+      "epoch": 0.20571,
+      "grad_norm": 0.5913801193237305,
+      "learning_rate": 0.003,
+      "loss": 3.9957,
+      "step": 20571
+    },
+    {
+      "epoch": 0.20572,
+      "grad_norm": 0.6298001408576965,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 20572
+    },
+    {
+      "epoch": 0.20573,
+      "grad_norm": 0.756517767906189,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20573
+    },
+    {
+      "epoch": 0.20574,
+      "grad_norm": 0.9460106492042542,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 20574
+    },
+    {
+      "epoch": 0.20575,
+      "grad_norm": 1.0529441833496094,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 20575
+    },
+    {
+      "epoch": 0.20576,
+      "grad_norm": 0.7504254579544067,
+      "learning_rate": 0.003,
+      "loss": 3.9728,
+      "step": 20576
+    },
+    {
+      "epoch": 0.20577,
+      "grad_norm": 0.7312063574790955,
+      "learning_rate": 0.003,
+      "loss": 3.9741,
+      "step": 20577
+    },
+    {
+      "epoch": 0.20578,
+      "grad_norm": 0.8165370225906372,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 20578
+    },
+    {
+      "epoch": 0.20579,
+      "grad_norm": 0.8489643931388855,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 20579
+    },
+    {
+      "epoch": 0.2058,
+      "grad_norm": 0.749298095703125,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 20580
+    },
+    {
+      "epoch": 0.20581,
+      "grad_norm": 0.8343328833580017,
+      "learning_rate": 0.003,
+      "loss": 3.9906,
+      "step": 20581
+    },
+    {
+      "epoch": 0.20582,
+      "grad_norm": 0.9363728165626526,
+      "learning_rate": 0.003,
+      "loss": 4.0131,
+      "step": 20582
+    },
+    {
+      "epoch": 0.20583,
+      "grad_norm": 0.9963992238044739,
+      "learning_rate": 0.003,
+      "loss": 3.9808,
+      "step": 20583
+    },
+    {
+      "epoch": 0.20584,
+      "grad_norm": 0.9953159689903259,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 20584
+    },
+    {
+      "epoch": 0.20585,
+      "grad_norm": 1.0232398509979248,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 20585
+    },
+    {
+      "epoch": 0.20586,
+      "grad_norm": 0.9123949408531189,
+      "learning_rate": 0.003,
+      "loss": 3.9951,
+      "step": 20586
+    },
+    {
+      "epoch": 0.20587,
+      "grad_norm": 0.9217385649681091,
+      "learning_rate": 0.003,
+      "loss": 4.0183,
+      "step": 20587
+    },
+    {
+      "epoch": 0.20588,
+      "grad_norm": 1.0196894407272339,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 20588
+    },
+    {
+      "epoch": 0.20589,
+      "grad_norm": 1.2130417823791504,
+      "learning_rate": 0.003,
+      "loss": 4.0086,
+      "step": 20589
+    },
+    {
+      "epoch": 0.2059,
+      "grad_norm": 0.8121554255485535,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 20590
+    },
+    {
+      "epoch": 0.20591,
+      "grad_norm": 0.7075498104095459,
+      "learning_rate": 0.003,
+      "loss": 3.9949,
+      "step": 20591
+    },
+    {
+      "epoch": 0.20592,
+      "grad_norm": 0.6768858432769775,
+      "learning_rate": 0.003,
+      "loss": 3.9998,
+      "step": 20592
+    },
+    {
+      "epoch": 0.20593,
+      "grad_norm": 0.8094514608383179,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 20593
+    },
+    {
+      "epoch": 0.20594,
+      "grad_norm": 1.0636597871780396,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20594
+    },
+    {
+      "epoch": 0.20595,
+      "grad_norm": 1.216055154800415,
+      "learning_rate": 0.003,
+      "loss": 4.0089,
+      "step": 20595
+    },
+    {
+      "epoch": 0.20596,
+      "grad_norm": 0.8186025619506836,
+      "learning_rate": 0.003,
+      "loss": 4.0232,
+      "step": 20596
+    },
+    {
+      "epoch": 0.20597,
+      "grad_norm": 0.7135723233222961,
+      "learning_rate": 0.003,
+      "loss": 3.9743,
+      "step": 20597
+    },
+    {
+      "epoch": 0.20598,
+      "grad_norm": 0.7067899107933044,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 20598
+    },
+    {
+      "epoch": 0.20599,
+      "grad_norm": 0.6274234056472778,
+      "learning_rate": 0.003,
+      "loss": 4.0436,
+      "step": 20599
+    },
+    {
+      "epoch": 0.206,
+      "grad_norm": 0.7237199544906616,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 20600
+    },
+    {
+      "epoch": 0.20601,
+      "grad_norm": 0.7106822729110718,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 20601
+    },
+    {
+      "epoch": 0.20602,
+      "grad_norm": 0.8345945477485657,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 20602
+    },
+    {
+      "epoch": 0.20603,
+      "grad_norm": 1.030349612236023,
+      "learning_rate": 0.003,
+      "loss": 3.9822,
+      "step": 20603
+    },
+    {
+      "epoch": 0.20604,
+      "grad_norm": 1.2641339302062988,
+      "learning_rate": 0.003,
+      "loss": 4.0063,
+      "step": 20604
+    },
+    {
+      "epoch": 0.20605,
+      "grad_norm": 0.5765160918235779,
+      "learning_rate": 0.003,
+      "loss": 4.0105,
+      "step": 20605
+    },
+    {
+      "epoch": 0.20606,
+      "grad_norm": 0.7353900074958801,
+      "learning_rate": 0.003,
+      "loss": 3.9683,
+      "step": 20606
+    },
+    {
+      "epoch": 0.20607,
+      "grad_norm": 0.771910548210144,
+      "learning_rate": 0.003,
+      "loss": 4.0249,
+      "step": 20607
+    },
+    {
+      "epoch": 0.20608,
+      "grad_norm": 0.9372097253799438,
+      "learning_rate": 0.003,
+      "loss": 3.9804,
+      "step": 20608
+    },
+    {
+      "epoch": 0.20609,
+      "grad_norm": 1.223936676979065,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 20609
+    },
+    {
+      "epoch": 0.2061,
+      "grad_norm": 0.6746963858604431,
+      "learning_rate": 0.003,
+      "loss": 3.9864,
+      "step": 20610
+    },
+    {
+      "epoch": 0.20611,
+      "grad_norm": 0.6881647109985352,
+      "learning_rate": 0.003,
+      "loss": 4.0148,
+      "step": 20611
+    },
+    {
+      "epoch": 0.20612,
+      "grad_norm": 0.9571933150291443,
+      "learning_rate": 0.003,
+      "loss": 4.0153,
+      "step": 20612
+    },
+    {
+      "epoch": 0.20613,
+      "grad_norm": 1.0425173044204712,
+      "learning_rate": 0.003,
+      "loss": 3.9703,
+      "step": 20613
+    },
+    {
+      "epoch": 0.20614,
+      "grad_norm": 1.009878158569336,
+      "learning_rate": 0.003,
+      "loss": 4.024,
+      "step": 20614
+    },
+    {
+      "epoch": 0.20615,
+      "grad_norm": 0.8970416784286499,
+      "learning_rate": 0.003,
+      "loss": 3.9631,
+      "step": 20615
+    },
+    {
+      "epoch": 0.20616,
+      "grad_norm": 0.800406277179718,
+      "learning_rate": 0.003,
+      "loss": 4.0058,
+      "step": 20616
+    },
+    {
+      "epoch": 0.20617,
+      "grad_norm": 0.9475623965263367,
+      "learning_rate": 0.003,
+      "loss": 3.9887,
+      "step": 20617
+    },
+    {
+      "epoch": 0.20618,
+      "grad_norm": 1.1195120811462402,
+      "learning_rate": 0.003,
+      "loss": 4.0102,
+      "step": 20618
+    },
+    {
+      "epoch": 0.20619,
+      "grad_norm": 0.9776735901832581,
+      "learning_rate": 0.003,
+      "loss": 4.0405,
+      "step": 20619
+    },
+    {
+      "epoch": 0.2062,
+      "grad_norm": 0.8268507719039917,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 20620
+    },
+    {
+      "epoch": 0.20621,
+      "grad_norm": 0.6747649312019348,
+      "learning_rate": 0.003,
+      "loss": 4.012,
+      "step": 20621
+    },
+    {
+      "epoch": 0.20622,
+      "grad_norm": 0.6576265096664429,
+      "learning_rate": 0.003,
+      "loss": 3.9799,
+      "step": 20622
+    },
+    {
+      "epoch": 0.20623,
+      "grad_norm": 0.72371506690979,
+      "learning_rate": 0.003,
+      "loss": 3.9899,
+      "step": 20623
+    },
+    {
+      "epoch": 0.20624,
+      "grad_norm": 0.7435429096221924,
+      "learning_rate": 0.003,
+      "loss": 3.9849,
+      "step": 20624
+    },
+    {
+      "epoch": 0.20625,
+      "grad_norm": 0.6706408262252808,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 20625
+    },
+    {
+      "epoch": 0.20626,
+      "grad_norm": 0.6452346444129944,
+      "learning_rate": 0.003,
+      "loss": 4.0208,
+      "step": 20626
+    },
+    {
+      "epoch": 0.20627,
+      "grad_norm": 0.6868218183517456,
+      "learning_rate": 0.003,
+      "loss": 4.0014,
+      "step": 20627
+    },
+    {
+      "epoch": 0.20628,
+      "grad_norm": 0.6958888173103333,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 20628
+    },
+    {
+      "epoch": 0.20629,
+      "grad_norm": 0.9264516830444336,
+      "learning_rate": 0.003,
+      "loss": 3.9972,
+      "step": 20629
+    },
+    {
+      "epoch": 0.2063,
+      "grad_norm": 1.2161481380462646,
+      "learning_rate": 0.003,
+      "loss": 3.9774,
+      "step": 20630
+    },
+    {
+      "epoch": 0.20631,
+      "grad_norm": 0.6730556488037109,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 20631
+    },
+    {
+      "epoch": 0.20632,
+      "grad_norm": 0.6242616176605225,
+      "learning_rate": 0.003,
+      "loss": 3.9935,
+      "step": 20632
+    },
+    {
+      "epoch": 0.20633,
+      "grad_norm": 0.7033523321151733,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 20633
+    },
+    {
+      "epoch": 0.20634,
+      "grad_norm": 0.7647010684013367,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20634
+    },
+    {
+      "epoch": 0.20635,
+      "grad_norm": 0.9941380023956299,
+      "learning_rate": 0.003,
+      "loss": 3.98,
+      "step": 20635
+    },
+    {
+      "epoch": 0.20636,
+      "grad_norm": 1.1974639892578125,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 20636
+    },
+    {
+      "epoch": 0.20637,
+      "grad_norm": 0.6021768450737,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 20637
+    },
+    {
+      "epoch": 0.20638,
+      "grad_norm": 0.7827711701393127,
+      "learning_rate": 0.003,
+      "loss": 3.981,
+      "step": 20638
+    },
+    {
+      "epoch": 0.20639,
+      "grad_norm": 0.9160796403884888,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 20639
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 1.1267950534820557,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 20640
+    },
+    {
+      "epoch": 0.20641,
+      "grad_norm": 0.9441350698471069,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 20641
+    },
+    {
+      "epoch": 0.20642,
+      "grad_norm": 0.6999184489250183,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 20642
+    },
+    {
+      "epoch": 0.20643,
+      "grad_norm": 0.6596429347991943,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 20643
+    },
+    {
+      "epoch": 0.20644,
+      "grad_norm": 0.6299886703491211,
+      "learning_rate": 0.003,
+      "loss": 4.0103,
+      "step": 20644
+    },
+    {
+      "epoch": 0.20645,
+      "grad_norm": 0.6572009921073914,
+      "learning_rate": 0.003,
+      "loss": 4.0092,
+      "step": 20645
+    },
+    {
+      "epoch": 0.20646,
+      "grad_norm": 0.7711091637611389,
+      "learning_rate": 0.003,
+      "loss": 3.9984,
+      "step": 20646
+    },
+    {
+      "epoch": 0.20647,
+      "grad_norm": 0.8173057436943054,
+      "learning_rate": 0.003,
+      "loss": 4.0009,
+      "step": 20647
+    },
+    {
+      "epoch": 0.20648,
+      "grad_norm": 1.027649998664856,
+      "learning_rate": 0.003,
+      "loss": 3.9758,
+      "step": 20648
+    },
+    {
+      "epoch": 0.20649,
+      "grad_norm": 1.249096155166626,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 20649
+    },
+    {
+      "epoch": 0.2065,
+      "grad_norm": 0.6178358793258667,
+      "learning_rate": 0.003,
+      "loss": 3.9582,
+      "step": 20650
+    },
+    {
+      "epoch": 0.20651,
+      "grad_norm": 0.6693252921104431,
+      "learning_rate": 0.003,
+      "loss": 3.9999,
+      "step": 20651
+    },
+    {
+      "epoch": 0.20652,
+      "grad_norm": 0.7295918464660645,
+      "learning_rate": 0.003,
+      "loss": 3.9404,
+      "step": 20652
+    },
+    {
+      "epoch": 0.20653,
+      "grad_norm": 0.789218544960022,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 20653
+    },
+    {
+      "epoch": 0.20654,
+      "grad_norm": 0.8371591567993164,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 20654
+    },
+    {
+      "epoch": 0.20655,
+      "grad_norm": 0.7333207726478577,
+      "learning_rate": 0.003,
+      "loss": 3.9895,
+      "step": 20655
+    },
+    {
+      "epoch": 0.20656,
+      "grad_norm": 0.6638398170471191,
+      "learning_rate": 0.003,
+      "loss": 3.9938,
+      "step": 20656
+    },
+    {
+      "epoch": 0.20657,
+      "grad_norm": 0.7411385178565979,
+      "learning_rate": 0.003,
+      "loss": 4.0307,
+      "step": 20657
+    },
+    {
+      "epoch": 0.20658,
+      "grad_norm": 0.8418099284172058,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 20658
+    },
+    {
+      "epoch": 0.20659,
+      "grad_norm": 0.8741323351860046,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 20659
+    },
+    {
+      "epoch": 0.2066,
+      "grad_norm": 0.7878667712211609,
+      "learning_rate": 0.003,
+      "loss": 3.9778,
+      "step": 20660
+    },
+    {
+      "epoch": 0.20661,
+      "grad_norm": 0.6896508932113647,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 20661
+    },
+    {
+      "epoch": 0.20662,
+      "grad_norm": 0.7355780601501465,
+      "learning_rate": 0.003,
+      "loss": 3.9909,
+      "step": 20662
+    },
+    {
+      "epoch": 0.20663,
+      "grad_norm": 0.8107819557189941,
+      "learning_rate": 0.003,
+      "loss": 3.9825,
+      "step": 20663
+    },
+    {
+      "epoch": 0.20664,
+      "grad_norm": 1.0781714916229248,
+      "learning_rate": 0.003,
+      "loss": 3.9673,
+      "step": 20664
+    },
+    {
+      "epoch": 0.20665,
+      "grad_norm": 1.2095935344696045,
+      "learning_rate": 0.003,
+      "loss": 3.9941,
+      "step": 20665
+    },
+    {
+      "epoch": 0.20666,
+      "grad_norm": 0.5785348415374756,
+      "learning_rate": 0.003,
+      "loss": 3.9963,
+      "step": 20666
+    },
+    {
+      "epoch": 0.20667,
+      "grad_norm": 0.6400589942932129,
+      "learning_rate": 0.003,
+      "loss": 4.0175,
+      "step": 20667
+    },
+    {
+      "epoch": 0.20668,
+      "grad_norm": 0.7440767288208008,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 20668
+    },
+    {
+      "epoch": 0.20669,
+      "grad_norm": 0.782042384147644,
+      "learning_rate": 0.003,
+      "loss": 3.9614,
+      "step": 20669
+    },
+    {
+      "epoch": 0.2067,
+      "grad_norm": 0.8038446307182312,
+      "learning_rate": 0.003,
+      "loss": 4.0152,
+      "step": 20670
+    },
+    {
+      "epoch": 0.20671,
+      "grad_norm": 0.9218879342079163,
+      "learning_rate": 0.003,
+      "loss": 3.9958,
+      "step": 20671
+    },
+    {
+      "epoch": 0.20672,
+      "grad_norm": 1.0722054243087769,
+      "learning_rate": 0.003,
+      "loss": 4.0247,
+      "step": 20672
+    },
+    {
+      "epoch": 0.20673,
+      "grad_norm": 1.2632372379302979,
+      "learning_rate": 0.003,
+      "loss": 4.0095,
+      "step": 20673
+    },
+    {
+      "epoch": 0.20674,
+      "grad_norm": 0.712864100933075,
+      "learning_rate": 0.003,
+      "loss": 3.975,
+      "step": 20674
+    },
+    {
+      "epoch": 0.20675,
+      "grad_norm": 0.7353287935256958,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 20675
+    },
+    {
+      "epoch": 0.20676,
+      "grad_norm": 0.7543138861656189,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 20676
+    },
+    {
+      "epoch": 0.20677,
+      "grad_norm": 0.9637554287910461,
+      "learning_rate": 0.003,
+      "loss": 3.9674,
+      "step": 20677
+    },
+    {
+      "epoch": 0.20678,
+      "grad_norm": 0.9501532912254333,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 20678
+    },
+    {
+      "epoch": 0.20679,
+      "grad_norm": 1.0105867385864258,
+      "learning_rate": 0.003,
+      "loss": 4.041,
+      "step": 20679
+    },
+    {
+      "epoch": 0.2068,
+      "grad_norm": 1.3445831537246704,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 20680
+    },
+    {
+      "epoch": 0.20681,
+      "grad_norm": 0.7950366139411926,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 20681
+    },
+    {
+      "epoch": 0.20682,
+      "grad_norm": 0.7824738025665283,
+      "learning_rate": 0.003,
+      "loss": 3.9689,
+      "step": 20682
+    },
+    {
+      "epoch": 0.20683,
+      "grad_norm": 0.757445752620697,
+      "learning_rate": 0.003,
+      "loss": 3.9931,
+      "step": 20683
+    },
+    {
+      "epoch": 0.20684,
+      "grad_norm": 0.7416343688964844,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 20684
+    },
+    {
+      "epoch": 0.20685,
+      "grad_norm": 0.8872725367546082,
+      "learning_rate": 0.003,
+      "loss": 3.9846,
+      "step": 20685
+    },
+    {
+      "epoch": 0.20686,
+      "grad_norm": 1.1165724992752075,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 20686
+    },
+    {
+      "epoch": 0.20687,
+      "grad_norm": 0.9379168152809143,
+      "learning_rate": 0.003,
+      "loss": 3.9727,
+      "step": 20687
+    },
+    {
+      "epoch": 0.20688,
+      "grad_norm": 1.0233606100082397,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20688
+    },
+    {
+      "epoch": 0.20689,
+      "grad_norm": 0.9732743501663208,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 20689
+    },
+    {
+      "epoch": 0.2069,
+      "grad_norm": 0.8878822326660156,
+      "learning_rate": 0.003,
+      "loss": 4.009,
+      "step": 20690
+    },
+    {
+      "epoch": 0.20691,
+      "grad_norm": 0.9220402836799622,
+      "learning_rate": 0.003,
+      "loss": 4.0248,
+      "step": 20691
+    },
+    {
+      "epoch": 0.20692,
+      "grad_norm": 0.9080644845962524,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 20692
+    },
+    {
+      "epoch": 0.20693,
+      "grad_norm": 0.8790196180343628,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 20693
+    },
+    {
+      "epoch": 0.20694,
+      "grad_norm": 0.7577972412109375,
+      "learning_rate": 0.003,
+      "loss": 3.9934,
+      "step": 20694
+    },
+    {
+      "epoch": 0.20695,
+      "grad_norm": 0.658717930316925,
+      "learning_rate": 0.003,
+      "loss": 3.988,
+      "step": 20695
+    },
+    {
+      "epoch": 0.20696,
+      "grad_norm": 0.6440026760101318,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 20696
+    },
+    {
+      "epoch": 0.20697,
+      "grad_norm": 0.7962105870246887,
+      "learning_rate": 0.003,
+      "loss": 3.9911,
+      "step": 20697
+    },
+    {
+      "epoch": 0.20698,
+      "grad_norm": 0.8110194802284241,
+      "learning_rate": 0.003,
+      "loss": 4.0001,
+      "step": 20698
+    },
+    {
+      "epoch": 0.20699,
+      "grad_norm": 0.8429470658302307,
+      "learning_rate": 0.003,
+      "loss": 4.0243,
+      "step": 20699
+    },
+    {
+      "epoch": 0.207,
+      "grad_norm": 0.925254762172699,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 20700
+    },
+    {
+      "epoch": 0.20701,
+      "grad_norm": 0.9512263536453247,
+      "learning_rate": 0.003,
+      "loss": 4.0143,
+      "step": 20701
+    },
+    {
+      "epoch": 0.20702,
+      "grad_norm": 0.9852175116539001,
+      "learning_rate": 0.003,
+      "loss": 4.0192,
+      "step": 20702
+    },
+    {
+      "epoch": 0.20703,
+      "grad_norm": 1.0767631530761719,
+      "learning_rate": 0.003,
+      "loss": 3.991,
+      "step": 20703
+    },
+    {
+      "epoch": 0.20704,
+      "grad_norm": 1.0503569841384888,
+      "learning_rate": 0.003,
+      "loss": 4.0077,
+      "step": 20704
+    },
+    {
+      "epoch": 0.20705,
+      "grad_norm": 0.9670467972755432,
+      "learning_rate": 0.003,
+      "loss": 3.9965,
+      "step": 20705
+    },
+    {
+      "epoch": 0.20706,
+      "grad_norm": 0.7200356721878052,
+      "learning_rate": 0.003,
+      "loss": 3.9839,
+      "step": 20706
+    },
+    {
+      "epoch": 0.20707,
+      "grad_norm": 0.6849318742752075,
+      "learning_rate": 0.003,
+      "loss": 3.9883,
+      "step": 20707
+    },
+    {
+      "epoch": 0.20708,
+      "grad_norm": 0.7746059894561768,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 20708
+    },
+    {
+      "epoch": 0.20709,
+      "grad_norm": 0.9228622913360596,
+      "learning_rate": 0.003,
+      "loss": 4.0097,
+      "step": 20709
+    },
+    {
+      "epoch": 0.2071,
+      "grad_norm": 1.082072377204895,
+      "learning_rate": 0.003,
+      "loss": 3.9812,
+      "step": 20710
+    },
+    {
+      "epoch": 0.20711,
+      "grad_norm": 0.9330927133560181,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 20711
+    },
+    {
+      "epoch": 0.20712,
+      "grad_norm": 0.7052283883094788,
+      "learning_rate": 0.003,
+      "loss": 4.0167,
+      "step": 20712
+    },
+    {
+      "epoch": 0.20713,
+      "grad_norm": 0.6981417536735535,
+      "learning_rate": 0.003,
+      "loss": 3.9852,
+      "step": 20713
+    },
+    {
+      "epoch": 0.20714,
+      "grad_norm": 0.8098515272140503,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 20714
+    },
+    {
+      "epoch": 0.20715,
+      "grad_norm": 0.654692530632019,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 20715
+    },
+    {
+      "epoch": 0.20716,
+      "grad_norm": 0.6670729517936707,
+      "learning_rate": 0.003,
+      "loss": 3.9756,
+      "step": 20716
+    },
+    {
+      "epoch": 0.20717,
+      "grad_norm": 0.6163148880004883,
+      "learning_rate": 0.003,
+      "loss": 3.9801,
+      "step": 20717
+    },
+    {
+      "epoch": 0.20718,
+      "grad_norm": 0.6592658162117004,
+      "learning_rate": 0.003,
+      "loss": 4.0061,
+      "step": 20718
+    },
+    {
+      "epoch": 0.20719,
+      "grad_norm": 0.6538748741149902,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 20719
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.7887983918190002,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 20720
+    },
+    {
+      "epoch": 0.20721,
+      "grad_norm": 0.9480772614479065,
+      "learning_rate": 0.003,
+      "loss": 4.0315,
+      "step": 20721
+    },
+    {
+      "epoch": 0.20722,
+      "grad_norm": 1.082365870475769,
+      "learning_rate": 0.003,
+      "loss": 4.0142,
+      "step": 20722
+    },
+    {
+      "epoch": 0.20723,
+      "grad_norm": 1.0134780406951904,
+      "learning_rate": 0.003,
+      "loss": 4.0156,
+      "step": 20723
+    },
+    {
+      "epoch": 0.20724,
+      "grad_norm": 0.8613335490226746,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 20724
+    },
+    {
+      "epoch": 0.20725,
+      "grad_norm": 0.7501160502433777,
+      "learning_rate": 0.003,
+      "loss": 4.0296,
+      "step": 20725
+    },
+    {
+      "epoch": 0.20726,
+      "grad_norm": 0.7889297604560852,
+      "learning_rate": 0.003,
+      "loss": 3.9914,
+      "step": 20726
+    },
+    {
+      "epoch": 0.20727,
+      "grad_norm": 0.8222788572311401,
+      "learning_rate": 0.003,
+      "loss": 3.9865,
+      "step": 20727
+    },
+    {
+      "epoch": 0.20728,
+      "grad_norm": 0.8500961661338806,
+      "learning_rate": 0.003,
+      "loss": 3.9996,
+      "step": 20728
+    },
+    {
+      "epoch": 0.20729,
+      "grad_norm": 1.0210720300674438,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 20729
+    },
+    {
+      "epoch": 0.2073,
+      "grad_norm": 1.1979697942733765,
+      "learning_rate": 0.003,
+      "loss": 4.0133,
+      "step": 20730
+    },
+    {
+      "epoch": 0.20731,
+      "grad_norm": 0.8383584022521973,
+      "learning_rate": 0.003,
+      "loss": 3.9827,
+      "step": 20731
+    },
+    {
+      "epoch": 0.20732,
+      "grad_norm": 0.7667802572250366,
+      "learning_rate": 0.003,
+      "loss": 3.9574,
+      "step": 20732
+    },
+    {
+      "epoch": 0.20733,
+      "grad_norm": 0.6863619089126587,
+      "learning_rate": 0.003,
+      "loss": 3.9832,
+      "step": 20733
+    },
+    {
+      "epoch": 0.20734,
+      "grad_norm": 0.7429079413414001,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 20734
+    },
+    {
+      "epoch": 0.20735,
+      "grad_norm": 0.7810400724411011,
+      "learning_rate": 0.003,
+      "loss": 4.0257,
+      "step": 20735
+    },
+    {
+      "epoch": 0.20736,
+      "grad_norm": 0.7995291352272034,
+      "learning_rate": 0.003,
+      "loss": 3.9989,
+      "step": 20736
+    },
+    {
+      "epoch": 0.20737,
+      "grad_norm": 1.1064059734344482,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 20737
+    },
+    {
+      "epoch": 0.20738,
+      "grad_norm": 1.1041861772537231,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 20738
+    },
+    {
+      "epoch": 0.20739,
+      "grad_norm": 0.9637973308563232,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 20739
+    },
+    {
+      "epoch": 0.2074,
+      "grad_norm": 0.9645422697067261,
+      "learning_rate": 0.003,
+      "loss": 4.0065,
+      "step": 20740
+    },
+    {
+      "epoch": 0.20741,
+      "grad_norm": 1.0687763690948486,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 20741
+    },
+    {
+      "epoch": 0.20742,
+      "grad_norm": 0.9210143089294434,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 20742
+    },
+    {
+      "epoch": 0.20743,
+      "grad_norm": 0.753804087638855,
+      "learning_rate": 0.003,
+      "loss": 4.0083,
+      "step": 20743
+    },
+    {
+      "epoch": 0.20744,
+      "grad_norm": 0.8265671133995056,
+      "learning_rate": 0.003,
+      "loss": 4.032,
+      "step": 20744
+    },
+    {
+      "epoch": 0.20745,
+      "grad_norm": 0.8992443680763245,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 20745
+    },
+    {
+      "epoch": 0.20746,
+      "grad_norm": 0.948488175868988,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 20746
+    },
+    {
+      "epoch": 0.20747,
+      "grad_norm": 0.8744862675666809,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 20747
+    },
+    {
+      "epoch": 0.20748,
+      "grad_norm": 0.8626583218574524,
+      "learning_rate": 0.003,
+      "loss": 3.9932,
+      "step": 20748
+    },
+    {
+      "epoch": 0.20749,
+      "grad_norm": 1.053406000137329,
+      "learning_rate": 0.003,
+      "loss": 4.0051,
+      "step": 20749
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.8808895349502563,
+      "learning_rate": 0.003,
+      "loss": 4.0316,
+      "step": 20750
+    },
+    {
+      "epoch": 0.20751,
+      "grad_norm": 0.7860477566719055,
+      "learning_rate": 0.003,
+      "loss": 3.9975,
+      "step": 20751
+    },
+    {
+      "epoch": 0.20752,
+      "grad_norm": 0.887262225151062,
+      "learning_rate": 0.003,
+      "loss": 4.0348,
+      "step": 20752
+    },
+    {
+      "epoch": 0.20753,
+      "grad_norm": 0.9119638204574585,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 20753
+    },
+    {
+      "epoch": 0.20754,
+      "grad_norm": 0.8986718654632568,
+      "learning_rate": 0.003,
+      "loss": 4.0076,
+      "step": 20754
+    },
+    {
+      "epoch": 0.20755,
+      "grad_norm": 0.8674590587615967,
+      "learning_rate": 0.003,
+      "loss": 3.9993,
+      "step": 20755
+    },
+    {
+      "epoch": 0.20756,
+      "grad_norm": 1.0189762115478516,
+      "learning_rate": 0.003,
+      "loss": 3.9919,
+      "step": 20756
+    },
+    {
+      "epoch": 0.20757,
+      "grad_norm": 1.087554693222046,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20757
+    },
+    {
+      "epoch": 0.20758,
+      "grad_norm": 0.9033676981925964,
+      "learning_rate": 0.003,
+      "loss": 3.9966,
+      "step": 20758
+    },
+    {
+      "epoch": 0.20759,
+      "grad_norm": 0.8090853095054626,
+      "learning_rate": 0.003,
+      "loss": 3.9813,
+      "step": 20759
+    },
+    {
+      "epoch": 0.2076,
+      "grad_norm": 0.7151592969894409,
+      "learning_rate": 0.003,
+      "loss": 3.9617,
+      "step": 20760
+    },
+    {
+      "epoch": 0.20761,
+      "grad_norm": 0.6113814115524292,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 20761
+    },
+    {
+      "epoch": 0.20762,
+      "grad_norm": 0.5708845257759094,
+      "learning_rate": 0.003,
+      "loss": 3.9982,
+      "step": 20762
+    },
+    {
+      "epoch": 0.20763,
+      "grad_norm": 0.6272398829460144,
+      "learning_rate": 0.003,
+      "loss": 3.9913,
+      "step": 20763
+    },
+    {
+      "epoch": 0.20764,
+      "grad_norm": 0.6918250918388367,
+      "learning_rate": 0.003,
+      "loss": 4.0112,
+      "step": 20764
+    },
+    {
+      "epoch": 0.20765,
+      "grad_norm": 0.6999816298484802,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 20765
+    },
+    {
+      "epoch": 0.20766,
+      "grad_norm": 0.6208664774894714,
+      "learning_rate": 0.003,
+      "loss": 4.0008,
+      "step": 20766
+    },
+    {
+      "epoch": 0.20767,
+      "grad_norm": 0.7076413035392761,
+      "learning_rate": 0.003,
+      "loss": 4.0106,
+      "step": 20767
+    },
+    {
+      "epoch": 0.20768,
+      "grad_norm": 0.8253567814826965,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 20768
+    },
+    {
+      "epoch": 0.20769,
+      "grad_norm": 0.9532641768455505,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 20769
+    },
+    {
+      "epoch": 0.2077,
+      "grad_norm": 1.3245623111724854,
+      "learning_rate": 0.003,
+      "loss": 4.0045,
+      "step": 20770
+    },
+    {
+      "epoch": 0.20771,
+      "grad_norm": 0.6137048006057739,
+      "learning_rate": 0.003,
+      "loss": 3.9498,
+      "step": 20771
+    },
+    {
+      "epoch": 0.20772,
+      "grad_norm": 0.5572570562362671,
+      "learning_rate": 0.003,
+      "loss": 3.9712,
+      "step": 20772
+    },
+    {
+      "epoch": 0.20773,
+      "grad_norm": 0.5751827955245972,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 20773
+    },
+    {
+      "epoch": 0.20774,
+      "grad_norm": 0.624350905418396,
+      "learning_rate": 0.003,
+      "loss": 3.9857,
+      "step": 20774
+    },
+    {
+      "epoch": 0.20775,
+      "grad_norm": 0.6695300340652466,
+      "learning_rate": 0.003,
+      "loss": 3.9902,
+      "step": 20775
+    },
+    {
+      "epoch": 0.20776,
+      "grad_norm": 0.7807313203811646,
+      "learning_rate": 0.003,
+      "loss": 3.9807,
+      "step": 20776
+    },
+    {
+      "epoch": 0.20777,
+      "grad_norm": 0.8226802349090576,
+      "learning_rate": 0.003,
+      "loss": 4.0222,
+      "step": 20777
+    },
+    {
+      "epoch": 0.20778,
+      "grad_norm": 0.7035885453224182,
+      "learning_rate": 0.003,
+      "loss": 4.0013,
+      "step": 20778
+    },
+    {
+      "epoch": 0.20779,
+      "grad_norm": 0.5964787602424622,
+      "learning_rate": 0.003,
+      "loss": 3.9841,
+      "step": 20779
+    },
+    {
+      "epoch": 0.2078,
+      "grad_norm": 0.6474304795265198,
+      "learning_rate": 0.003,
+      "loss": 3.9679,
+      "step": 20780
+    },
+    {
+      "epoch": 0.20781,
+      "grad_norm": 0.7350100874900818,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 20781
+    },
+    {
+      "epoch": 0.20782,
+      "grad_norm": 0.8760716319084167,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 20782
+    },
+    {
+      "epoch": 0.20783,
+      "grad_norm": 1.0762218236923218,
+      "learning_rate": 0.003,
+      "loss": 3.9901,
+      "step": 20783
+    },
+    {
+      "epoch": 0.20784,
+      "grad_norm": 1.0604761838912964,
+      "learning_rate": 0.003,
+      "loss": 3.9775,
+      "step": 20784
+    },
+    {
+      "epoch": 0.20785,
+      "grad_norm": 1.0759546756744385,
+      "learning_rate": 0.003,
+      "loss": 3.9848,
+      "step": 20785
+    },
+    {
+      "epoch": 0.20786,
+      "grad_norm": 1.051599144935608,
+      "learning_rate": 0.003,
+      "loss": 4.0241,
+      "step": 20786
+    },
+    {
+      "epoch": 0.20787,
+      "grad_norm": 1.355031967163086,
+      "learning_rate": 0.003,
+      "loss": 3.9991,
+      "step": 20787
+    },
+    {
+      "epoch": 0.20788,
+      "grad_norm": 0.9172400236129761,
+      "learning_rate": 0.003,
+      "loss": 3.976,
+      "step": 20788
+    },
+    {
+      "epoch": 0.20789,
+      "grad_norm": 0.9965242147445679,
+      "learning_rate": 0.003,
+      "loss": 4.0284,
+      "step": 20789
+    },
+    {
+      "epoch": 0.2079,
+      "grad_norm": 1.13826584815979,
+      "learning_rate": 0.003,
+      "loss": 4.025,
+      "step": 20790
+    },
+    {
+      "epoch": 0.20791,
+      "grad_norm": 0.9952272772789001,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 20791
+    },
+    {
+      "epoch": 0.20792,
+      "grad_norm": 0.8852728009223938,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 20792
+    },
+    {
+      "epoch": 0.20793,
+      "grad_norm": 0.950574517250061,
+      "learning_rate": 0.003,
+      "loss": 4.0401,
+      "step": 20793
+    },
+    {
+      "epoch": 0.20794,
+      "grad_norm": 1.0464208126068115,
+      "learning_rate": 0.003,
+      "loss": 4.0261,
+      "step": 20794
+    },
+    {
+      "epoch": 0.20795,
+      "grad_norm": 0.9873263239860535,
+      "learning_rate": 0.003,
+      "loss": 4.0046,
+      "step": 20795
+    },
+    {
+      "epoch": 0.20796,
+      "grad_norm": 1.309618592262268,
+      "learning_rate": 0.003,
+      "loss": 4.0205,
+      "step": 20796
+    },
+    {
+      "epoch": 0.20797,
+      "grad_norm": 0.74017333984375,
+      "learning_rate": 0.003,
+      "loss": 3.9647,
+      "step": 20797
+    },
+    {
+      "epoch": 0.20798,
+      "grad_norm": 0.6632053256034851,
+      "learning_rate": 0.003,
+      "loss": 4.0073,
+      "step": 20798
+    },
+    {
+      "epoch": 0.20799,
+      "grad_norm": 0.6369159817695618,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 20799
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.6135809421539307,
+      "learning_rate": 0.003,
+      "loss": 3.9903,
+      "step": 20800
+    },
+    {
+      "epoch": 0.20801,
+      "grad_norm": 0.5961670875549316,
+      "learning_rate": 0.003,
+      "loss": 4.0044,
+      "step": 20801
+    },
+    {
+      "epoch": 0.20802,
+      "grad_norm": 0.6939438581466675,
+      "learning_rate": 0.003,
+      "loss": 4.0144,
+      "step": 20802
+    },
+    {
+      "epoch": 0.20803,
+      "grad_norm": 0.8525596857070923,
+      "learning_rate": 0.003,
+      "loss": 3.989,
+      "step": 20803
+    },
+    {
+      "epoch": 0.20804,
+      "grad_norm": 0.898422360420227,
+      "learning_rate": 0.003,
+      "loss": 4.0351,
+      "step": 20804
+    },
+    {
+      "epoch": 0.20805,
+      "grad_norm": 0.8727675080299377,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 20805
+    },
+    {
+      "epoch": 0.20806,
+      "grad_norm": 0.8255506753921509,
+      "learning_rate": 0.003,
+      "loss": 4.0297,
+      "step": 20806
+    },
+    {
+      "epoch": 0.20807,
+      "grad_norm": 0.8468108773231506,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 20807
+    },
+    {
+      "epoch": 0.20808,
+      "grad_norm": 0.9557967782020569,
+      "learning_rate": 0.003,
+      "loss": 3.9826,
+      "step": 20808
+    },
+    {
+      "epoch": 0.20809,
+      "grad_norm": 1.1428208351135254,
+      "learning_rate": 0.003,
+      "loss": 4.016,
+      "step": 20809
+    },
+    {
+      "epoch": 0.2081,
+      "grad_norm": 0.9441230893135071,
+      "learning_rate": 0.003,
+      "loss": 4.0093,
+      "step": 20810
+    },
+    {
+      "epoch": 0.20811,
+      "grad_norm": 1.04192316532135,
+      "learning_rate": 0.003,
+      "loss": 4.0071,
+      "step": 20811
+    },
+    {
+      "epoch": 0.20812,
+      "grad_norm": 0.8832850456237793,
+      "learning_rate": 0.003,
+      "loss": 3.9981,
+      "step": 20812
+    },
+    {
+      "epoch": 0.20813,
+      "grad_norm": 0.7046552300453186,
+      "learning_rate": 0.003,
+      "loss": 4.0181,
+      "step": 20813
+    },
+    {
+      "epoch": 0.20814,
+      "grad_norm": 0.7719017267227173,
+      "learning_rate": 0.003,
+      "loss": 4.0214,
+      "step": 20814
+    },
+    {
+      "epoch": 0.20815,
+      "grad_norm": 0.8758888840675354,
+      "learning_rate": 0.003,
+      "loss": 4.0007,
+      "step": 20815
+    },
+    {
+      "epoch": 0.20816,
+      "grad_norm": 1.175139307975769,
+      "learning_rate": 0.003,
+      "loss": 4.0075,
+      "step": 20816
+    },
+    {
+      "epoch": 0.20817,
+      "grad_norm": 0.7121004462242126,
+      "learning_rate": 0.003,
+      "loss": 4.002,
+      "step": 20817
+    },
+    {
+      "epoch": 0.20818,
+      "grad_norm": 0.6497937440872192,
+      "learning_rate": 0.003,
+      "loss": 4.0298,
+      "step": 20818
+    },
+    {
+      "epoch": 0.20819,
+      "grad_norm": 0.8046772480010986,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 20819
+    },
+    {
+      "epoch": 0.2082,
+      "grad_norm": 0.9324015378952026,
+      "learning_rate": 0.003,
+      "loss": 4.0012,
+      "step": 20820
+    },
+    {
+      "epoch": 0.20821,
+      "grad_norm": 1.0890274047851562,
+      "learning_rate": 0.003,
+      "loss": 3.9836,
+      "step": 20821
+    },
+    {
+      "epoch": 0.20822,
+      "grad_norm": 0.8312063813209534,
+      "learning_rate": 0.003,
+      "loss": 4.0275,
+      "step": 20822
+    },
+    {
+      "epoch": 0.20823,
+      "grad_norm": 0.7680968046188354,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 20823
+    },
+    {
+      "epoch": 0.20824,
+      "grad_norm": 0.792281448841095,
+      "learning_rate": 0.003,
+      "loss": 4.0288,
+      "step": 20824
+    },
+    {
+      "epoch": 0.20825,
+      "grad_norm": 0.9612289667129517,
+      "learning_rate": 0.003,
+      "loss": 4.0213,
+      "step": 20825
+    },
+    {
+      "epoch": 0.20826,
+      "grad_norm": 1.093168020248413,
+      "learning_rate": 0.003,
+      "loss": 4.0291,
+      "step": 20826
+    },
+    {
+      "epoch": 0.20827,
+      "grad_norm": 0.8961457014083862,
+      "learning_rate": 0.003,
+      "loss": 3.9954,
+      "step": 20827
+    },
+    {
+      "epoch": 0.20828,
+      "grad_norm": 0.94273841381073,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 20828
+    },
+    {
+      "epoch": 0.20829,
+      "grad_norm": 1.0375275611877441,
+      "learning_rate": 0.003,
+      "loss": 4.01,
+      "step": 20829
+    },
+    {
+      "epoch": 0.2083,
+      "grad_norm": 1.0365378856658936,
+      "learning_rate": 0.003,
+      "loss": 3.9871,
+      "step": 20830
+    },
+    {
+      "epoch": 0.20831,
+      "grad_norm": 1.000985026359558,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 20831
+    },
+    {
+      "epoch": 0.20832,
+      "grad_norm": 0.8756856918334961,
+      "learning_rate": 0.003,
+      "loss": 3.9884,
+      "step": 20832
+    },
+    {
+      "epoch": 0.20833,
+      "grad_norm": 0.8284798264503479,
+      "learning_rate": 0.003,
+      "loss": 4.0345,
+      "step": 20833
+    },
+    {
+      "epoch": 0.20834,
+      "grad_norm": 0.7230366468429565,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 20834
+    },
+    {
+      "epoch": 0.20835,
+      "grad_norm": 0.6954699158668518,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 20835
+    },
+    {
+      "epoch": 0.20836,
+      "grad_norm": 0.7048358917236328,
+      "learning_rate": 0.003,
+      "loss": 4.0084,
+      "step": 20836
+    },
+    {
+      "epoch": 0.20837,
+      "grad_norm": 0.8838732838630676,
+      "learning_rate": 0.003,
+      "loss": 4.0304,
+      "step": 20837
+    },
+    {
+      "epoch": 0.20838,
+      "grad_norm": 1.0553494691848755,
+      "learning_rate": 0.003,
+      "loss": 3.9733,
+      "step": 20838
+    },
+    {
+      "epoch": 0.20839,
+      "grad_norm": 0.996741533279419,
+      "learning_rate": 0.003,
+      "loss": 4.0056,
+      "step": 20839
+    },
+    {
+      "epoch": 0.2084,
+      "grad_norm": 0.8984896540641785,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 20840
+    },
+    {
+      "epoch": 0.20841,
+      "grad_norm": 0.7694340944290161,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 20841
+    },
+    {
+      "epoch": 0.20842,
+      "grad_norm": 0.8103636503219604,
+      "learning_rate": 0.003,
+      "loss": 4.0193,
+      "step": 20842
+    },
+    {
+      "epoch": 0.20843,
+      "grad_norm": 0.8145402669906616,
+      "learning_rate": 0.003,
+      "loss": 3.9854,
+      "step": 20843
+    },
+    {
+      "epoch": 0.20844,
+      "grad_norm": 0.6706199049949646,
+      "learning_rate": 0.003,
+      "loss": 3.987,
+      "step": 20844
+    },
+    {
+      "epoch": 0.20845,
+      "grad_norm": 0.6789239645004272,
+      "learning_rate": 0.003,
+      "loss": 3.998,
+      "step": 20845
+    },
+    {
+      "epoch": 0.20846,
+      "grad_norm": 0.659748375415802,
+      "learning_rate": 0.003,
+      "loss": 3.9881,
+      "step": 20846
+    },
+    {
+      "epoch": 0.20847,
+      "grad_norm": 0.7569118738174438,
+      "learning_rate": 0.003,
+      "loss": 4.0234,
+      "step": 20847
+    },
+    {
+      "epoch": 0.20848,
+      "grad_norm": 0.7948017120361328,
+      "learning_rate": 0.003,
+      "loss": 3.9821,
+      "step": 20848
+    },
+    {
+      "epoch": 0.20849,
+      "grad_norm": 0.7857460975646973,
+      "learning_rate": 0.003,
+      "loss": 3.9907,
+      "step": 20849
+    },
+    {
+      "epoch": 0.2085,
+      "grad_norm": 0.8948972821235657,
+      "learning_rate": 0.003,
+      "loss": 3.9979,
+      "step": 20850
+    },
+    {
+      "epoch": 0.20851,
+      "grad_norm": 1.0538241863250732,
+      "learning_rate": 0.003,
+      "loss": 3.9869,
+      "step": 20851
+    },
+    {
+      "epoch": 0.20852,
+      "grad_norm": 0.944847047328949,
+      "learning_rate": 0.003,
+      "loss": 4.0094,
+      "step": 20852
+    },
+    {
+      "epoch": 0.20853,
+      "grad_norm": 0.7649935483932495,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 20853
+    },
+    {
+      "epoch": 0.20854,
+      "grad_norm": 0.7309855222702026,
+      "learning_rate": 0.003,
+      "loss": 4.015,
+      "step": 20854
+    },
+    {
+      "epoch": 0.20855,
+      "grad_norm": 0.839871883392334,
+      "learning_rate": 0.003,
+      "loss": 3.9925,
+      "step": 20855
+    },
+    {
+      "epoch": 0.20856,
+      "grad_norm": 1.0781958103179932,
+      "learning_rate": 0.003,
+      "loss": 3.9943,
+      "step": 20856
+    },
+    {
+      "epoch": 0.20857,
+      "grad_norm": 1.0436609983444214,
+      "learning_rate": 0.003,
+      "loss": 4.0162,
+      "step": 20857
+    },
+    {
+      "epoch": 0.20858,
+      "grad_norm": 1.0899194478988647,
+      "learning_rate": 0.003,
+      "loss": 4.0239,
+      "step": 20858
+    },
+    {
+      "epoch": 0.20859,
+      "grad_norm": 0.8802366256713867,
+      "learning_rate": 0.003,
+      "loss": 4.0386,
+      "step": 20859
+    },
+    {
+      "epoch": 0.2086,
+      "grad_norm": 0.8422168493270874,
+      "learning_rate": 0.003,
+      "loss": 4.0034,
+      "step": 20860
+    },
+    {
+      "epoch": 0.20861,
+      "grad_norm": 0.8001992106437683,
+      "learning_rate": 0.003,
+      "loss": 3.9995,
+      "step": 20861
+    },
+    {
+      "epoch": 0.20862,
+      "grad_norm": 0.7634277939796448,
+      "learning_rate": 0.003,
+      "loss": 3.9918,
+      "step": 20862
+    },
+    {
+      "epoch": 0.20863,
+      "grad_norm": 0.7780807018280029,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 20863
+    },
+    {
+      "epoch": 0.20864,
+      "grad_norm": 0.7725064754486084,
+      "learning_rate": 0.003,
+      "loss": 3.9746,
+      "step": 20864
+    },
+    {
+      "epoch": 0.20865,
+      "grad_norm": 0.8087694644927979,
+      "learning_rate": 0.003,
+      "loss": 3.9997,
+      "step": 20865
+    },
+    {
+      "epoch": 0.20866,
+      "grad_norm": 0.8541216850280762,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 20866
+    },
+    {
+      "epoch": 0.20867,
+      "grad_norm": 1.0422252416610718,
+      "learning_rate": 0.003,
+      "loss": 3.999,
+      "step": 20867
+    },
+    {
+      "epoch": 0.20868,
+      "grad_norm": 0.872579038143158,
+      "learning_rate": 0.003,
+      "loss": 3.992,
+      "step": 20868
+    },
+    {
+      "epoch": 0.20869,
+      "grad_norm": 0.9293691515922546,
+      "learning_rate": 0.003,
+      "loss": 4.0165,
+      "step": 20869
+    },
+    {
+      "epoch": 0.2087,
+      "grad_norm": 0.9818077087402344,
+      "learning_rate": 0.003,
+      "loss": 4.0047,
+      "step": 20870
+    },
+    {
+      "epoch": 0.20871,
+      "grad_norm": 0.9938639402389526,
+      "learning_rate": 0.003,
+      "loss": 4.0231,
+      "step": 20871
+    },
+    {
+      "epoch": 0.20872,
+      "grad_norm": 1.0270440578460693,
+      "learning_rate": 0.003,
+      "loss": 4.0358,
+      "step": 20872
+    },
+    {
+      "epoch": 0.20873,
+      "grad_norm": 1.033719539642334,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 20873
+    },
+    {
+      "epoch": 0.20874,
+      "grad_norm": 0.9151277542114258,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 20874
+    },
+    {
+      "epoch": 0.20875,
+      "grad_norm": 0.809447705745697,
+      "learning_rate": 0.003,
+      "loss": 3.9917,
+      "step": 20875
+    },
+    {
+      "epoch": 0.20876,
+      "grad_norm": 0.7976981997489929,
+      "learning_rate": 0.003,
+      "loss": 4.0194,
+      "step": 20876
+    },
+    {
+      "epoch": 0.20877,
+      "grad_norm": 0.7487211227416992,
+      "learning_rate": 0.003,
+      "loss": 3.9888,
+      "step": 20877
+    },
+    {
+      "epoch": 0.20878,
+      "grad_norm": 0.7258038520812988,
+      "learning_rate": 0.003,
+      "loss": 4.0141,
+      "step": 20878
+    },
+    {
+      "epoch": 0.20879,
+      "grad_norm": 0.656012237071991,
+      "learning_rate": 0.003,
+      "loss": 3.9976,
+      "step": 20879
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.5595047473907471,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 20880
+    },
+    {
+      "epoch": 0.20881,
+      "grad_norm": 0.6370016932487488,
+      "learning_rate": 0.003,
+      "loss": 4.0139,
+      "step": 20881
+    },
+    {
+      "epoch": 0.20882,
+      "grad_norm": 0.6755586266517639,
+      "learning_rate": 0.003,
+      "loss": 4.0138,
+      "step": 20882
+    },
+    {
+      "epoch": 0.20883,
+      "grad_norm": 0.6820712089538574,
+      "learning_rate": 0.003,
+      "loss": 4.0023,
+      "step": 20883
+    },
+    {
+      "epoch": 0.20884,
+      "grad_norm": 0.6668835878372192,
+      "learning_rate": 0.003,
+      "loss": 4.0265,
+      "step": 20884
+    },
+    {
+      "epoch": 0.20885,
+      "grad_norm": 0.7880050539970398,
+      "learning_rate": 0.003,
+      "loss": 3.9978,
+      "step": 20885
+    },
+    {
+      "epoch": 0.20886,
+      "grad_norm": 0.7801070213317871,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 20886
+    },
+    {
+      "epoch": 0.20887,
+      "grad_norm": 0.7368282079696655,
+      "learning_rate": 0.003,
+      "loss": 4.037,
+      "step": 20887
+    },
+    {
+      "epoch": 0.20888,
+      "grad_norm": 0.6302228569984436,
+      "learning_rate": 0.003,
+      "loss": 4.017,
+      "step": 20888
+    },
+    {
+      "epoch": 0.20889,
+      "grad_norm": 0.6666967272758484,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 20889
+    },
+    {
+      "epoch": 0.2089,
+      "grad_norm": 0.7467374205589294,
+      "learning_rate": 0.003,
+      "loss": 3.9705,
+      "step": 20890
+    },
+    {
+      "epoch": 0.20891,
+      "grad_norm": 0.8886362910270691,
+      "learning_rate": 0.003,
+      "loss": 3.9844,
+      "step": 20891
+    },
+    {
+      "epoch": 0.20892,
+      "grad_norm": 1.063424825668335,
+      "learning_rate": 0.003,
+      "loss": 4.0101,
+      "step": 20892
+    },
+    {
+      "epoch": 0.20893,
+      "grad_norm": 1.1250430345535278,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 20893
+    },
+    {
+      "epoch": 0.20894,
+      "grad_norm": 0.7578840255737305,
+      "learning_rate": 0.003,
+      "loss": 3.9536,
+      "step": 20894
+    },
+    {
+      "epoch": 0.20895,
+      "grad_norm": 0.6238217353820801,
+      "learning_rate": 0.003,
+      "loss": 4.0173,
+      "step": 20895
+    },
+    {
+      "epoch": 0.20896,
+      "grad_norm": 0.6875156164169312,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 20896
+    },
+    {
+      "epoch": 0.20897,
+      "grad_norm": 0.731509268283844,
+      "learning_rate": 0.003,
+      "loss": 3.9834,
+      "step": 20897
+    },
+    {
+      "epoch": 0.20898,
+      "grad_norm": 0.6633671522140503,
+      "learning_rate": 0.003,
+      "loss": 3.9876,
+      "step": 20898
+    },
+    {
+      "epoch": 0.20899,
+      "grad_norm": 0.6270951628684998,
+      "learning_rate": 0.003,
+      "loss": 4.0002,
+      "step": 20899
+    },
+    {
+      "epoch": 0.209,
+      "grad_norm": 0.7258926033973694,
+      "learning_rate": 0.003,
+      "loss": 3.9805,
+      "step": 20900
+    },
+    {
+      "epoch": 0.20901,
+      "grad_norm": 0.6589219570159912,
+      "learning_rate": 0.003,
+      "loss": 4.0199,
+      "step": 20901
+    },
+    {
+      "epoch": 0.20902,
+      "grad_norm": 0.7099064588546753,
+      "learning_rate": 0.003,
+      "loss": 3.9866,
+      "step": 20902
+    },
+    {
+      "epoch": 0.20903,
+      "grad_norm": 0.9100574254989624,
+      "learning_rate": 0.003,
+      "loss": 3.9694,
+      "step": 20903
+    },
+    {
+      "epoch": 0.20904,
+      "grad_norm": 1.1960123777389526,
+      "learning_rate": 0.003,
+      "loss": 3.9922,
+      "step": 20904
+    },
+    {
+      "epoch": 0.20905,
+      "grad_norm": 0.8406502604484558,
+      "learning_rate": 0.003,
+      "loss": 4.0342,
+      "step": 20905
+    },
+    {
+      "epoch": 0.20906,
+      "grad_norm": 0.8279140591621399,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 20906
+    },
+    {
+      "epoch": 0.20907,
+      "grad_norm": 0.8723680377006531,
+      "learning_rate": 0.003,
+      "loss": 4.008,
+      "step": 20907
+    },
+    {
+      "epoch": 0.20908,
+      "grad_norm": 0.9509908556938171,
+      "learning_rate": 0.003,
+      "loss": 3.9795,
+      "step": 20908
+    },
+    {
+      "epoch": 0.20909,
+      "grad_norm": 1.0323761701583862,
+      "learning_rate": 0.003,
+      "loss": 3.9861,
+      "step": 20909
+    },
+    {
+      "epoch": 0.2091,
+      "grad_norm": 0.9811918139457703,
+      "learning_rate": 0.003,
+      "loss": 4.0176,
+      "step": 20910
+    },
+    {
+      "epoch": 0.20911,
+      "grad_norm": 0.8328064680099487,
+      "learning_rate": 0.003,
+      "loss": 4.0114,
+      "step": 20911
+    },
+    {
+      "epoch": 0.20912,
+      "grad_norm": 0.8476513028144836,
+      "learning_rate": 0.003,
+      "loss": 3.9971,
+      "step": 20912
+    },
+    {
+      "epoch": 0.20913,
+      "grad_norm": 0.8520492315292358,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 20913
+    },
+    {
+      "epoch": 0.20914,
+      "grad_norm": 0.9105722904205322,
+      "learning_rate": 0.003,
+      "loss": 4.0041,
+      "step": 20914
+    },
+    {
+      "epoch": 0.20915,
+      "grad_norm": 0.8135495185852051,
+      "learning_rate": 0.003,
+      "loss": 4.0137,
+      "step": 20915
+    },
+    {
+      "epoch": 0.20916,
+      "grad_norm": 0.8004677295684814,
+      "learning_rate": 0.003,
+      "loss": 4.0332,
+      "step": 20916
+    },
+    {
+      "epoch": 0.20917,
+      "grad_norm": 0.8024815917015076,
+      "learning_rate": 0.003,
+      "loss": 3.9777,
+      "step": 20917
+    },
+    {
+      "epoch": 0.20918,
+      "grad_norm": 0.8938299417495728,
+      "learning_rate": 0.003,
+      "loss": 4.0037,
+      "step": 20918
+    },
+    {
+      "epoch": 0.20919,
+      "grad_norm": 1.1373519897460938,
+      "learning_rate": 0.003,
+      "loss": 4.0191,
+      "step": 20919
+    },
+    {
+      "epoch": 0.2092,
+      "grad_norm": 1.09964919090271,
+      "learning_rate": 0.003,
+      "loss": 4.0185,
+      "step": 20920
+    },
+    {
+      "epoch": 0.20921,
+      "grad_norm": 0.9594411849975586,
+      "learning_rate": 0.003,
+      "loss": 4.0079,
+      "step": 20921
+    },
+    {
+      "epoch": 0.20922,
+      "grad_norm": 1.1219273805618286,
+      "learning_rate": 0.003,
+      "loss": 3.9992,
+      "step": 20922
+    },
+    {
+      "epoch": 0.20923,
+      "grad_norm": 1.0501848459243774,
+      "learning_rate": 0.003,
+      "loss": 4.0384,
+      "step": 20923
+    },
+    {
+      "epoch": 0.20924,
+      "grad_norm": 0.8713542222976685,
+      "learning_rate": 0.003,
+      "loss": 3.9855,
+      "step": 20924
+    },
+    {
+      "epoch": 0.20925,
+      "grad_norm": 0.8268400430679321,
+      "learning_rate": 0.003,
+      "loss": 4.0238,
+      "step": 20925
+    },
+    {
+      "epoch": 0.20926,
+      "grad_norm": 0.9429669976234436,
+      "learning_rate": 0.003,
+      "loss": 4.0057,
+      "step": 20926
+    },
+    {
+      "epoch": 0.20927,
+      "grad_norm": 1.1184543371200562,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 20927
+    },
+    {
+      "epoch": 0.20928,
+      "grad_norm": 0.8756439089775085,
+      "learning_rate": 0.003,
+      "loss": 3.9786,
+      "step": 20928
+    },
+    {
+      "epoch": 0.20929,
+      "grad_norm": 0.7752197980880737,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 20929
+    },
+    {
+      "epoch": 0.2093,
+      "grad_norm": 0.7061675190925598,
+      "learning_rate": 0.003,
+      "loss": 3.9829,
+      "step": 20930
+    },
+    {
+      "epoch": 0.20931,
+      "grad_norm": 0.7310569286346436,
+      "learning_rate": 0.003,
+      "loss": 4.013,
+      "step": 20931
+    },
+    {
+      "epoch": 0.20932,
+      "grad_norm": 0.8134058713912964,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 20932
+    },
+    {
+      "epoch": 0.20933,
+      "grad_norm": 0.8310827016830444,
+      "learning_rate": 0.003,
+      "loss": 3.9776,
+      "step": 20933
+    },
+    {
+      "epoch": 0.20934,
+      "grad_norm": 0.8122318387031555,
+      "learning_rate": 0.003,
+      "loss": 4.0169,
+      "step": 20934
+    },
+    {
+      "epoch": 0.20935,
+      "grad_norm": 0.718316376209259,
+      "learning_rate": 0.003,
+      "loss": 4.0122,
+      "step": 20935
+    },
+    {
+      "epoch": 0.20936,
+      "grad_norm": 0.6134288311004639,
+      "learning_rate": 0.003,
+      "loss": 3.9572,
+      "step": 20936
+    },
+    {
+      "epoch": 0.20937,
+      "grad_norm": 0.564063549041748,
+      "learning_rate": 0.003,
+      "loss": 3.9877,
+      "step": 20937
+    },
+    {
+      "epoch": 0.20938,
+      "grad_norm": 0.6001213192939758,
+      "learning_rate": 0.003,
+      "loss": 3.9893,
+      "step": 20938
+    },
+    {
+      "epoch": 0.20939,
+      "grad_norm": 0.7452312707901001,
+      "learning_rate": 0.003,
+      "loss": 4.0006,
+      "step": 20939
+    },
+    {
+      "epoch": 0.2094,
+      "grad_norm": 0.7726470232009888,
+      "learning_rate": 0.003,
+      "loss": 4.0043,
+      "step": 20940
+    },
+    {
+      "epoch": 0.20941,
+      "grad_norm": 0.9299268126487732,
+      "learning_rate": 0.003,
+      "loss": 4.0087,
+      "step": 20941
+    },
+    {
+      "epoch": 0.20942,
+      "grad_norm": 1.3489012718200684,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20942
+    },
+    {
+      "epoch": 0.20943,
+      "grad_norm": 0.9157458543777466,
+      "learning_rate": 0.003,
+      "loss": 4.001,
+      "step": 20943
+    },
+    {
+      "epoch": 0.20944,
+      "grad_norm": 0.850161612033844,
+      "learning_rate": 0.003,
+      "loss": 3.9933,
+      "step": 20944
+    },
+    {
+      "epoch": 0.20945,
+      "grad_norm": 0.9274651408195496,
+      "learning_rate": 0.003,
+      "loss": 4.0186,
+      "step": 20945
+    },
+    {
+      "epoch": 0.20946,
+      "grad_norm": 0.9364386796951294,
+      "learning_rate": 0.003,
+      "loss": 4.0072,
+      "step": 20946
+    },
+    {
+      "epoch": 0.20947,
+      "grad_norm": 0.9944769144058228,
+      "learning_rate": 0.003,
+      "loss": 3.9851,
+      "step": 20947
+    },
+    {
+      "epoch": 0.20948,
+      "grad_norm": 0.9468872547149658,
+      "learning_rate": 0.003,
+      "loss": 3.9867,
+      "step": 20948
+    },
+    {
+      "epoch": 0.20949,
+      "grad_norm": 0.8152547478675842,
+      "learning_rate": 0.003,
+      "loss": 3.9939,
+      "step": 20949
+    },
+    {
+      "epoch": 0.2095,
+      "grad_norm": 0.8274593353271484,
+      "learning_rate": 0.003,
+      "loss": 4.0017,
+      "step": 20950
+    },
+    {
+      "epoch": 0.20951,
+      "grad_norm": 0.7041265368461609,
+      "learning_rate": 0.003,
+      "loss": 4.0066,
+      "step": 20951
+    },
+    {
+      "epoch": 0.20952,
+      "grad_norm": 0.6787787079811096,
+      "learning_rate": 0.003,
+      "loss": 4.0301,
+      "step": 20952
+    },
+    {
+      "epoch": 0.20953,
+      "grad_norm": 0.7629432678222656,
+      "learning_rate": 0.003,
+      "loss": 3.9708,
+      "step": 20953
+    },
+    {
+      "epoch": 0.20954,
+      "grad_norm": 0.751412034034729,
+      "learning_rate": 0.003,
+      "loss": 3.9644,
+      "step": 20954
+    },
+    {
+      "epoch": 0.20955,
+      "grad_norm": 0.7168709635734558,
+      "learning_rate": 0.003,
+      "loss": 3.9842,
+      "step": 20955
+    },
+    {
+      "epoch": 0.20956,
+      "grad_norm": 0.7899840474128723,
+      "learning_rate": 0.003,
+      "loss": 3.9915,
+      "step": 20956
+    },
+    {
+      "epoch": 0.20957,
+      "grad_norm": 0.8560346364974976,
+      "learning_rate": 0.003,
+      "loss": 3.9856,
+      "step": 20957
+    },
+    {
+      "epoch": 0.20958,
+      "grad_norm": 0.945913553237915,
+      "learning_rate": 0.003,
+      "loss": 4.0235,
+      "step": 20958
+    },
+    {
+      "epoch": 0.20959,
+      "grad_norm": 1.2707791328430176,
+      "learning_rate": 0.003,
+      "loss": 4.0053,
+      "step": 20959
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.9422053694725037,
+      "learning_rate": 0.003,
+      "loss": 3.99,
+      "step": 20960
+    },
+    {
+      "epoch": 0.20961,
+      "grad_norm": 0.9197450876235962,
+      "learning_rate": 0.003,
+      "loss": 3.9879,
+      "step": 20961
+    },
+    {
+      "epoch": 0.20962,
+      "grad_norm": 1.1179391145706177,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 20962
+    },
+    {
+      "epoch": 0.20963,
+      "grad_norm": 1.0478678941726685,
+      "learning_rate": 0.003,
+      "loss": 3.9892,
+      "step": 20963
+    },
+    {
+      "epoch": 0.20964,
+      "grad_norm": 0.9024834632873535,
+      "learning_rate": 0.003,
+      "loss": 4.0004,
+      "step": 20964
+    },
+    {
+      "epoch": 0.20965,
+      "grad_norm": 0.8023054599761963,
+      "learning_rate": 0.003,
+      "loss": 4.0031,
+      "step": 20965
+    },
+    {
+      "epoch": 0.20966,
+      "grad_norm": 0.8307892680168152,
+      "learning_rate": 0.003,
+      "loss": 3.9773,
+      "step": 20966
+    },
+    {
+      "epoch": 0.20967,
+      "grad_norm": 0.9591643214225769,
+      "learning_rate": 0.003,
+      "loss": 3.9924,
+      "step": 20967
+    },
+    {
+      "epoch": 0.20968,
+      "grad_norm": 1.2103545665740967,
+      "learning_rate": 0.003,
+      "loss": 4.0159,
+      "step": 20968
+    },
+    {
+      "epoch": 0.20969,
+      "grad_norm": 0.9418818354606628,
+      "learning_rate": 0.003,
+      "loss": 4.0029,
+      "step": 20969
+    },
+    {
+      "epoch": 0.2097,
+      "grad_norm": 0.6856959462165833,
+      "learning_rate": 0.003,
+      "loss": 3.9959,
+      "step": 20970
+    },
+    {
+      "epoch": 0.20971,
+      "grad_norm": 0.5946695804595947,
+      "learning_rate": 0.003,
+      "loss": 3.9947,
+      "step": 20971
+    },
+    {
+      "epoch": 0.20972,
+      "grad_norm": 0.7504004240036011,
+      "learning_rate": 0.003,
+      "loss": 4.0145,
+      "step": 20972
+    },
+    {
+      "epoch": 0.20973,
+      "grad_norm": 0.9070709347724915,
+      "learning_rate": 0.003,
+      "loss": 3.9819,
+      "step": 20973
+    },
+    {
+      "epoch": 0.20974,
+      "grad_norm": 1.0570318698883057,
+      "learning_rate": 0.003,
+      "loss": 4.0209,
+      "step": 20974
+    },
+    {
+      "epoch": 0.20975,
+      "grad_norm": 0.8538148403167725,
+      "learning_rate": 0.003,
+      "loss": 4.0266,
+      "step": 20975
+    },
+    {
+      "epoch": 0.20976,
+      "grad_norm": 0.6629658341407776,
+      "learning_rate": 0.003,
+      "loss": 3.979,
+      "step": 20976
+    },
+    {
+      "epoch": 0.20977,
+      "grad_norm": 0.6608455777168274,
+      "learning_rate": 0.003,
+      "loss": 4.0272,
+      "step": 20977
+    },
+    {
+      "epoch": 0.20978,
+      "grad_norm": 0.749142050743103,
+      "learning_rate": 0.003,
+      "loss": 3.9762,
+      "step": 20978
+    },
+    {
+      "epoch": 0.20979,
+      "grad_norm": 0.8641400337219238,
+      "learning_rate": 0.003,
+      "loss": 4.0396,
+      "step": 20979
+    },
+    {
+      "epoch": 0.2098,
+      "grad_norm": 0.871356725692749,
+      "learning_rate": 0.003,
+      "loss": 4.0016,
+      "step": 20980
+    },
+    {
+      "epoch": 0.20981,
+      "grad_norm": 0.7936810255050659,
+      "learning_rate": 0.003,
+      "loss": 3.97,
+      "step": 20981
+    },
+    {
+      "epoch": 0.20982,
+      "grad_norm": 0.7631054520606995,
+      "learning_rate": 0.003,
+      "loss": 4.0025,
+      "step": 20982
+    },
+    {
+      "epoch": 0.20983,
+      "grad_norm": 0.6764727830886841,
+      "learning_rate": 0.003,
+      "loss": 3.9878,
+      "step": 20983
+    },
+    {
+      "epoch": 0.20984,
+      "grad_norm": 0.7330240607261658,
+      "learning_rate": 0.003,
+      "loss": 3.9802,
+      "step": 20984
+    },
+    {
+      "epoch": 0.20985,
+      "grad_norm": 0.7792537212371826,
+      "learning_rate": 0.003,
+      "loss": 4.0196,
+      "step": 20985
+    },
+    {
+      "epoch": 0.20986,
+      "grad_norm": 0.8118487596511841,
+      "learning_rate": 0.003,
+      "loss": 4.0021,
+      "step": 20986
+    },
+    {
+      "epoch": 0.20987,
+      "grad_norm": 0.679811418056488,
+      "learning_rate": 0.003,
+      "loss": 4.0033,
+      "step": 20987
+    },
+    {
+      "epoch": 0.20988,
+      "grad_norm": 0.6592569947242737,
+      "learning_rate": 0.003,
+      "loss": 3.9837,
+      "step": 20988
+    },
+    {
+      "epoch": 0.20989,
+      "grad_norm": 0.6194579005241394,
+      "learning_rate": 0.003,
+      "loss": 3.9828,
+      "step": 20989
+    },
+    {
+      "epoch": 0.2099,
+      "grad_norm": 0.653477668762207,
+      "learning_rate": 0.003,
+      "loss": 3.9986,
+      "step": 20990
+    },
+    {
+      "epoch": 0.20991,
+      "grad_norm": 0.8853954672813416,
+      "learning_rate": 0.003,
+      "loss": 4.0116,
+      "step": 20991
+    },
+    {
+      "epoch": 0.20992,
+      "grad_norm": 1.1030426025390625,
+      "learning_rate": 0.003,
+      "loss": 4.0146,
+      "step": 20992
+    },
+    {
+      "epoch": 0.20993,
+      "grad_norm": 0.9298046827316284,
+      "learning_rate": 0.003,
+      "loss": 4.0512,
+      "step": 20993
+    },
+    {
+      "epoch": 0.20994,
+      "grad_norm": 0.9900102615356445,
+      "learning_rate": 0.003,
+      "loss": 4.0136,
+      "step": 20994
+    },
+    {
+      "epoch": 0.20995,
+      "grad_norm": 1.241927146911621,
+      "learning_rate": 0.003,
+      "loss": 3.9804,
+      "step": 20995
+    },
+    {
+      "epoch": 0.20996,
+      "grad_norm": 0.8189589381217957,
+      "learning_rate": 0.003,
+      "loss": 3.9988,
+      "step": 20996
+    },
+    {
+      "epoch": 0.20997,
+      "grad_norm": 0.6784688830375671,
+      "learning_rate": 0.003,
+      "loss": 3.9769,
+      "step": 20997
+    },
+    {
+      "epoch": 0.20998,
+      "grad_norm": 0.8254179954528809,
+      "learning_rate": 0.003,
+      "loss": 3.9882,
+      "step": 20998
+    },
+    {
+      "epoch": 0.20999,
+      "grad_norm": 1.0192513465881348,
+      "learning_rate": 0.003,
+      "loss": 4.0049,
+      "step": 20999
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.1719720363616943,
+      "learning_rate": 0.003,
+      "loss": 4.0413,
+      "step": 21000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 100000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.29425556832256e+17,
+  "train_batch_size": 256,
+  "trial_name": null,
+  "trial_params": null
+}