muhtasham
/

MiniCPM-V-2_6_lora_20240917_011727

PEFT

Safetensors

Generated from Trainer

Model card Files Files and versions Community

muhtasham committed on Sep 17, 2024

Commit

43282d5

verified ·

1 Parent(s): 0a469ca

Model save

Browse files

Files changed (2) hide show

README.md +67 -0
trainer_state.json +2862 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+base_model: openbmb/MiniCPM-V-2_6
+library_name: peft
+tags:
+- generated_from_trainer
+model-index:
+- name: MiniCPM-V-2_6_lora_20240917_011727
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# MiniCPM-V-2_6_lora_20240917_011727
+This model is a fine-tuned version of [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) on an unknown dataset.
+It achieves the following results on the evaluation set (measured at step 400, the checkpoint saved in this commit):
+- Loss: 1.3224
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 32
+- total_eval_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-08
+- lr_scheduler_type: cosine_with_restarts
+- lr_scheduler_warmup_ratio: 0.05
+- training_steps: 1000
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.3412        | 0.0349 | 100  | 1.3380          |
+| 1.3477        | 0.0698 | 200  | 1.3300          |
+| 1.3057        | 0.1047 | 300  | 1.3255          |
+| 1.3116        | 0.1396 | 400  | 1.3224          |
+### Framework versions
+- PEFT 0.12.0
+- Transformers 4.40.0
+- Pytorch 2.1.2+cu121
+- Tokenizers 0.19.1

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2862 @@

+{
+  "best_metric": 1.322394609451294,
+  "best_model_checkpoint": "output/output__lora/checkpoint-400",
+  "epoch": 0.139640425903299,
+  "eval_steps": 100,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00034910106475824753,
+      "grad_norm": 2.6783504486083984,
+      "learning_rate": 0.0,
+      "loss": 1.5271,
+      "step": 1
+    },
+    {
+      "epoch": 0.0006982021295164951,
+      "grad_norm": 1.3333820104599,
+      "learning_rate": 8.859191006777897e-06,
+      "loss": 1.3963,
+      "step": 2
+    },
+    {
+      "epoch": 0.0010473031942747426,
+      "grad_norm": 1.2807133197784424,
+      "learning_rate": 1.4041485532469073e-05,
+      "loss": 1.4192,
+      "step": 3
+    },
+    {
+      "epoch": 0.0013964042590329901,
+      "grad_norm": 1.1956514120101929,
+      "learning_rate": 1.7718382013555794e-05,
+      "loss": 1.5083,
+      "step": 4
+    },
+    {
+      "epoch": 0.0017455053237912376,
+      "grad_norm": 1.2733005285263062,
+      "learning_rate": 2.0570404496611053e-05,
+      "loss": 1.4963,
+      "step": 5
+    },
+    {
+      "epoch": 0.0020946063885494853,
+      "grad_norm": 0.8666600584983826,
+      "learning_rate": 2.2900676539246968e-05,
+      "loss": 1.5552,
+      "step": 6
+    },
+    {
+      "epoch": 0.0024437074533077328,
+      "grad_norm": 0.7445533275604248,
+      "learning_rate": 2.4870893478326387e-05,
+      "loss": 1.2858,
+      "step": 7
+    },
+    {
+      "epoch": 0.0027928085180659802,
+      "grad_norm": 0.8400186896324158,
+      "learning_rate": 2.6577573020333684e-05,
+      "loss": 1.3413,
+      "step": 8
+    },
+    {
+      "epoch": 0.0031419095828242277,
+      "grad_norm": 0.8454774618148804,
+      "learning_rate": 2.8082971064938146e-05,
+      "loss": 1.467,
+      "step": 9
+    },
+    {
+      "epoch": 0.003491010647582475,
+      "grad_norm": 0.8853550553321838,
+      "learning_rate": 2.9429595503388953e-05,
+      "loss": 1.4477,
+      "step": 10
+    },
+    {
+      "epoch": 0.0038401117123407227,
+      "grad_norm": 1.4953877925872803,
+      "learning_rate": 3.064776548439465e-05,
+      "loss": 1.4012,
+      "step": 11
+    },
+    {
+      "epoch": 0.0041892127770989706,
+      "grad_norm": 0.8356307148933411,
+      "learning_rate": 3.1759867546024865e-05,
+      "loss": 1.3855,
+      "step": 12
+    },
+    {
+      "epoch": 0.004538313841857218,
+      "grad_norm": 0.7591987252235413,
+      "learning_rate": 3.2782902272079295e-05,
+      "loss": 1.3561,
+      "step": 13
+    },
+    {
+      "epoch": 0.0048874149066154655,
+      "grad_norm": 0.9811077117919922,
+      "learning_rate": 3.373008448510428e-05,
+      "loss": 1.3175,
+      "step": 14
+    },
+    {
+      "epoch": 0.005236515971373713,
+      "grad_norm": 0.8403587341308594,
+      "learning_rate": 3.4611890029080124e-05,
+      "loss": 1.341,
+      "step": 15
+    },
+    {
+      "epoch": 0.0055856170361319605,
+      "grad_norm": 0.750234067440033,
+      "learning_rate": 3.543676402711159e-05,
+      "loss": 1.4247,
+      "step": 16
+    },
+    {
+      "epoch": 0.005934718100890208,
+      "grad_norm": 0.7567417621612549,
+      "learning_rate": 3.621161404374383e-05,
+      "loss": 1.416,
+      "step": 17
+    },
+    {
+      "epoch": 0.006283819165648455,
+      "grad_norm": 0.7126427292823792,
+      "learning_rate": 3.694216207171603e-05,
+      "loss": 1.4426,
+      "step": 18
+    },
+    {
+      "epoch": 0.006632920230406703,
+      "grad_norm": 0.7808831930160522,
+      "learning_rate": 3.76332012245438e-05,
+      "loss": 1.4287,
+      "step": 19
+    },
+    {
+      "epoch": 0.00698202129516495,
+      "grad_norm": 0.6165328025817871,
+      "learning_rate": 3.8288786510166846e-05,
+      "loss": 1.3391,
+      "step": 20
+    },
+    {
+      "epoch": 0.007331122359923198,
+      "grad_norm": 0.7212307453155518,
+      "learning_rate": 3.8912379010795455e-05,
+      "loss": 1.3375,
+      "step": 21
+    },
+    {
+      "epoch": 0.007680223424681445,
+      "grad_norm": 0.6797880530357361,
+      "learning_rate": 3.9506956491172545e-05,
+      "loss": 1.2713,
+      "step": 22
+    },
+    {
+      "epoch": 0.008029324489439693,
+      "grad_norm": 0.7757507562637329,
+      "learning_rate": 4.007509939970292e-05,
+      "loss": 1.3599,
+      "step": 23
+    },
+    {
+      "epoch": 0.008378425554197941,
+      "grad_norm": 0.539090096950531,
+      "learning_rate": 4.061905855280276e-05,
+      "loss": 1.5154,
+      "step": 24
+    },
+    {
+      "epoch": 0.008727526618956188,
+      "grad_norm": 0.652180552482605,
+      "learning_rate": 4.1140808993222106e-05,
+      "loss": 1.3438,
+      "step": 25
+    },
+    {
+      "epoch": 0.009076627683714436,
+      "grad_norm": 0.7319611310958862,
+      "learning_rate": 4.164209327885719e-05,
+      "loss": 1.5033,
+      "step": 26
+    },
+    {
+      "epoch": 0.009425728748472683,
+      "grad_norm": 0.702570378780365,
+      "learning_rate": 4.2124456597407214e-05,
+      "loss": 1.2238,
+      "step": 27
+    },
+    {
+      "epoch": 0.009774829813230931,
+      "grad_norm": 0.6835883855819702,
+      "learning_rate": 4.258927549188218e-05,
+      "loss": 1.3648,
+      "step": 28
+    },
+    {
+      "epoch": 0.010123930877989178,
+      "grad_norm": 0.6773353219032288,
+      "learning_rate": 4.303778154313212e-05,
+      "loss": 1.3074,
+      "step": 29
+    },
+    {
+      "epoch": 0.010473031942747426,
+      "grad_norm": 0.6387542486190796,
+      "learning_rate": 4.347108103585803e-05,
+      "loss": 1.2265,
+      "step": 30
+    },
+    {
+      "epoch": 0.010822133007505673,
+      "grad_norm": 0.6249099969863892,
+      "learning_rate": 4.389017139879164e-05,
+      "loss": 1.3321,
+      "step": 31
+    },
+    {
+      "epoch": 0.011171234072263921,
+      "grad_norm": 0.7121676802635193,
+      "learning_rate": 4.429595503388948e-05,
+      "loss": 1.3729,
+      "step": 32
+    },
+    {
+      "epoch": 0.011520335137022168,
+      "grad_norm": 0.7367205619812012,
+      "learning_rate": 4.468925101686371e-05,
+      "loss": 1.3937,
+      "step": 33
+    },
+    {
+      "epoch": 0.011869436201780416,
+      "grad_norm": 0.6183043718338013,
+      "learning_rate": 4.507080505052173e-05,
+      "loss": 1.4321,
+      "step": 34
+    },
+    {
+      "epoch": 0.012218537266538662,
+      "grad_norm": 1.1439142227172852,
+      "learning_rate": 4.544129797493744e-05,
+      "loss": 1.3515,
+      "step": 35
+    },
+    {
+      "epoch": 0.01256763833129691,
+      "grad_norm": 0.7980801463127136,
+      "learning_rate": 4.5801353078493936e-05,
+      "loss": 1.3929,
+      "step": 36
+    },
+    {
+      "epoch": 0.012916739396055157,
+      "grad_norm": 0.8890343904495239,
+      "learning_rate": 4.615154240700883e-05,
+      "loss": 1.2895,
+      "step": 37
+    },
+    {
+      "epoch": 0.013265840460813406,
+      "grad_norm": 0.7107703685760498,
+      "learning_rate": 4.6492392231321696e-05,
+      "loss": 1.3054,
+      "step": 38
+    },
+    {
+      "epoch": 0.013614941525571652,
+      "grad_norm": 0.605403482913971,
+      "learning_rate": 4.682438780454837e-05,
+      "loss": 1.3817,
+      "step": 39
+    },
+    {
+      "epoch": 0.0139640425903299,
+      "grad_norm": 0.6489142775535583,
+      "learning_rate": 4.714797751694474e-05,
+      "loss": 1.4109,
+      "step": 40
+    },
+    {
+      "epoch": 0.014313143655088147,
+      "grad_norm": 0.5896831750869751,
+      "learning_rate": 4.7463576537657414e-05,
+      "loss": 1.3383,
+      "step": 41
+    },
+    {
+      "epoch": 0.014662244719846396,
+      "grad_norm": 0.8319935202598572,
+      "learning_rate": 4.777157001757336e-05,
+      "loss": 1.4239,
+      "step": 42
+    },
+    {
+      "epoch": 0.015011345784604642,
+      "grad_norm": 0.6128418445587158,
+      "learning_rate": 4.8072315915252694e-05,
+      "loss": 1.3541,
+      "step": 43
+    },
+    {
+      "epoch": 0.01536044684936289,
+      "grad_norm": 0.6820589900016785,
+      "learning_rate": 4.8366147497950435e-05,
+      "loss": 1.2663,
+      "step": 44
+    },
+    {
+      "epoch": 0.015709547914121137,
+      "grad_norm": 0.8375743627548218,
+      "learning_rate": 4.8653375561549195e-05,
+      "loss": 1.3803,
+      "step": 45
+    },
+    {
+      "epoch": 0.016058648978879386,
+      "grad_norm": 0.6585806608200073,
+      "learning_rate": 4.8934290406480814e-05,
+      "loss": 1.3143,
+      "step": 46
+    },
+    {
+      "epoch": 0.016407750043637634,
+      "grad_norm": 0.7528412342071533,
+      "learning_rate": 4.920916360113129e-05,
+      "loss": 1.293,
+      "step": 47
+    },
+    {
+      "epoch": 0.016756851108395882,
+      "grad_norm": 0.6918306946754456,
+      "learning_rate": 4.947824955958066e-05,
+      "loss": 1.4991,
+      "step": 48
+    },
+    {
+      "epoch": 0.017105952173154127,
+      "grad_norm": 0.6764557361602783,
+      "learning_rate": 4.9741786956652774e-05,
+      "loss": 1.2755,
+      "step": 49
+    },
+    {
+      "epoch": 0.017455053237912375,
+      "grad_norm": 0.6525936722755432,
+      "learning_rate": 5e-05,
+      "loss": 1.3897,
+      "step": 50
+    },
+    {
+      "epoch": 0.017804154302670624,
+      "grad_norm": 0.627804160118103,
+      "learning_rate": 5e-05,
+      "loss": 1.3027,
+      "step": 51
+    },
+    {
+      "epoch": 0.018153255367428872,
+      "grad_norm": 0.8060218095779419,
+      "learning_rate": 5e-05,
+      "loss": 1.3477,
+      "step": 52
+    },
+    {
+      "epoch": 0.018502356432187117,
+      "grad_norm": 0.6655098795890808,
+      "learning_rate": 5e-05,
+      "loss": 1.3631,
+      "step": 53
+    },
+    {
+      "epoch": 0.018851457496945365,
+      "grad_norm": 0.7165637016296387,
+      "learning_rate": 5e-05,
+      "loss": 1.347,
+      "step": 54
+    },
+    {
+      "epoch": 0.019200558561703614,
+      "grad_norm": 0.6562020778656006,
+      "learning_rate": 5e-05,
+      "loss": 1.3535,
+      "step": 55
+    },
+    {
+      "epoch": 0.019549659626461862,
+      "grad_norm": 0.7588657736778259,
+      "learning_rate": 5e-05,
+      "loss": 1.3291,
+      "step": 56
+    },
+    {
+      "epoch": 0.019898760691220107,
+      "grad_norm": 0.6295105814933777,
+      "learning_rate": 5e-05,
+      "loss": 1.3542,
+      "step": 57
+    },
+    {
+      "epoch": 0.020247861755978355,
+      "grad_norm": 1.339097023010254,
+      "learning_rate": 5e-05,
+      "loss": 1.3649,
+      "step": 58
+    },
+    {
+      "epoch": 0.020596962820736604,
+      "grad_norm": 0.6976660490036011,
+      "learning_rate": 5e-05,
+      "loss": 1.2852,
+      "step": 59
+    },
+    {
+      "epoch": 0.020946063885494852,
+      "grad_norm": 0.7590420246124268,
+      "learning_rate": 5e-05,
+      "loss": 1.354,
+      "step": 60
+    },
+    {
+      "epoch": 0.021295164950253097,
+      "grad_norm": 0.6279817819595337,
+      "learning_rate": 5e-05,
+      "loss": 1.2537,
+      "step": 61
+    },
+    {
+      "epoch": 0.021644266015011345,
+      "grad_norm": 0.6099221110343933,
+      "learning_rate": 5e-05,
+      "loss": 1.2423,
+      "step": 62
+    },
+    {
+      "epoch": 0.021993367079769593,
+      "grad_norm": 0.6252647638320923,
+      "learning_rate": 5e-05,
+      "loss": 1.3667,
+      "step": 63
+    },
+    {
+      "epoch": 0.022342468144527842,
+      "grad_norm": 0.8939846158027649,
+      "learning_rate": 5e-05,
+      "loss": 1.2889,
+      "step": 64
+    },
+    {
+      "epoch": 0.022691569209286087,
+      "grad_norm": 0.85840904712677,
+      "learning_rate": 5e-05,
+      "loss": 1.3747,
+      "step": 65
+    },
+    {
+      "epoch": 0.023040670274044335,
+      "grad_norm": 0.8478113412857056,
+      "learning_rate": 5e-05,
+      "loss": 1.3417,
+      "step": 66
+    },
+    {
+      "epoch": 0.023389771338802583,
+      "grad_norm": 0.6869573593139648,
+      "learning_rate": 5e-05,
+      "loss": 1.4033,
+      "step": 67
+    },
+    {
+      "epoch": 0.02373887240356083,
+      "grad_norm": 0.6566379070281982,
+      "learning_rate": 5e-05,
+      "loss": 1.3617,
+      "step": 68
+    },
+    {
+      "epoch": 0.02408797346831908,
+      "grad_norm": 0.6871697306632996,
+      "learning_rate": 5e-05,
+      "loss": 1.2932,
+      "step": 69
+    },
+    {
+      "epoch": 0.024437074533077325,
+      "grad_norm": 0.7102701663970947,
+      "learning_rate": 5e-05,
+      "loss": 1.4062,
+      "step": 70
+    },
+    {
+      "epoch": 0.024786175597835573,
+      "grad_norm": 0.8392966985702515,
+      "learning_rate": 5e-05,
+      "loss": 1.1992,
+      "step": 71
+    },
+    {
+      "epoch": 0.02513527666259382,
+      "grad_norm": 0.670971155166626,
+      "learning_rate": 5e-05,
+      "loss": 1.4131,
+      "step": 72
+    },
+    {
+      "epoch": 0.02548437772735207,
+      "grad_norm": 0.7271628975868225,
+      "learning_rate": 5e-05,
+      "loss": 1.2928,
+      "step": 73
+    },
+    {
+      "epoch": 0.025833478792110315,
+      "grad_norm": 0.7184221744537354,
+      "learning_rate": 5e-05,
+      "loss": 1.2239,
+      "step": 74
+    },
+    {
+      "epoch": 0.026182579856868563,
+      "grad_norm": 0.5685485005378723,
+      "learning_rate": 5e-05,
+      "loss": 1.2692,
+      "step": 75
+    },
+    {
+      "epoch": 0.02653168092162681,
+      "grad_norm": 0.5677881836891174,
+      "learning_rate": 5e-05,
+      "loss": 1.2951,
+      "step": 76
+    },
+    {
+      "epoch": 0.02688078198638506,
+      "grad_norm": 0.6896436810493469,
+      "learning_rate": 5e-05,
+      "loss": 1.3297,
+      "step": 77
+    },
+    {
+      "epoch": 0.027229883051143305,
+      "grad_norm": 0.6284964084625244,
+      "learning_rate": 5e-05,
+      "loss": 1.2402,
+      "step": 78
+    },
+    {
+      "epoch": 0.027578984115901553,
+      "grad_norm": 0.618015468120575,
+      "learning_rate": 5e-05,
+      "loss": 1.2999,
+      "step": 79
+    },
+    {
+      "epoch": 0.0279280851806598,
+      "grad_norm": 0.7585094571113586,
+      "learning_rate": 5e-05,
+      "loss": 1.3378,
+      "step": 80
+    },
+    {
+      "epoch": 0.02827718624541805,
+      "grad_norm": 0.6674929857254028,
+      "learning_rate": 5e-05,
+      "loss": 1.3585,
+      "step": 81
+    },
+    {
+      "epoch": 0.028626287310176295,
+      "grad_norm": 0.583121120929718,
+      "learning_rate": 5e-05,
+      "loss": 1.3236,
+      "step": 82
+    },
+    {
+      "epoch": 0.028975388374934543,
+      "grad_norm": 0.661668062210083,
+      "learning_rate": 5e-05,
+      "loss": 1.3264,
+      "step": 83
+    },
+    {
+      "epoch": 0.02932448943969279,
+      "grad_norm": 0.8168457746505737,
+      "learning_rate": 5e-05,
+      "loss": 1.3132,
+      "step": 84
+    },
+    {
+      "epoch": 0.02967359050445104,
+      "grad_norm": 0.6123843193054199,
+      "learning_rate": 5e-05,
+      "loss": 1.3224,
+      "step": 85
+    },
+    {
+      "epoch": 0.030022691569209285,
+      "grad_norm": 0.7081793546676636,
+      "learning_rate": 5e-05,
+      "loss": 1.3641,
+      "step": 86
+    },
+    {
+      "epoch": 0.030371792633967533,
+      "grad_norm": 0.7772612571716309,
+      "learning_rate": 5e-05,
+      "loss": 1.3634,
+      "step": 87
+    },
+    {
+      "epoch": 0.03072089369872578,
+      "grad_norm": 0.603370726108551,
+      "learning_rate": 5e-05,
+      "loss": 1.4486,
+      "step": 88
+    },
+    {
+      "epoch": 0.03106999476348403,
+      "grad_norm": 0.6567598581314087,
+      "learning_rate": 5e-05,
+      "loss": 1.4228,
+      "step": 89
+    },
+    {
+      "epoch": 0.031419095828242274,
+      "grad_norm": 0.6245101690292358,
+      "learning_rate": 5e-05,
+      "loss": 1.2928,
+      "step": 90
+    },
+    {
+      "epoch": 0.031768196893000526,
+      "grad_norm": 0.7198782563209534,
+      "learning_rate": 5e-05,
+      "loss": 1.3304,
+      "step": 91
+    },
+    {
+      "epoch": 0.03211729795775877,
+      "grad_norm": 0.526452898979187,
+      "learning_rate": 5e-05,
+      "loss": 1.3418,
+      "step": 92
+    },
+    {
+      "epoch": 0.032466399022517016,
+      "grad_norm": 0.7534317374229431,
+      "learning_rate": 5e-05,
+      "loss": 1.333,
+      "step": 93
+    },
+    {
+      "epoch": 0.03281550008727527,
+      "grad_norm": 0.5721869468688965,
+      "learning_rate": 5e-05,
+      "loss": 1.1849,
+      "step": 94
+    },
+    {
+      "epoch": 0.03316460115203351,
+      "grad_norm": 0.6943261027336121,
+      "learning_rate": 5e-05,
+      "loss": 1.3263,
+      "step": 95
+    },
+    {
+      "epoch": 0.033513702216791764,
+      "grad_norm": 0.5904171466827393,
+      "learning_rate": 5e-05,
+      "loss": 1.3103,
+      "step": 96
+    },
+    {
+      "epoch": 0.03386280328155001,
+      "grad_norm": 0.7743117809295654,
+      "learning_rate": 5e-05,
+      "loss": 1.3633,
+      "step": 97
+    },
+    {
+      "epoch": 0.034211904346308254,
+      "grad_norm": 1.298839807510376,
+      "learning_rate": 5e-05,
+      "loss": 1.335,
+      "step": 98
+    },
+    {
+      "epoch": 0.034561005411066506,
+      "grad_norm": 0.7134571671485901,
+      "learning_rate": 5e-05,
+      "loss": 1.4154,
+      "step": 99
+    },
+    {
+      "epoch": 0.03491010647582475,
+      "grad_norm": 0.6801385879516602,
+      "learning_rate": 5e-05,
+      "loss": 1.3412,
+      "step": 100
+    },
+    {
+      "epoch": 0.03491010647582475,
+      "eval_loss": 1.337953805923462,
+      "eval_runtime": 3305.6905,
+      "eval_samples_per_second": 6.932,
+      "eval_steps_per_second": 0.867,
+      "step": 100
+    },
+    {
+      "epoch": 0.035259207540582996,
+      "grad_norm": 1.0192288160324097,
+      "learning_rate": 5e-05,
+      "loss": 1.2821,
+      "step": 101
+    },
+    {
+      "epoch": 0.03560830860534125,
+      "grad_norm": 0.6322550773620605,
+      "learning_rate": 5e-05,
+      "loss": 1.3561,
+      "step": 102
+    },
+    {
+      "epoch": 0.03595740967009949,
+      "grad_norm": 0.6499407291412354,
+      "learning_rate": 5e-05,
+      "loss": 1.3164,
+      "step": 103
+    },
+    {
+      "epoch": 0.036306510734857744,
+      "grad_norm": 0.7576645612716675,
+      "learning_rate": 5e-05,
+      "loss": 1.2924,
+      "step": 104
+    },
+    {
+      "epoch": 0.03665561179961599,
+      "grad_norm": 0.6215568780899048,
+      "learning_rate": 5e-05,
+      "loss": 1.2551,
+      "step": 105
+    },
+    {
+      "epoch": 0.037004712864374234,
+      "grad_norm": 0.6197790503501892,
+      "learning_rate": 5e-05,
+      "loss": 1.317,
+      "step": 106
+    },
+    {
+      "epoch": 0.037353813929132486,
+      "grad_norm": 0.677772045135498,
+      "learning_rate": 5e-05,
+      "loss": 1.428,
+      "step": 107
+    },
+    {
+      "epoch": 0.03770291499389073,
+      "grad_norm": 0.6386198401451111,
+      "learning_rate": 5e-05,
+      "loss": 1.4206,
+      "step": 108
+    },
+    {
+      "epoch": 0.038052016058648976,
+      "grad_norm": 1.113053798675537,
+      "learning_rate": 5e-05,
+      "loss": 1.3992,
+      "step": 109
+    },
+    {
+      "epoch": 0.03840111712340723,
+      "grad_norm": 0.668409526348114,
+      "learning_rate": 5e-05,
+      "loss": 1.3358,
+      "step": 110
+    },
+    {
+      "epoch": 0.03875021818816547,
+      "grad_norm": 0.6381022930145264,
+      "learning_rate": 5e-05,
+      "loss": 1.245,
+      "step": 111
+    },
+    {
+      "epoch": 0.039099319252923724,
+      "grad_norm": 0.7082274556159973,
+      "learning_rate": 5e-05,
+      "loss": 1.3107,
+      "step": 112
+    },
+    {
+      "epoch": 0.03944842031768197,
+      "grad_norm": 0.6497403979301453,
+      "learning_rate": 5e-05,
+      "loss": 1.3174,
+      "step": 113
+    },
+    {
+      "epoch": 0.039797521382440214,
+      "grad_norm": 0.7390655279159546,
+      "learning_rate": 5e-05,
+      "loss": 1.2791,
+      "step": 114
+    },
+    {
+      "epoch": 0.040146622447198466,
+      "grad_norm": 0.6828505992889404,
+      "learning_rate": 5e-05,
+      "loss": 1.3903,
+      "step": 115
+    },
+    {
+      "epoch": 0.04049572351195671,
+      "grad_norm": 0.6913119554519653,
+      "learning_rate": 5e-05,
+      "loss": 1.3147,
+      "step": 116
+    },
+    {
+      "epoch": 0.04084482457671496,
+      "grad_norm": 0.6394439339637756,
+      "learning_rate": 5e-05,
+      "loss": 1.3308,
+      "step": 117
+    },
+    {
+      "epoch": 0.04119392564147321,
+      "grad_norm": 0.6368663907051086,
+      "learning_rate": 5e-05,
+      "loss": 1.3021,
+      "step": 118
+    },
+    {
+      "epoch": 0.04154302670623145,
+      "grad_norm": 0.625417947769165,
+      "learning_rate": 5e-05,
+      "loss": 1.4122,
+      "step": 119
+    },
+    {
+      "epoch": 0.041892127770989704,
+      "grad_norm": 0.5640509724617004,
+      "learning_rate": 5e-05,
+      "loss": 1.3216,
+      "step": 120
+    },
+    {
+      "epoch": 0.04224122883574795,
+      "grad_norm": 0.6355682611465454,
+      "learning_rate": 5e-05,
+      "loss": 1.2522,
+      "step": 121
+    },
+    {
+      "epoch": 0.042590329900506194,
+      "grad_norm": 2.130183696746826,
+      "learning_rate": 5e-05,
+      "loss": 1.398,
+      "step": 122
+    },
+    {
+      "epoch": 0.042939430965264445,
+      "grad_norm": 0.7858290672302246,
+      "learning_rate": 5e-05,
+      "loss": 1.3543,
+      "step": 123
+    },
+    {
+      "epoch": 0.04328853203002269,
+      "grad_norm": 0.6912608742713928,
+      "learning_rate": 5e-05,
+      "loss": 1.3338,
+      "step": 124
+    },
+    {
+      "epoch": 0.04363763309478094,
+      "grad_norm": 0.6326834559440613,
+      "learning_rate": 5e-05,
+      "loss": 1.2968,
+      "step": 125
+    },
+    {
+      "epoch": 0.04398673415953919,
+      "grad_norm": 0.6076151728630066,
+      "learning_rate": 5e-05,
+      "loss": 1.2705,
+      "step": 126
+    },
+    {
+      "epoch": 0.04433583522429743,
+      "grad_norm": 0.767652153968811,
+      "learning_rate": 5e-05,
+      "loss": 1.3601,
+      "step": 127
+    },
+    {
+      "epoch": 0.044684936289055684,
+      "grad_norm": 0.621769905090332,
+      "learning_rate": 5e-05,
+      "loss": 1.2834,
+      "step": 128
+    },
+    {
+      "epoch": 0.04503403735381393,
+      "grad_norm": 0.6216384768486023,
+      "learning_rate": 5e-05,
+      "loss": 1.3322,
+      "step": 129
+    },
+    {
+      "epoch": 0.04538313841857217,
+      "grad_norm": 0.626325249671936,
+      "learning_rate": 5e-05,
+      "loss": 1.4601,
+      "step": 130
+    },
+    {
+      "epoch": 0.045732239483330425,
+      "grad_norm": 0.8063498735427856,
+      "learning_rate": 5e-05,
+      "loss": 1.293,
+      "step": 131
+    },
+    {
+      "epoch": 0.04608134054808867,
+      "grad_norm": 1.117038369178772,
+      "learning_rate": 5e-05,
+      "loss": 1.3635,
+      "step": 132
+    },
+    {
+      "epoch": 0.04643044161284692,
+      "grad_norm": 1.4540647268295288,
+      "learning_rate": 5e-05,
+      "loss": 1.3346,
+      "step": 133
+    },
+    {
+      "epoch": 0.04677954267760517,
+      "grad_norm": 0.6695774793624878,
+      "learning_rate": 5e-05,
+      "loss": 1.4109,
+      "step": 134
+    },
+    {
+      "epoch": 0.04712864374236341,
+      "grad_norm": 0.8146533370018005,
+      "learning_rate": 5e-05,
+      "loss": 1.3515,
+      "step": 135
+    },
+    {
+      "epoch": 0.04747774480712166,
+      "grad_norm": 0.6705998778343201,
+      "learning_rate": 5e-05,
+      "loss": 1.2752,
+      "step": 136
+    },
+    {
+      "epoch": 0.04782684587187991,
+      "grad_norm": 0.7589219808578491,
+      "learning_rate": 5e-05,
+      "loss": 1.4393,
+      "step": 137
+    },
+    {
+      "epoch": 0.04817594693663816,
+      "grad_norm": 0.9603825807571411,
+      "learning_rate": 5e-05,
+      "loss": 1.4609,
+      "step": 138
+    },
+    {
+      "epoch": 0.048525048001396405,
+      "grad_norm": 0.6351510286331177,
+      "learning_rate": 5e-05,
+      "loss": 1.371,
+      "step": 139
+    },
+    {
+      "epoch": 0.04887414906615465,
+      "grad_norm": 0.5652881860733032,
+      "learning_rate": 5e-05,
+      "loss": 1.2845,
+      "step": 140
+    },
+    {
+      "epoch": 0.0492232501309129,
+      "grad_norm": 0.7579118609428406,
+      "learning_rate": 5e-05,
+      "loss": 1.2526,
+      "step": 141
+    },
+    {
+      "epoch": 0.04957235119567115,
+      "grad_norm": 0.7851598262786865,
+      "learning_rate": 5e-05,
+      "loss": 1.3379,
+      "step": 142
+    },
+    {
+      "epoch": 0.04992145226042939,
+      "grad_norm": 0.5865357518196106,
+      "learning_rate": 5e-05,
+      "loss": 1.4802,
+      "step": 143
+    },
+    {
+      "epoch": 0.05027055332518764,
+      "grad_norm": 1.3862611055374146,
+      "learning_rate": 5e-05,
+      "loss": 1.357,
+      "step": 144
+    },
+    {
+      "epoch": 0.05061965438994589,
+      "grad_norm": 0.6249399185180664,
+      "learning_rate": 5e-05,
+      "loss": 1.2587,
+      "step": 145
+    },
+    {
+      "epoch": 0.05096875545470414,
+      "grad_norm": 0.5966644883155823,
+      "learning_rate": 5e-05,
+      "loss": 1.3534,
+      "step": 146
+    },
+    {
+      "epoch": 0.051317856519462385,
+      "grad_norm": 0.6312971711158752,
+      "learning_rate": 5e-05,
+      "loss": 1.1815,
+      "step": 147
+    },
+    {
+      "epoch": 0.05166695758422063,
+      "grad_norm": 0.6539703011512756,
+      "learning_rate": 5e-05,
+      "loss": 1.3946,
+      "step": 148
+    },
+    {
+      "epoch": 0.05201605864897888,
+      "grad_norm": 0.8756076097488403,
+      "learning_rate": 5e-05,
+      "loss": 1.2384,
+      "step": 149
+    },
+    {
+      "epoch": 0.052365159713737126,
+      "grad_norm": 0.7149311304092407,
+      "learning_rate": 5e-05,
+      "loss": 1.2998,
+      "step": 150
+    },
+    {
+      "epoch": 0.05271426077849537,
+      "grad_norm": 0.79525226354599,
+      "learning_rate": 5e-05,
+      "loss": 1.3376,
+      "step": 151
+    },
+    {
+      "epoch": 0.05306336184325362,
+      "grad_norm": 0.6921191811561584,
+      "learning_rate": 5e-05,
+      "loss": 1.3461,
+      "step": 152
+    },
+    {
+      "epoch": 0.05341246290801187,
+      "grad_norm": 0.7444896697998047,
+      "learning_rate": 5e-05,
+      "loss": 1.4089,
+      "step": 153
+    },
+    {
+      "epoch": 0.05376156397277012,
+      "grad_norm": 0.6216670274734497,
+      "learning_rate": 5e-05,
+      "loss": 1.3402,
+      "step": 154
+    },
+    {
+      "epoch": 0.054110665037528365,
+      "grad_norm": 0.5917710661888123,
+      "learning_rate": 5e-05,
+      "loss": 1.3253,
+      "step": 155
+    },
+    {
+      "epoch": 0.05445976610228661,
+      "grad_norm": 0.8648408055305481,
+      "learning_rate": 5e-05,
+      "loss": 1.4447,
+      "step": 156
+    },
+    {
+      "epoch": 0.05480886716704486,
+      "grad_norm": 0.6752570271492004,
+      "learning_rate": 5e-05,
+      "loss": 1.3097,
+      "step": 157
+    },
+    {
+      "epoch": 0.055157968231803106,
+      "grad_norm": 0.5603750944137573,
+      "learning_rate": 5e-05,
+      "loss": 1.4177,
+      "step": 158
+    },
+    {
+      "epoch": 0.05550706929656136,
+      "grad_norm": 0.6317929029464722,
+      "learning_rate": 5e-05,
+      "loss": 1.3509,
+      "step": 159
+    },
+    {
+      "epoch": 0.0558561703613196,
+      "grad_norm": 0.6017687320709229,
+      "learning_rate": 5e-05,
+      "loss": 1.3471,
+      "step": 160
+    },
+    {
+      "epoch": 0.05620527142607785,
+      "grad_norm": 0.6761009693145752,
+      "learning_rate": 5e-05,
+      "loss": 1.4473,
+      "step": 161
+    },
+    {
+      "epoch": 0.0565543724908361,
+      "grad_norm": 0.7266319990158081,
+      "learning_rate": 5e-05,
+      "loss": 1.2896,
+      "step": 162
+    },
+    {
+      "epoch": 0.056903473555594344,
+      "grad_norm": 0.6436321139335632,
+      "learning_rate": 5e-05,
+      "loss": 1.2812,
+      "step": 163
+    },
+    {
+      "epoch": 0.05725257462035259,
+      "grad_norm": 0.9664864540100098,
+      "learning_rate": 5e-05,
+      "loss": 1.294,
+      "step": 164
+    },
+    {
+      "epoch": 0.05760167568511084,
+      "grad_norm": 0.6690096855163574,
+      "learning_rate": 5e-05,
+      "loss": 1.2801,
+      "step": 165
+    },
+    {
+      "epoch": 0.057950776749869086,
+      "grad_norm": 0.6227753162384033,
+      "learning_rate": 5e-05,
+      "loss": 1.3384,
+      "step": 166
+    },
+    {
+      "epoch": 0.05829987781462734,
+      "grad_norm": 0.7900117039680481,
+      "learning_rate": 5e-05,
+      "loss": 1.3424,
+      "step": 167
+    },
+    {
+      "epoch": 0.05864897887938558,
+      "grad_norm": 0.6928064823150635,
+      "learning_rate": 5e-05,
+      "loss": 1.296,
+      "step": 168
+    },
+    {
+      "epoch": 0.05899807994414383,
+      "grad_norm": 0.8754634261131287,
+      "learning_rate": 5e-05,
+      "loss": 1.4471,
+      "step": 169
+    },
+    {
+      "epoch": 0.05934718100890208,
+      "grad_norm": 0.5537067651748657,
+      "learning_rate": 5e-05,
+      "loss": 1.2825,
+      "step": 170
+    },
+    {
+      "epoch": 0.059696282073660324,
+      "grad_norm": 0.6705783009529114,
+      "learning_rate": 5e-05,
+      "loss": 1.3768,
+      "step": 171
+    },
+    {
+      "epoch": 0.06004538313841857,
+      "grad_norm": 0.5732744932174683,
+      "learning_rate": 5e-05,
+      "loss": 1.3309,
+      "step": 172
+    },
+    {
+      "epoch": 0.06039448420317682,
+      "grad_norm": 1.120721459388733,
+      "learning_rate": 5e-05,
+      "loss": 1.3702,
+      "step": 173
+    },
+    {
+      "epoch": 0.060743585267935066,
+      "grad_norm": 0.7755718231201172,
+      "learning_rate": 5e-05,
+      "loss": 1.3425,
+      "step": 174
+    },
+    {
+      "epoch": 0.06109268633269332,
+      "grad_norm": 0.5984740257263184,
+      "learning_rate": 5e-05,
+      "loss": 1.4886,
+      "step": 175
+    },
+    {
+      "epoch": 0.06144178739745156,
+      "grad_norm": 0.7374542951583862,
+      "learning_rate": 5e-05,
+      "loss": 1.3667,
+      "step": 176
+    },
+    {
+      "epoch": 0.06179088846220981,
+      "grad_norm": 0.5558515787124634,
+      "learning_rate": 5e-05,
+      "loss": 1.3737,
+      "step": 177
+    },
+    {
+      "epoch": 0.06213998952696806,
+      "grad_norm": 0.700268566608429,
+      "learning_rate": 5e-05,
+      "loss": 1.364,
+      "step": 178
+    },
+    {
+      "epoch": 0.062489090591726304,
+      "grad_norm": 0.5781232118606567,
+      "learning_rate": 5e-05,
+      "loss": 1.3443,
+      "step": 179
+    },
+    {
+      "epoch": 0.06283819165648455,
+      "grad_norm": 0.7157448530197144,
+      "learning_rate": 5e-05,
+      "loss": 1.3702,
+      "step": 180
+    },
+    {
+      "epoch": 0.0631872927212428,
+      "grad_norm": 0.5329631567001343,
+      "learning_rate": 5e-05,
+      "loss": 1.1786,
+      "step": 181
+    },
+    {
+      "epoch": 0.06353639378600105,
+      "grad_norm": 0.5949011445045471,
+      "learning_rate": 5e-05,
+      "loss": 1.3809,
+      "step": 182
+    },
+    {
+      "epoch": 0.0638854948507593,
+      "grad_norm": 0.6756107807159424,
+      "learning_rate": 5e-05,
+      "loss": 1.2792,
+      "step": 183
+    },
+    {
+      "epoch": 0.06423459591551754,
+      "grad_norm": 0.7747790813446045,
+      "learning_rate": 5e-05,
+      "loss": 1.3714,
+      "step": 184
+    },
+    {
+      "epoch": 0.06458369698027579,
+      "grad_norm": 1.1907461881637573,
+      "learning_rate": 5e-05,
+      "loss": 1.3055,
+      "step": 185
+    },
+    {
+      "epoch": 0.06493279804503403,
+      "grad_norm": 0.5747818946838379,
+      "learning_rate": 5e-05,
+      "loss": 1.2003,
+      "step": 186
+    },
+    {
+      "epoch": 0.06528189910979229,
+      "grad_norm": 0.614464521408081,
+      "learning_rate": 5e-05,
+      "loss": 1.3108,
+      "step": 187
+    },
+    {
+      "epoch": 0.06563100017455054,
+      "grad_norm": 0.6040724515914917,
+      "learning_rate": 5e-05,
+      "loss": 1.2371,
+      "step": 188
+    },
+    {
+      "epoch": 0.06598010123930878,
+      "grad_norm": 0.6369174122810364,
+      "learning_rate": 5e-05,
+      "loss": 1.1662,
+      "step": 189
+    },
+    {
+      "epoch": 0.06632920230406703,
+      "grad_norm": 0.6132228374481201,
+      "learning_rate": 5e-05,
+      "loss": 1.3257,
+      "step": 190
+    },
+    {
+      "epoch": 0.06667830336882527,
+      "grad_norm": 0.6686124801635742,
+      "learning_rate": 5e-05,
+      "loss": 1.3757,
+      "step": 191
+    },
+    {
+      "epoch": 0.06702740443358353,
+      "grad_norm": 0.6709855794906616,
+      "learning_rate": 5e-05,
+      "loss": 1.3341,
+      "step": 192
+    },
+    {
+      "epoch": 0.06737650549834177,
+      "grad_norm": 0.5295905470848083,
+      "learning_rate": 5e-05,
+      "loss": 1.2587,
+      "step": 193
+    },
+    {
+      "epoch": 0.06772560656310002,
+      "grad_norm": 0.6111523509025574,
+      "learning_rate": 5e-05,
+      "loss": 1.3365,
+      "step": 194
+    },
+    {
+      "epoch": 0.06807470762785826,
+      "grad_norm": 0.5655878782272339,
+      "learning_rate": 5e-05,
+      "loss": 1.3265,
+      "step": 195
+    },
+    {
+      "epoch": 0.06842380869261651,
+      "grad_norm": 0.6125257015228271,
+      "learning_rate": 5e-05,
+      "loss": 1.3475,
+      "step": 196
+    },
+    {
+      "epoch": 0.06877290975737475,
+      "grad_norm": 0.6268573999404907,
+      "learning_rate": 5e-05,
+      "loss": 1.3002,
+      "step": 197
+    },
+    {
+      "epoch": 0.06912201082213301,
+      "grad_norm": 0.7267619967460632,
+      "learning_rate": 5e-05,
+      "loss": 1.4104,
+      "step": 198
+    },
+    {
+      "epoch": 0.06947111188689126,
+      "grad_norm": 0.5741710066795349,
+      "learning_rate": 5e-05,
+      "loss": 1.318,
+      "step": 199
+    },
+    {
+      "epoch": 0.0698202129516495,
+      "grad_norm": 0.6447280049324036,
+      "learning_rate": 5e-05,
+      "loss": 1.3477,
+      "step": 200
+    },
+    {
+      "epoch": 0.0698202129516495,
+      "eval_loss": 1.3300124406814575,
+      "eval_runtime": 3301.7334,
+      "eval_samples_per_second": 6.941,
+      "eval_steps_per_second": 0.868,
+      "step": 200
+    },
+    {
+      "epoch": 0.07016931401640775,
+      "grad_norm": 1.4164685010910034,
+      "learning_rate": 5e-05,
+      "loss": 1.4048,
+      "step": 201
+    },
+    {
+      "epoch": 0.07051841508116599,
+      "grad_norm": 0.5867809057235718,
+      "learning_rate": 5e-05,
+      "loss": 1.4018,
+      "step": 202
+    },
+    {
+      "epoch": 0.07086751614592425,
+      "grad_norm": 0.6882596611976624,
+      "learning_rate": 5e-05,
+      "loss": 1.2737,
+      "step": 203
+    },
+    {
+      "epoch": 0.0712166172106825,
+      "grad_norm": 0.6038634181022644,
+      "learning_rate": 5e-05,
+      "loss": 1.2399,
+      "step": 204
+    },
+    {
+      "epoch": 0.07156571827544074,
+      "grad_norm": 0.6428863406181335,
+      "learning_rate": 5e-05,
+      "loss": 1.3729,
+      "step": 205
+    },
+    {
+      "epoch": 0.07191481934019898,
+      "grad_norm": 0.7008076906204224,
+      "learning_rate": 5e-05,
+      "loss": 1.3353,
+      "step": 206
+    },
+    {
+      "epoch": 0.07226392040495723,
+      "grad_norm": 0.6662419438362122,
+      "learning_rate": 5e-05,
+      "loss": 1.3442,
+      "step": 207
+    },
+    {
+      "epoch": 0.07261302146971549,
+      "grad_norm": 0.7249788045883179,
+      "learning_rate": 5e-05,
+      "loss": 1.2526,
+      "step": 208
+    },
+    {
+      "epoch": 0.07296212253447373,
+      "grad_norm": 0.6323925852775574,
+      "learning_rate": 5e-05,
+      "loss": 1.2929,
+      "step": 209
+    },
+    {
+      "epoch": 0.07331122359923198,
+      "grad_norm": 0.8273724317550659,
+      "learning_rate": 5e-05,
+      "loss": 1.5291,
+      "step": 210
+    },
+    {
+      "epoch": 0.07366032466399022,
+      "grad_norm": 0.8445104956626892,
+      "learning_rate": 5e-05,
+      "loss": 1.2417,
+      "step": 211
+    },
+    {
+      "epoch": 0.07400942572874847,
+      "grad_norm": 0.6157236695289612,
+      "learning_rate": 5e-05,
+      "loss": 1.3739,
+      "step": 212
+    },
+    {
+      "epoch": 0.07435852679350673,
+      "grad_norm": 0.6917769312858582,
+      "learning_rate": 5e-05,
+      "loss": 1.3078,
+      "step": 213
+    },
+    {
+      "epoch": 0.07470762785826497,
+      "grad_norm": 0.7838917970657349,
+      "learning_rate": 5e-05,
+      "loss": 1.3086,
+      "step": 214
+    },
+    {
+      "epoch": 0.07505672892302322,
+      "grad_norm": 0.6962039470672607,
+      "learning_rate": 5e-05,
+      "loss": 1.3907,
+      "step": 215
+    },
+    {
+      "epoch": 0.07540582998778146,
+      "grad_norm": 0.6962039470672607,
+      "learning_rate": 5e-05,
+      "loss": 1.3615,
+      "step": 216
+    },
+    {
+      "epoch": 0.0757549310525397,
+      "grad_norm": 0.6687365770339966,
+      "learning_rate": 5e-05,
+      "loss": 1.3408,
+      "step": 217
+    },
+    {
+      "epoch": 0.07610403211729795,
+      "grad_norm": 0.5566404461860657,
+      "learning_rate": 5e-05,
+      "loss": 1.2872,
+      "step": 218
+    },
+    {
+      "epoch": 0.07645313318205621,
+      "grad_norm": 0.6419705748558044,
+      "learning_rate": 5e-05,
+      "loss": 1.2883,
+      "step": 219
+    },
+    {
+      "epoch": 0.07680223424681445,
+      "grad_norm": 0.7758398652076721,
+      "learning_rate": 5e-05,
+      "loss": 1.3832,
+      "step": 220
+    },
+    {
+      "epoch": 0.0771513353115727,
+      "grad_norm": 0.9763804078102112,
+      "learning_rate": 5e-05,
+      "loss": 1.3414,
+      "step": 221
+    },
+    {
+      "epoch": 0.07750043637633094,
+      "grad_norm": 0.8815904259681702,
+      "learning_rate": 5e-05,
+      "loss": 1.3297,
+      "step": 222
+    },
+    {
+      "epoch": 0.07784953744108919,
+      "grad_norm": 0.590263307094574,
+      "learning_rate": 5e-05,
+      "loss": 1.3401,
+      "step": 223
+    },
+    {
+      "epoch": 0.07819863850584745,
+      "grad_norm": 0.677057147026062,
+      "learning_rate": 5e-05,
+      "loss": 1.2449,
+      "step": 224
+    },
+    {
+      "epoch": 0.07854773957060569,
+      "grad_norm": 1.5185271501541138,
+      "learning_rate": 5e-05,
+      "loss": 1.3127,
+      "step": 225
+    },
+    {
+      "epoch": 0.07889684063536394,
+      "grad_norm": 0.5751495957374573,
+      "learning_rate": 5e-05,
+      "loss": 1.1587,
+      "step": 226
+    },
+    {
+      "epoch": 0.07924594170012218,
+      "grad_norm": 0.8122138977050781,
+      "learning_rate": 5e-05,
+      "loss": 1.2316,
+      "step": 227
+    },
+    {
+      "epoch": 0.07959504276488043,
+      "grad_norm": 0.6675130724906921,
+      "learning_rate": 5e-05,
+      "loss": 1.3539,
+      "step": 228
+    },
+    {
+      "epoch": 0.07994414382963869,
+      "grad_norm": 0.8163532614707947,
+      "learning_rate": 5e-05,
+      "loss": 1.328,
+      "step": 229
+    },
+    {
+      "epoch": 0.08029324489439693,
+      "grad_norm": 0.8377723693847656,
+      "learning_rate": 5e-05,
+      "loss": 1.353,
+      "step": 230
+    },
+    {
+      "epoch": 0.08064234595915518,
+      "grad_norm": 0.7325611710548401,
+      "learning_rate": 5e-05,
+      "loss": 1.3396,
+      "step": 231
+    },
+    {
+      "epoch": 0.08099144702391342,
+      "grad_norm": 0.8941824436187744,
+      "learning_rate": 5e-05,
+      "loss": 1.2906,
+      "step": 232
+    },
+    {
+      "epoch": 0.08134054808867167,
+      "grad_norm": 0.6284440159797668,
+      "learning_rate": 5e-05,
+      "loss": 1.4264,
+      "step": 233
+    },
+    {
+      "epoch": 0.08168964915342992,
+      "grad_norm": 0.689984917640686,
+      "learning_rate": 5e-05,
+      "loss": 1.3696,
+      "step": 234
+    },
+    {
+      "epoch": 0.08203875021818817,
+      "grad_norm": 0.5813177227973938,
+      "learning_rate": 5e-05,
+      "loss": 1.2931,
+      "step": 235
+    },
+    {
+      "epoch": 0.08238785128294641,
+      "grad_norm": 0.5287997126579285,
+      "learning_rate": 5e-05,
+      "loss": 1.3264,
+      "step": 236
+    },
+    {
+      "epoch": 0.08273695234770466,
+      "grad_norm": 0.7944268584251404,
+      "learning_rate": 5e-05,
+      "loss": 1.2708,
+      "step": 237
+    },
+    {
+      "epoch": 0.0830860534124629,
+      "grad_norm": 0.534864068031311,
+      "learning_rate": 5e-05,
+      "loss": 1.2535,
+      "step": 238
+    },
+    {
+      "epoch": 0.08343515447722115,
+      "grad_norm": 0.6260988712310791,
+      "learning_rate": 5e-05,
+      "loss": 1.2757,
+      "step": 239
+    },
+    {
+      "epoch": 0.08378425554197941,
+      "grad_norm": 0.579078197479248,
+      "learning_rate": 5e-05,
+      "loss": 1.2906,
+      "step": 240
+    },
+    {
+      "epoch": 0.08413335660673765,
+      "grad_norm": 0.5578561425209045,
+      "learning_rate": 5e-05,
+      "loss": 1.289,
+      "step": 241
+    },
+    {
+      "epoch": 0.0844824576714959,
+      "grad_norm": 0.626961350440979,
+      "learning_rate": 5e-05,
+      "loss": 1.2807,
+      "step": 242
+    },
+    {
+      "epoch": 0.08483155873625414,
+      "grad_norm": 0.782669186592102,
+      "learning_rate": 5e-05,
+      "loss": 1.3933,
+      "step": 243
+    },
+    {
+      "epoch": 0.08518065980101239,
+      "grad_norm": 0.6670363545417786,
+      "learning_rate": 5e-05,
+      "loss": 1.2732,
+      "step": 244
+    },
+    {
+      "epoch": 0.08552976086577065,
+      "grad_norm": 0.7201350331306458,
+      "learning_rate": 5e-05,
+      "loss": 1.2962,
+      "step": 245
+    },
+    {
+      "epoch": 0.08587886193052889,
+      "grad_norm": 0.6021212339401245,
+      "learning_rate": 5e-05,
+      "loss": 1.35,
+      "step": 246
+    },
+    {
+      "epoch": 0.08622796299528714,
+      "grad_norm": 0.8081540465354919,
+      "learning_rate": 5e-05,
+      "loss": 1.3568,
+      "step": 247
+    },
+    {
+      "epoch": 0.08657706406004538,
+      "grad_norm": 0.5358250737190247,
+      "learning_rate": 5e-05,
+      "loss": 1.4603,
+      "step": 248
+    },
+    {
+      "epoch": 0.08692616512480363,
+      "grad_norm": 0.6927733421325684,
+      "learning_rate": 5e-05,
+      "loss": 1.2506,
+      "step": 249
+    },
+    {
+      "epoch": 0.08727526618956188,
+      "grad_norm": 0.6187159419059753,
+      "learning_rate": 5e-05,
+      "loss": 1.3497,
+      "step": 250
+    },
+    {
+      "epoch": 0.08762436725432013,
+      "grad_norm": 0.6304159760475159,
+      "learning_rate": 5e-05,
+      "loss": 1.3087,
+      "step": 251
+    },
+    {
+      "epoch": 0.08797346831907837,
+      "grad_norm": 0.6446660161018372,
+      "learning_rate": 5e-05,
+      "loss": 1.3424,
+      "step": 252
+    },
+    {
+      "epoch": 0.08832256938383662,
+      "grad_norm": 0.6535473465919495,
+      "learning_rate": 5e-05,
+      "loss": 1.3471,
+      "step": 253
+    },
+    {
+      "epoch": 0.08867167044859486,
+      "grad_norm": 0.601290225982666,
+      "learning_rate": 5e-05,
+      "loss": 1.3557,
+      "step": 254
+    },
+    {
+      "epoch": 0.08902077151335312,
+      "grad_norm": 0.641854465007782,
+      "learning_rate": 5e-05,
+      "loss": 1.3138,
+      "step": 255
+    },
+    {
+      "epoch": 0.08936987257811137,
+      "grad_norm": 0.5452507138252258,
+      "learning_rate": 5e-05,
+      "loss": 1.2898,
+      "step": 256
+    },
+    {
+      "epoch": 0.08971897364286961,
+      "grad_norm": 0.5870373249053955,
+      "learning_rate": 5e-05,
+      "loss": 1.2953,
+      "step": 257
+    },
+    {
+      "epoch": 0.09006807470762786,
+      "grad_norm": 0.5798627734184265,
+      "learning_rate": 5e-05,
+      "loss": 1.2973,
+      "step": 258
+    },
+    {
+      "epoch": 0.0904171757723861,
+      "grad_norm": 0.5798627734184265,
+      "learning_rate": 5e-05,
+      "loss": 1.3628,
+      "step": 259
+    },
+    {
+      "epoch": 0.09076627683714435,
+      "grad_norm": 0.7382280230522156,
+      "learning_rate": 5e-05,
+      "loss": 1.3111,
+      "step": 260
+    },
+    {
+      "epoch": 0.0911153779019026,
+      "grad_norm": 0.6882988810539246,
+      "learning_rate": 5e-05,
+      "loss": 1.329,
+      "step": 261
+    },
+    {
+      "epoch": 0.09146447896666085,
+      "grad_norm": 0.6590788960456848,
+      "learning_rate": 5e-05,
+      "loss": 1.3089,
+      "step": 262
+    },
+    {
+      "epoch": 0.0918135800314191,
+      "grad_norm": 0.682006299495697,
+      "learning_rate": 5e-05,
+      "loss": 1.344,
+      "step": 263
+    },
+    {
+      "epoch": 0.09216268109617734,
+      "grad_norm": 0.6040222644805908,
+      "learning_rate": 5e-05,
+      "loss": 1.3919,
+      "step": 264
+    },
+    {
+      "epoch": 0.09251178216093559,
+      "grad_norm": 0.5964936017990112,
+      "learning_rate": 5e-05,
+      "loss": 1.3397,
+      "step": 265
+    },
+    {
+      "epoch": 0.09286088322569384,
+      "grad_norm": 0.5645217299461365,
+      "learning_rate": 5e-05,
+      "loss": 1.3488,
+      "step": 266
+    },
+    {
+      "epoch": 0.09320998429045209,
+      "grad_norm": 0.7771989703178406,
+      "learning_rate": 5e-05,
+      "loss": 1.3485,
+      "step": 267
+    },
+    {
+      "epoch": 0.09355908535521033,
+      "grad_norm": 0.6003885865211487,
+      "learning_rate": 5e-05,
+      "loss": 1.3109,
+      "step": 268
+    },
+    {
+      "epoch": 0.09390818641996858,
+      "grad_norm": 0.5627903938293457,
+      "learning_rate": 5e-05,
+      "loss": 1.2906,
+      "step": 269
+    },
+    {
+      "epoch": 0.09425728748472682,
+      "grad_norm": 0.6381875276565552,
+      "learning_rate": 5e-05,
+      "loss": 1.3063,
+      "step": 270
+    },
+    {
+      "epoch": 0.09460638854948508,
+      "grad_norm": 1.2558772563934326,
+      "learning_rate": 5e-05,
+      "loss": 1.2985,
+      "step": 271
+    },
+    {
+      "epoch": 0.09495548961424333,
+      "grad_norm": 0.6977007389068604,
+      "learning_rate": 5e-05,
+      "loss": 1.4955,
+      "step": 272
+    },
+    {
+      "epoch": 0.09530459067900157,
+      "grad_norm": 0.7846536040306091,
+      "learning_rate": 5e-05,
+      "loss": 1.4439,
+      "step": 273
+    },
+    {
+      "epoch": 0.09565369174375982,
+      "grad_norm": 0.7036994695663452,
+      "learning_rate": 5e-05,
+      "loss": 1.1942,
+      "step": 274
+    },
+    {
+      "epoch": 0.09600279280851806,
+      "grad_norm": 0.6119917631149292,
+      "learning_rate": 5e-05,
+      "loss": 1.3607,
+      "step": 275
+    },
+    {
+      "epoch": 0.09635189387327632,
+      "grad_norm": 0.6243535280227661,
+      "learning_rate": 5e-05,
+      "loss": 1.3029,
+      "step": 276
+    },
+    {
+      "epoch": 0.09670099493803457,
+      "grad_norm": 0.5424296855926514,
+      "learning_rate": 5e-05,
+      "loss": 1.2995,
+      "step": 277
+    },
+    {
+      "epoch": 0.09705009600279281,
+      "grad_norm": 0.7677564024925232,
+      "learning_rate": 5e-05,
+      "loss": 1.2686,
+      "step": 278
+    },
+    {
+      "epoch": 0.09739919706755105,
+      "grad_norm": 0.625275194644928,
+      "learning_rate": 5e-05,
+      "loss": 1.2897,
+      "step": 279
+    },
+    {
+      "epoch": 0.0977482981323093,
+      "grad_norm": 0.5734910368919373,
+      "learning_rate": 5e-05,
+      "loss": 1.3298,
+      "step": 280
+    },
+    {
+      "epoch": 0.09809739919706754,
+      "grad_norm": 0.660658061504364,
+      "learning_rate": 5e-05,
+      "loss": 1.2643,
+      "step": 281
+    },
+    {
+      "epoch": 0.0984465002618258,
+      "grad_norm": 0.679891049861908,
+      "learning_rate": 5e-05,
+      "loss": 1.3189,
+      "step": 282
+    },
+    {
+      "epoch": 0.09879560132658405,
+      "grad_norm": 0.6248694658279419,
+      "learning_rate": 5e-05,
+      "loss": 1.1688,
+      "step": 283
+    },
+    {
+      "epoch": 0.0991447023913423,
+      "grad_norm": 0.6428897380828857,
+      "learning_rate": 5e-05,
+      "loss": 1.3274,
+      "step": 284
+    },
+    {
+      "epoch": 0.09949380345610054,
+      "grad_norm": 0.586065411567688,
+      "learning_rate": 5e-05,
+      "loss": 1.3852,
+      "step": 285
+    },
+    {
+      "epoch": 0.09984290452085878,
+      "grad_norm": 0.5755594372749329,
+      "learning_rate": 5e-05,
+      "loss": 1.3665,
+      "step": 286
+    },
+    {
+      "epoch": 0.10019200558561704,
+      "grad_norm": 0.7748963236808777,
+      "learning_rate": 5e-05,
+      "loss": 1.4551,
+      "step": 287
+    },
+    {
+      "epoch": 0.10054110665037529,
+      "grad_norm": 0.6308531165122986,
+      "learning_rate": 5e-05,
+      "loss": 1.2793,
+      "step": 288
+    },
+    {
+      "epoch": 0.10089020771513353,
+      "grad_norm": 0.6195006966590881,
+      "learning_rate": 5e-05,
+      "loss": 1.3649,
+      "step": 289
+    },
+    {
+      "epoch": 0.10123930877989178,
+      "grad_norm": 0.6098636984825134,
+      "learning_rate": 5e-05,
+      "loss": 1.2956,
+      "step": 290
+    },
+    {
+      "epoch": 0.10158840984465002,
+      "grad_norm": 0.8072320818901062,
+      "learning_rate": 5e-05,
+      "loss": 1.3469,
+      "step": 291
+    },
+    {
+      "epoch": 0.10193751090940828,
+      "grad_norm": 0.6090126633644104,
+      "learning_rate": 5e-05,
+      "loss": 1.2958,
+      "step": 292
+    },
+    {
+      "epoch": 0.10228661197416652,
+      "grad_norm": 0.5718780159950256,
+      "learning_rate": 5e-05,
+      "loss": 1.363,
+      "step": 293
+    },
+    {
+      "epoch": 0.10263571303892477,
+      "grad_norm": 0.7197532653808594,
+      "learning_rate": 5e-05,
+      "loss": 1.3868,
+      "step": 294
+    },
+    {
+      "epoch": 0.10298481410368301,
+      "grad_norm": 0.5578592419624329,
+      "learning_rate": 5e-05,
+      "loss": 1.2627,
+      "step": 295
+    },
+    {
+      "epoch": 0.10333391516844126,
+      "grad_norm": 0.730226457118988,
+      "learning_rate": 5e-05,
+      "loss": 1.3182,
+      "step": 296
+    },
+    {
+      "epoch": 0.10368301623319952,
+      "grad_norm": 0.6234796047210693,
+      "learning_rate": 5e-05,
+      "loss": 1.1777,
+      "step": 297
+    },
+    {
+      "epoch": 0.10403211729795776,
+      "grad_norm": 0.5563578009605408,
+      "learning_rate": 5e-05,
+      "loss": 1.3275,
+      "step": 298
+    },
+    {
+      "epoch": 0.10438121836271601,
+      "grad_norm": 0.6864249110221863,
+      "learning_rate": 5e-05,
+      "loss": 1.2813,
+      "step": 299
+    },
+    {
+      "epoch": 0.10473031942747425,
+      "grad_norm": 0.8850319385528564,
+      "learning_rate": 5e-05,
+      "loss": 1.3057,
+      "step": 300
+    },
+    {
+      "epoch": 0.10473031942747425,
+      "eval_loss": 1.3255380392074585,
+      "eval_runtime": 3311.4237,
+      "eval_samples_per_second": 6.92,
+      "eval_steps_per_second": 0.865,
+      "step": 300
+    },
+    {
+      "epoch": 0.1050794204922325,
+      "grad_norm": 0.9439303278923035,
+      "learning_rate": 5e-05,
+      "loss": 1.281,
+      "step": 301
+    },
+    {
+      "epoch": 0.10542852155699074,
+      "grad_norm": 0.6651242971420288,
+      "learning_rate": 5e-05,
+      "loss": 1.3492,
+      "step": 302
+    },
+    {
+      "epoch": 0.105777622621749,
+      "grad_norm": 0.9047183394432068,
+      "learning_rate": 5e-05,
+      "loss": 1.4246,
+      "step": 303
+    },
+    {
+      "epoch": 0.10612672368650725,
+      "grad_norm": 0.6983138918876648,
+      "learning_rate": 5e-05,
+      "loss": 1.324,
+      "step": 304
+    },
+    {
+      "epoch": 0.10647582475126549,
+      "grad_norm": 0.6347063779830933,
+      "learning_rate": 5e-05,
+      "loss": 1.3389,
+      "step": 305
+    },
+    {
+      "epoch": 0.10682492581602374,
+      "grad_norm": 0.6051842570304871,
+      "learning_rate": 5e-05,
+      "loss": 1.3278,
+      "step": 306
+    },
+    {
+      "epoch": 0.10717402688078198,
+      "grad_norm": 0.9355935454368591,
+      "learning_rate": 5e-05,
+      "loss": 1.2663,
+      "step": 307
+    },
+    {
+      "epoch": 0.10752312794554024,
+      "grad_norm": 1.0706268548965454,
+      "learning_rate": 5e-05,
+      "loss": 1.3142,
+      "step": 308
+    },
+    {
+      "epoch": 0.10787222901029848,
+      "grad_norm": 0.8131638765335083,
+      "learning_rate": 5e-05,
+      "loss": 1.3445,
+      "step": 309
+    },
+    {
+      "epoch": 0.10822133007505673,
+      "grad_norm": 0.5791985392570496,
+      "learning_rate": 5e-05,
+      "loss": 1.2746,
+      "step": 310
+    },
+    {
+      "epoch": 0.10857043113981497,
+      "grad_norm": 0.5536484718322754,
+      "learning_rate": 5e-05,
+      "loss": 1.2613,
+      "step": 311
+    },
+    {
+      "epoch": 0.10891953220457322,
+      "grad_norm": 0.7847089767456055,
+      "learning_rate": 5e-05,
+      "loss": 1.4607,
+      "step": 312
+    },
+    {
+      "epoch": 0.10926863326933148,
+      "grad_norm": 0.7828165888786316,
+      "learning_rate": 5e-05,
+      "loss": 1.4399,
+      "step": 313
+    },
+    {
+      "epoch": 0.10961773433408972,
+      "grad_norm": 0.5692522525787354,
+      "learning_rate": 5e-05,
+      "loss": 1.3044,
+      "step": 314
+    },
+    {
+      "epoch": 0.10996683539884797,
+      "grad_norm": 0.5592648386955261,
+      "learning_rate": 5e-05,
+      "loss": 1.3211,
+      "step": 315
+    },
+    {
+      "epoch": 0.11031593646360621,
+      "grad_norm": 0.7055444717407227,
+      "learning_rate": 5e-05,
+      "loss": 1.2944,
+      "step": 316
+    },
+    {
+      "epoch": 0.11066503752836446,
+      "grad_norm": 0.5370152592658997,
+      "learning_rate": 5e-05,
+      "loss": 1.2776,
+      "step": 317
+    },
+    {
+      "epoch": 0.11101413859312272,
+      "grad_norm": 0.6320214867591858,
+      "learning_rate": 5e-05,
+      "loss": 1.347,
+      "step": 318
+    },
+    {
+      "epoch": 0.11136323965788096,
+      "grad_norm": 0.6425771713256836,
+      "learning_rate": 5e-05,
+      "loss": 1.5038,
+      "step": 319
+    },
+    {
+      "epoch": 0.1117123407226392,
+      "grad_norm": 0.585542619228363,
+      "learning_rate": 5e-05,
+      "loss": 1.3573,
+      "step": 320
+    },
+    {
+      "epoch": 0.11206144178739745,
+      "grad_norm": 0.5627699494361877,
+      "learning_rate": 5e-05,
+      "loss": 1.2693,
+      "step": 321
+    },
+    {
+      "epoch": 0.1124105428521557,
+      "grad_norm": 0.6050506830215454,
+      "learning_rate": 5e-05,
+      "loss": 1.2787,
+      "step": 322
+    },
+    {
+      "epoch": 0.11275964391691394,
+      "grad_norm": 0.6247337460517883,
+      "learning_rate": 5e-05,
+      "loss": 1.4146,
+      "step": 323
+    },
+    {
+      "epoch": 0.1131087449816722,
+      "grad_norm": 0.7732966542243958,
+      "learning_rate": 5e-05,
+      "loss": 1.2626,
+      "step": 324
+    },
+    {
+      "epoch": 0.11345784604643044,
+      "grad_norm": 0.5666255354881287,
+      "learning_rate": 5e-05,
+      "loss": 1.4219,
+      "step": 325
+    },
+    {
+      "epoch": 0.11380694711118869,
+      "grad_norm": 0.5973132848739624,
+      "learning_rate": 5e-05,
+      "loss": 1.3522,
+      "step": 326
+    },
+    {
+      "epoch": 0.11415604817594693,
+      "grad_norm": 0.8540626764297485,
+      "learning_rate": 5e-05,
+      "loss": 1.304,
+      "step": 327
+    },
+    {
+      "epoch": 0.11450514924070518,
+      "grad_norm": 0.574573278427124,
+      "learning_rate": 5e-05,
+      "loss": 1.3487,
+      "step": 328
+    },
+    {
+      "epoch": 0.11485425030546344,
+      "grad_norm": 0.5949917435646057,
+      "learning_rate": 5e-05,
+      "loss": 1.254,
+      "step": 329
+    },
+    {
+      "epoch": 0.11520335137022168,
+      "grad_norm": 0.6005589365959167,
+      "learning_rate": 5e-05,
+      "loss": 1.3073,
+      "step": 330
+    },
+    {
+      "epoch": 0.11555245243497993,
+      "grad_norm": 0.5026714205741882,
+      "learning_rate": 5e-05,
+      "loss": 1.2418,
+      "step": 331
+    },
+    {
+      "epoch": 0.11590155349973817,
+      "grad_norm": 0.7160278558731079,
+      "learning_rate": 5e-05,
+      "loss": 1.3437,
+      "step": 332
+    },
+    {
+      "epoch": 0.11625065456449642,
+      "grad_norm": 0.6049554347991943,
+      "learning_rate": 5e-05,
+      "loss": 1.4858,
+      "step": 333
+    },
+    {
+      "epoch": 0.11659975562925468,
+      "grad_norm": 0.7706385254859924,
+      "learning_rate": 5e-05,
+      "loss": 1.3971,
+      "step": 334
+    },
+    {
+      "epoch": 0.11694885669401292,
+      "grad_norm": 0.6254088282585144,
+      "learning_rate": 5e-05,
+      "loss": 1.3359,
+      "step": 335
+    },
+    {
+      "epoch": 0.11729795775877117,
+      "grad_norm": 0.5904930830001831,
+      "learning_rate": 5e-05,
+      "loss": 1.3262,
+      "step": 336
+    },
+    {
+      "epoch": 0.11764705882352941,
+      "grad_norm": 1.9982556104660034,
+      "learning_rate": 5e-05,
+      "loss": 1.3656,
+      "step": 337
+    },
+    {
+      "epoch": 0.11799615988828766,
+      "grad_norm": 0.5776758790016174,
+      "learning_rate": 5e-05,
+      "loss": 1.2654,
+      "step": 338
+    },
+    {
+      "epoch": 0.1183452609530459,
+      "grad_norm": 0.6094497442245483,
+      "learning_rate": 5e-05,
+      "loss": 1.3505,
+      "step": 339
+    },
+    {
+      "epoch": 0.11869436201780416,
+      "grad_norm": 0.9940481185913086,
+      "learning_rate": 5e-05,
+      "loss": 1.2853,
+      "step": 340
+    },
+    {
+      "epoch": 0.1190434630825624,
+      "grad_norm": 1.1043668985366821,
+      "learning_rate": 5e-05,
+      "loss": 1.2813,
+      "step": 341
+    },
+    {
+      "epoch": 0.11939256414732065,
+      "grad_norm": 0.5494128465652466,
+      "learning_rate": 5e-05,
+      "loss": 1.202,
+      "step": 342
+    },
+    {
+      "epoch": 0.1197416652120789,
+      "grad_norm": 0.6436132192611694,
+      "learning_rate": 5e-05,
+      "loss": 1.2898,
+      "step": 343
+    },
+    {
+      "epoch": 0.12009076627683714,
+      "grad_norm": 0.6878450512886047,
+      "learning_rate": 5e-05,
+      "loss": 1.3392,
+      "step": 344
+    },
+    {
+      "epoch": 0.1204398673415954,
+      "grad_norm": 0.5806905627250671,
+      "learning_rate": 5e-05,
+      "loss": 1.2221,
+      "step": 345
+    },
+    {
+      "epoch": 0.12078896840635364,
+      "grad_norm": 0.5916112065315247,
+      "learning_rate": 5e-05,
+      "loss": 1.2761,
+      "step": 346
+    },
+    {
+      "epoch": 0.12113806947111189,
+      "grad_norm": 0.5216647386550903,
+      "learning_rate": 5e-05,
+      "loss": 1.223,
+      "step": 347
+    },
+    {
+      "epoch": 0.12148717053587013,
+      "grad_norm": 0.707747220993042,
+      "learning_rate": 5e-05,
+      "loss": 1.2933,
+      "step": 348
+    },
+    {
+      "epoch": 0.12183627160062838,
+      "grad_norm": 0.6644443273544312,
+      "learning_rate": 5e-05,
+      "loss": 1.3367,
+      "step": 349
+    },
+    {
+      "epoch": 0.12218537266538664,
+      "grad_norm": 0.7112720012664795,
+      "learning_rate": 5e-05,
+      "loss": 1.2368,
+      "step": 350
+    },
+    {
+      "epoch": 0.12253447373014488,
+      "grad_norm": 0.6551552414894104,
+      "learning_rate": 5e-05,
+      "loss": 1.3348,
+      "step": 351
+    },
+    {
+      "epoch": 0.12288357479490312,
+      "grad_norm": 0.5377748012542725,
+      "learning_rate": 5e-05,
+      "loss": 1.2859,
+      "step": 352
+    },
+    {
+      "epoch": 0.12323267585966137,
+      "grad_norm": 0.580769956111908,
+      "learning_rate": 5e-05,
+      "loss": 1.2442,
+      "step": 353
+    },
+    {
+      "epoch": 0.12358177692441961,
+      "grad_norm": 0.6772916316986084,
+      "learning_rate": 5e-05,
+      "loss": 1.2994,
+      "step": 354
+    },
+    {
+      "epoch": 0.12393087798917787,
+      "grad_norm": 0.6245989799499512,
+      "learning_rate": 5e-05,
+      "loss": 1.2093,
+      "step": 355
+    },
+    {
+      "epoch": 0.12427997905393612,
+      "grad_norm": 0.6136452555656433,
+      "learning_rate": 5e-05,
+      "loss": 1.2258,
+      "step": 356
+    },
+    {
+      "epoch": 0.12462908011869436,
+      "grad_norm": 0.5786277055740356,
+      "learning_rate": 5e-05,
+      "loss": 1.2856,
+      "step": 357
+    },
+    {
+      "epoch": 0.12497818118345261,
+      "grad_norm": 0.5986611247062683,
+      "learning_rate": 5e-05,
+      "loss": 1.4524,
+      "step": 358
+    },
+    {
+      "epoch": 0.12532728224821085,
+      "grad_norm": 0.6240454316139221,
+      "learning_rate": 5e-05,
+      "loss": 1.3325,
+      "step": 359
+    },
+    {
+      "epoch": 0.1256763833129691,
+      "grad_norm": 0.6426084041595459,
+      "learning_rate": 5e-05,
+      "loss": 1.219,
+      "step": 360
+    },
+    {
+      "epoch": 0.12602548437772734,
+      "grad_norm": 0.6227401494979858,
+      "learning_rate": 5e-05,
+      "loss": 1.3342,
+      "step": 361
+    },
+    {
+      "epoch": 0.1263745854424856,
+      "grad_norm": 0.7462456226348877,
+      "learning_rate": 5e-05,
+      "loss": 1.3747,
+      "step": 362
+    },
+    {
+      "epoch": 0.12672368650724386,
+      "grad_norm": 0.7022641897201538,
+      "learning_rate": 5e-05,
+      "loss": 1.2957,
+      "step": 363
+    },
+    {
+      "epoch": 0.1270727875720021,
+      "grad_norm": 0.657645046710968,
+      "learning_rate": 5e-05,
+      "loss": 1.3125,
+      "step": 364
+    },
+    {
+      "epoch": 0.12742188863676035,
+      "grad_norm": 0.662497878074646,
+      "learning_rate": 5e-05,
+      "loss": 1.321,
+      "step": 365
+    },
+    {
+      "epoch": 0.1277709897015186,
+      "grad_norm": 0.6295817494392395,
+      "learning_rate": 5e-05,
+      "loss": 1.3814,
+      "step": 366
+    },
+    {
+      "epoch": 0.12812009076627684,
+      "grad_norm": 0.7357390522956848,
+      "learning_rate": 5e-05,
+      "loss": 1.374,
+      "step": 367
+    },
+    {
+      "epoch": 0.12846919183103508,
+      "grad_norm": 0.6728739142417908,
+      "learning_rate": 5e-05,
+      "loss": 1.1957,
+      "step": 368
+    },
+    {
+      "epoch": 0.12881829289579333,
+      "grad_norm": 0.6290231943130493,
+      "learning_rate": 5e-05,
+      "loss": 1.2948,
+      "step": 369
+    },
+    {
+      "epoch": 0.12916739396055157,
+      "grad_norm": 1.0889554023742676,
+      "learning_rate": 5e-05,
+      "loss": 1.3465,
+      "step": 370
+    },
+    {
+      "epoch": 0.12951649502530982,
+      "grad_norm": 0.6978388428688049,
+      "learning_rate": 5e-05,
+      "loss": 1.2898,
+      "step": 371
+    },
+    {
+      "epoch": 0.12986559609006806,
+      "grad_norm": 1.0806949138641357,
+      "learning_rate": 5e-05,
+      "loss": 1.2656,
+      "step": 372
+    },
+    {
+      "epoch": 0.1302146971548263,
+      "grad_norm": 0.5989696979522705,
+      "learning_rate": 5e-05,
+      "loss": 1.354,
+      "step": 373
+    },
+    {
+      "epoch": 0.13056379821958458,
+      "grad_norm": 0.5808868408203125,
+      "learning_rate": 5e-05,
+      "loss": 1.2911,
+      "step": 374
+    },
+    {
+      "epoch": 0.13091289928434283,
+      "grad_norm": 0.6175510883331299,
+      "learning_rate": 5e-05,
+      "loss": 1.3392,
+      "step": 375
+    },
+    {
+      "epoch": 0.13126200034910107,
+      "grad_norm": 0.7896063923835754,
+      "learning_rate": 5e-05,
+      "loss": 1.3598,
+      "step": 376
+    },
+    {
+      "epoch": 0.13161110141385932,
+      "grad_norm": 0.6890353560447693,
+      "learning_rate": 5e-05,
+      "loss": 1.2259,
+      "step": 377
+    },
+    {
+      "epoch": 0.13196020247861756,
+      "grad_norm": 0.7264868021011353,
+      "learning_rate": 5e-05,
+      "loss": 1.3747,
+      "step": 378
+    },
+    {
+      "epoch": 0.1323093035433758,
+      "grad_norm": 0.5779114365577698,
+      "learning_rate": 5e-05,
+      "loss": 1.2566,
+      "step": 379
+    },
+    {
+      "epoch": 0.13265840460813405,
+      "grad_norm": 0.6164990067481995,
+      "learning_rate": 5e-05,
+      "loss": 1.3123,
+      "step": 380
+    },
+    {
+      "epoch": 0.1330075056728923,
+      "grad_norm": 0.5990901589393616,
+      "learning_rate": 5e-05,
+      "loss": 1.399,
+      "step": 381
+    },
+    {
+      "epoch": 0.13335660673765054,
+      "grad_norm": 0.5799390077590942,
+      "learning_rate": 5e-05,
+      "loss": 1.2697,
+      "step": 382
+    },
+    {
+      "epoch": 0.13370570780240879,
+      "grad_norm": 0.6446252465248108,
+      "learning_rate": 5e-05,
+      "loss": 1.3321,
+      "step": 383
+    },
+    {
+      "epoch": 0.13405480886716706,
+      "grad_norm": 0.5626406669616699,
+      "learning_rate": 5e-05,
+      "loss": 1.2867,
+      "step": 384
+    },
+    {
+      "epoch": 0.1344039099319253,
+      "grad_norm": 0.5967420935630798,
+      "learning_rate": 5e-05,
+      "loss": 1.3514,
+      "step": 385
+    },
+    {
+      "epoch": 0.13475301099668355,
+      "grad_norm": 0.622344434261322,
+      "learning_rate": 5e-05,
+      "loss": 1.2814,
+      "step": 386
+    },
+    {
+      "epoch": 0.1351021120614418,
+      "grad_norm": 0.5952975749969482,
+      "learning_rate": 5e-05,
+      "loss": 1.3616,
+      "step": 387
+    },
+    {
+      "epoch": 0.13545121312620004,
+      "grad_norm": 1.6270025968551636,
+      "learning_rate": 5e-05,
+      "loss": 1.3057,
+      "step": 388
+    },
+    {
+      "epoch": 0.13580031419095828,
+      "grad_norm": 0.6453176736831665,
+      "learning_rate": 5e-05,
+      "loss": 1.2203,
+      "step": 389
+    },
+    {
+      "epoch": 0.13614941525571653,
+      "grad_norm": 0.6074663400650024,
+      "learning_rate": 5e-05,
+      "loss": 1.2705,
+      "step": 390
+    },
+    {
+      "epoch": 0.13649851632047477,
+      "grad_norm": 0.5617640018463135,
+      "learning_rate": 5e-05,
+      "loss": 1.2692,
+      "step": 391
+    },
+    {
+      "epoch": 0.13684761738523302,
+      "grad_norm": 0.5138052701950073,
+      "learning_rate": 5e-05,
+      "loss": 1.2914,
+      "step": 392
+    },
+    {
+      "epoch": 0.13719671844999126,
+      "grad_norm": 0.6522411108016968,
+      "learning_rate": 5e-05,
+      "loss": 1.3055,
+      "step": 393
+    },
+    {
+      "epoch": 0.1375458195147495,
+      "grad_norm": 0.6821246147155762,
+      "learning_rate": 5e-05,
+      "loss": 1.2674,
+      "step": 394
+    },
+    {
+      "epoch": 0.13789492057950778,
+      "grad_norm": 0.6284828186035156,
+      "learning_rate": 5e-05,
+      "loss": 1.2842,
+      "step": 395
+    },
+    {
+      "epoch": 0.13824402164426602,
+      "grad_norm": 0.6461937427520752,
+      "learning_rate": 5e-05,
+      "loss": 1.305,
+      "step": 396
+    },
+    {
+      "epoch": 0.13859312270902427,
+      "grad_norm": 0.8084800243377686,
+      "learning_rate": 5e-05,
+      "loss": 1.3539,
+      "step": 397
+    },
+    {
+      "epoch": 0.1389422237737825,
+      "grad_norm": 0.5511135458946228,
+      "learning_rate": 5e-05,
+      "loss": 1.2364,
+      "step": 398
+    },
+    {
+      "epoch": 0.13929132483854076,
+      "grad_norm": 0.6121107339859009,
+      "learning_rate": 5e-05,
+      "loss": 1.3212,
+      "step": 399
+    },
+    {
+      "epoch": 0.139640425903299,
+      "grad_norm": 0.5705773234367371,
+      "learning_rate": 5e-05,
+      "loss": 1.3116,
+      "step": 400
+    },
+    {
+      "epoch": 0.139640425903299,
+      "eval_loss": 1.322394609451294,
+      "eval_runtime": 3311.45,
+      "eval_samples_per_second": 6.92,
+      "eval_steps_per_second": 0.865,
+      "step": 400
+    },
+    {
+      "epoch": 0.139640425903299,
+      "step": 400,
+      "total_flos": 8.590417732871127e+17,
+      "train_loss": 1.3312159395217895,
+      "train_runtime": 17991.8527,
+      "train_samples_per_second": 1.779,
+      "train_steps_per_second": 0.056
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 8.590417732871127e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}