Training in progress, step 420, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +991 -3

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c9d19d032b2720552ed5a8c04c8453d710ed0eed172ae313734cb428d3f003fc
 size 80013120

 version https://git-lfs.github.com/spec/v1
+oid sha256:4b543d9e019f6d5c3cd652914901b2739520d85b9e6044fe4d75f753c8dd4dc9
 size 80013120

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:08e5683a29463e32746f14f186f042dd447b12cafcad678bbbddb34b9249098a
 size 41120084

 version https://git-lfs.github.com/spec/v1
+oid sha256:14c61ad2090140d21c6df9e0a61cd6d3225a4e43d63a4283da89db183775f6ae
 size 41120084

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3369e2942ff752b68da734b9eaf1a12b8c42e1d8b80214950313c71f22a426be
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:c08654f8daac8b1091c235d2fb6bd8b249208c723b2dd501bc93b7ca776f4cba
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9fe9c01b8c53647998de80cbc88fe3102f7ee94466c3d3ba6db0d6d4b3bdc06d
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:24b34984058cd5169df3c13d6905d5f65a7c10a7cf4235e831bb570e73473147
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.5,
   "eval_steps": 140,
-  "global_step": 280,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1983,6 +1983,994 @@
       "eval_samples_per_second": 16.888,
       "eval_steps_per_second": 8.444,
       "step": 280
     }
   ],
   "logging_steps": 1,
@@ -2002,7 +2990,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 9.129139501635994e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.75,
   "eval_steps": 140,
+  "global_step": 420,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 16.888,
       "eval_steps_per_second": 8.444,
       "step": 280
+    },
+    {
+      "epoch": 0.5017857142857143,
+      "grad_norm": 0.3074837923049927,
+      "learning_rate": 0.00010228459587429497,
+      "loss": 1.1389,
+      "step": 281
+    },
+    {
+      "epoch": 0.5035714285714286,
+      "grad_norm": 0.3143484592437744,
+      "learning_rate": 0.00010171351213038993,
+      "loss": 0.9542,
+      "step": 282
+    },
+    {
+      "epoch": 0.5053571428571428,
+      "grad_norm": 0.3524804413318634,
+      "learning_rate": 0.00010114237248023404,
+      "loss": 0.8578,
+      "step": 283
+    },
+    {
+      "epoch": 0.5071428571428571,
+      "grad_norm": 0.335183322429657,
+      "learning_rate": 0.00010057119555823085,
+      "loss": 0.9228,
+      "step": 284
+    },
+    {
+      "epoch": 0.5089285714285714,
+      "grad_norm": 0.35537081956863403,
+      "learning_rate": 0.0001,
+      "loss": 0.9541,
+      "step": 285
+    },
+    {
+      "epoch": 0.5107142857142857,
+      "grad_norm": 0.3294563591480255,
+      "learning_rate": 9.942880444176918e-05,
+      "loss": 1.4223,
+      "step": 286
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.32628077268600464,
+      "learning_rate": 9.8857627519766e-05,
+      "loss": 1.2942,
+      "step": 287
+    },
+    {
+      "epoch": 0.5142857142857142,
+      "grad_norm": 0.32195404171943665,
+      "learning_rate": 9.828648786961008e-05,
+      "loss": 1.4239,
+      "step": 288
+    },
+    {
+      "epoch": 0.5160714285714286,
+      "grad_norm": 0.3283804655075073,
+      "learning_rate": 9.771540412570504e-05,
+      "loss": 1.0303,
+      "step": 289
+    },
+    {
+      "epoch": 0.5178571428571429,
+      "grad_norm": 0.356052428483963,
+      "learning_rate": 9.71443949206304e-05,
+      "loss": 1.0199,
+      "step": 290
+    },
+    {
+      "epoch": 0.5196428571428572,
+      "grad_norm": 0.39479124546051025,
+      "learning_rate": 9.657347888453367e-05,
+      "loss": 1.1343,
+      "step": 291
+    },
+    {
+      "epoch": 0.5214285714285715,
+      "grad_norm": 0.34791451692581177,
+      "learning_rate": 9.60026746445227e-05,
+      "loss": 1.521,
+      "step": 292
+    },
+    {
+      "epoch": 0.5232142857142857,
+      "grad_norm": 0.3614530861377716,
+      "learning_rate": 9.543200082405768e-05,
+      "loss": 1.2346,
+      "step": 293
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.36958596110343933,
+      "learning_rate": 9.486147604234371e-05,
+      "loss": 1.0457,
+      "step": 294
+    },
+    {
+      "epoch": 0.5267857142857143,
+      "grad_norm": 0.4293418824672699,
+      "learning_rate": 9.42911189137232e-05,
+      "loss": 1.1071,
+      "step": 295
+    },
+    {
+      "epoch": 0.5285714285714286,
+      "grad_norm": 0.40408602356910706,
+      "learning_rate": 9.372094804706867e-05,
+      "loss": 1.3805,
+      "step": 296
+    },
+    {
+      "epoch": 0.5303571428571429,
+      "grad_norm": 0.3892784118652344,
+      "learning_rate": 9.315098204517543e-05,
+      "loss": 0.9136,
+      "step": 297
+    },
+    {
+      "epoch": 0.5321428571428571,
+      "grad_norm": 0.4003988206386566,
+      "learning_rate": 9.258123950415479e-05,
+      "loss": 1.3684,
+      "step": 298
+    },
+    {
+      "epoch": 0.5339285714285714,
+      "grad_norm": 0.37116503715515137,
+      "learning_rate": 9.201173901282724e-05,
+      "loss": 1.7824,
+      "step": 299
+    },
+    {
+      "epoch": 0.5357142857142857,
+      "grad_norm": 0.5286569595336914,
+      "learning_rate": 9.144249915211605e-05,
+      "loss": 1.9392,
+      "step": 300
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.18338526785373688,
+      "learning_rate": 9.087353849444085e-05,
+      "loss": 1.4422,
+      "step": 301
+    },
+    {
+      "epoch": 0.5392857142857143,
+      "grad_norm": 0.20191389322280884,
+      "learning_rate": 9.030487560311186e-05,
+      "loss": 1.5443,
+      "step": 302
+    },
+    {
+      "epoch": 0.5410714285714285,
+      "grad_norm": 0.19955602288246155,
+      "learning_rate": 8.973652903172423e-05,
+      "loss": 1.6521,
+      "step": 303
+    },
+    {
+      "epoch": 0.5428571428571428,
+      "grad_norm": 0.2253991812467575,
+      "learning_rate": 8.916851732355255e-05,
+      "loss": 1.6596,
+      "step": 304
+    },
+    {
+      "epoch": 0.5446428571428571,
+      "grad_norm": 0.18380412459373474,
+      "learning_rate": 8.860085901094595e-05,
+      "loss": 1.5462,
+      "step": 305
+    },
+    {
+      "epoch": 0.5464285714285714,
+      "grad_norm": 0.21318556368350983,
+      "learning_rate": 8.803357261472343e-05,
+      "loss": 1.6713,
+      "step": 306
+    },
+    {
+      "epoch": 0.5482142857142858,
+      "grad_norm": 0.20480670034885406,
+      "learning_rate": 8.746667664356956e-05,
+      "loss": 1.7537,
+      "step": 307
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.21598470211029053,
+      "learning_rate": 8.690018959343072e-05,
+      "loss": 1.6955,
+      "step": 308
+    },
+    {
+      "epoch": 0.5517857142857143,
+      "grad_norm": 0.21172207593917847,
+      "learning_rate": 8.633412994691144e-05,
+      "loss": 1.7187,
+      "step": 309
+    },
+    {
+      "epoch": 0.5535714285714286,
+      "grad_norm": 0.22086191177368164,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 1.7579,
+      "step": 310
+    },
+    {
+      "epoch": 0.5553571428571429,
+      "grad_norm": 0.2266392558813095,
+      "learning_rate": 8.520336672482338e-05,
+      "loss": 1.7486,
+      "step": 311
+    },
+    {
+      "epoch": 0.5571428571428572,
+      "grad_norm": 0.2660948634147644,
+      "learning_rate": 8.463870004233008e-05,
+      "loss": 1.7903,
+      "step": 312
+    },
+    {
+      "epoch": 0.5589285714285714,
+      "grad_norm": 0.2297395020723343,
+      "learning_rate": 8.407453454840357e-05,
+      "loss": 1.8017,
+      "step": 313
+    },
+    {
+      "epoch": 0.5607142857142857,
+      "grad_norm": 0.21526682376861572,
+      "learning_rate": 8.351088864990368e-05,
+      "loss": 1.855,
+      "step": 314
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.2516147196292877,
+      "learning_rate": 8.294778073673762e-05,
+      "loss": 1.7103,
+      "step": 315
+    },
+    {
+      "epoch": 0.5642857142857143,
+      "grad_norm": 0.25641700625419617,
+      "learning_rate": 8.238522918125983e-05,
+      "loss": 1.9301,
+      "step": 316
+    },
+    {
+      "epoch": 0.5660714285714286,
+      "grad_norm": 0.26828062534332275,
+      "learning_rate": 8.182325233767267e-05,
+      "loss": 1.8575,
+      "step": 317
+    },
+    {
+      "epoch": 0.5678571428571428,
+      "grad_norm": 0.24787884950637817,
+      "learning_rate": 8.126186854142752e-05,
+      "loss": 2.0228,
+      "step": 318
+    },
+    {
+      "epoch": 0.5696428571428571,
+      "grad_norm": 0.23658955097198486,
+      "learning_rate": 8.070109610862668e-05,
+      "loss": 1.7813,
+      "step": 319
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.2818485498428345,
+      "learning_rate": 8.014095333542548e-05,
+      "loss": 1.7571,
+      "step": 320
+    },
+    {
+      "epoch": 0.5732142857142857,
+      "grad_norm": 0.24982373416423798,
+      "learning_rate": 7.958145849743569e-05,
+      "loss": 1.602,
+      "step": 321
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.2815864682197571,
+      "learning_rate": 7.902262984912909e-05,
+      "loss": 1.7216,
+      "step": 322
+    },
+    {
+      "epoch": 0.5767857142857142,
+      "grad_norm": 0.2675464451313019,
+      "learning_rate": 7.846448562324183e-05,
+      "loss": 1.0704,
+      "step": 323
+    },
+    {
+      "epoch": 0.5785714285714286,
+      "grad_norm": 0.23938840627670288,
+      "learning_rate": 7.79070440301796e-05,
+      "loss": 1.282,
+      "step": 324
+    },
+    {
+      "epoch": 0.5803571428571429,
+      "grad_norm": 0.2834213972091675,
+      "learning_rate": 7.735032325742355e-05,
+      "loss": 1.7934,
+      "step": 325
+    },
+    {
+      "epoch": 0.5821428571428572,
+      "grad_norm": 0.3555513918399811,
+      "learning_rate": 7.679434146893685e-05,
+      "loss": 1.2089,
+      "step": 326
+    },
+    {
+      "epoch": 0.5839285714285715,
+      "grad_norm": 0.3254348933696747,
+      "learning_rate": 7.623911680457198e-05,
+      "loss": 1.0845,
+      "step": 327
+    },
+    {
+      "epoch": 0.5857142857142857,
+      "grad_norm": 0.3558744192123413,
+      "learning_rate": 7.568466737947905e-05,
+      "loss": 1.2339,
+      "step": 328
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.32738080620765686,
+      "learning_rate": 7.513101128351454e-05,
+      "loss": 0.8492,
+      "step": 329
+    },
+    {
+      "epoch": 0.5892857142857143,
+      "grad_norm": 0.36939212679862976,
+      "learning_rate": 7.457816658065134e-05,
+      "loss": 0.8973,
+      "step": 330
+    },
+    {
+      "epoch": 0.5910714285714286,
+      "grad_norm": 0.3393215835094452,
+      "learning_rate": 7.402615130838917e-05,
+      "loss": 1.0078,
+      "step": 331
+    },
+    {
+      "epoch": 0.5928571428571429,
+      "grad_norm": 0.402725487947464,
+      "learning_rate": 7.347498347716624e-05,
+      "loss": 1.2381,
+      "step": 332
+    },
+    {
+      "epoch": 0.5946428571428571,
+      "grad_norm": 0.33164989948272705,
+      "learning_rate": 7.292468106977148e-05,
+      "loss": 1.197,
+      "step": 333
+    },
+    {
+      "epoch": 0.5964285714285714,
+      "grad_norm": 0.3454689681529999,
+      "learning_rate": 7.237526204075797e-05,
+      "loss": 1.0244,
+      "step": 334
+    },
+    {
+      "epoch": 0.5982142857142857,
+      "grad_norm": 0.3584868907928467,
+      "learning_rate": 7.182674431585704e-05,
+      "loss": 1.062,
+      "step": 335
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.3318020701408386,
+      "learning_rate": 7.127914579139338e-05,
+      "loss": 1.3696,
+      "step": 336
+    },
+    {
+      "epoch": 0.6017857142857143,
+      "grad_norm": 0.38772639632225037,
+      "learning_rate": 7.073248433370124e-05,
+      "loss": 1.1725,
+      "step": 337
+    },
+    {
+      "epoch": 0.6035714285714285,
+      "grad_norm": 0.33527833223342896,
+      "learning_rate": 7.018677777854157e-05,
+      "loss": 1.0849,
+      "step": 338
+    },
+    {
+      "epoch": 0.6053571428571428,
+      "grad_norm": 0.3958960473537445,
+      "learning_rate": 6.964204393051981e-05,
+      "loss": 0.8494,
+      "step": 339
+    },
+    {
+      "epoch": 0.6071428571428571,
+      "grad_norm": 0.3545495569705963,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.843,
+      "step": 340
+    },
+    {
+      "epoch": 0.6089285714285714,
+      "grad_norm": 0.3475857675075531,
+      "learning_rate": 6.855556541505122e-05,
+      "loss": 1.1228,
+      "step": 341
+    },
+    {
+      "epoch": 0.6107142857142858,
+      "grad_norm": 0.4085654616355896,
+      "learning_rate": 6.801385619581592e-05,
+      "loss": 0.8092,
+      "step": 342
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.4605577886104584,
+      "learning_rate": 6.747319057898503e-05,
+      "loss": 1.0999,
+      "step": 343
+    },
+    {
+      "epoch": 0.6142857142857143,
+      "grad_norm": 0.3840469717979431,
+      "learning_rate": 6.693358620469487e-05,
+      "loss": 1.2712,
+      "step": 344
+    },
+    {
+      "epoch": 0.6160714285714286,
+      "grad_norm": 0.3712100684642792,
+      "learning_rate": 6.639506067845697e-05,
+      "loss": 1.1401,
+      "step": 345
+    },
+    {
+      "epoch": 0.6178571428571429,
+      "grad_norm": 0.41670674085617065,
+      "learning_rate": 6.585763157058358e-05,
+      "loss": 1.151,
+      "step": 346
+    },
+    {
+      "epoch": 0.6196428571428572,
+      "grad_norm": 0.5912812352180481,
+      "learning_rate": 6.53213164156144e-05,
+      "loss": 1.447,
+      "step": 347
+    },
+    {
+      "epoch": 0.6214285714285714,
+      "grad_norm": 0.3995843231678009,
+      "learning_rate": 6.478613271174453e-05,
+      "loss": 1.6645,
+      "step": 348
+    },
+    {
+      "epoch": 0.6232142857142857,
+      "grad_norm": 0.5337942242622375,
+      "learning_rate": 6.425209792025358e-05,
+      "loss": 1.8703,
+      "step": 349
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.5726771354675293,
+      "learning_rate": 6.371922946493591e-05,
+      "loss": 1.9795,
+      "step": 350
+    },
+    {
+      "epoch": 0.6267857142857143,
+      "grad_norm": 0.17994743585586548,
+      "learning_rate": 6.318754473153221e-05,
+      "loss": 1.5463,
+      "step": 351
+    },
+    {
+      "epoch": 0.6285714285714286,
+      "grad_norm": 0.16961722075939178,
+      "learning_rate": 6.26570610671622e-05,
+      "loss": 1.5388,
+      "step": 352
+    },
+    {
+      "epoch": 0.6303571428571428,
+      "grad_norm": 0.19410116970539093,
+      "learning_rate": 6.21277957797587e-05,
+      "loss": 1.7282,
+      "step": 353
+    },
+    {
+      "epoch": 0.6321428571428571,
+      "grad_norm": 0.1906638890504837,
+      "learning_rate": 6.159976613750286e-05,
+      "loss": 1.5167,
+      "step": 354
+    },
+    {
+      "epoch": 0.6339285714285714,
+      "grad_norm": 0.21918904781341553,
+      "learning_rate": 6.107298936826086e-05,
+      "loss": 1.7446,
+      "step": 355
+    },
+    {
+      "epoch": 0.6357142857142857,
+      "grad_norm": 0.19429104030132294,
+      "learning_rate": 6.0547482659021706e-05,
+      "loss": 1.7166,
+      "step": 356
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.21244099736213684,
+      "learning_rate": 6.002326315533665e-05,
+      "loss": 1.7319,
+      "step": 357
+    },
+    {
+      "epoch": 0.6392857142857142,
+      "grad_norm": 0.22793884575366974,
+      "learning_rate": 5.950034796075947e-05,
+      "loss": 1.6573,
+      "step": 358
+    },
+    {
+      "epoch": 0.6410714285714286,
+      "grad_norm": 0.23558048903942108,
+      "learning_rate": 5.897875413628884e-05,
+      "loss": 1.7359,
+      "step": 359
+    },
+    {
+      "epoch": 0.6428571428571429,
+      "grad_norm": 0.21951396763324738,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 1.6536,
+      "step": 360
+    },
+    {
+      "epoch": 0.6446428571428572,
+      "grad_norm": 0.22028475999832153,
+      "learning_rate": 5.793959862554652e-05,
+      "loss": 1.7257,
+      "step": 361
+    },
+    {
+      "epoch": 0.6464285714285715,
+      "grad_norm": 0.23070622980594635,
+      "learning_rate": 5.7422070843492734e-05,
+      "loss": 1.6149,
+      "step": 362
+    },
+    {
+      "epoch": 0.6482142857142857,
+      "grad_norm": 0.22408127784729004,
+      "learning_rate": 5.6905932238875123e-05,
+      "loss": 1.5936,
+      "step": 363
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.23098969459533691,
+      "learning_rate": 5.639119965159446e-05,
+      "loss": 1.7313,
+      "step": 364
+    },
+    {
+      "epoch": 0.6517857142857143,
+      "grad_norm": 0.2533377707004547,
+      "learning_rate": 5.5877889875677845e-05,
+      "loss": 1.9422,
+      "step": 365
+    },
+    {
+      "epoch": 0.6535714285714286,
+      "grad_norm": 0.2515897750854492,
+      "learning_rate": 5.5366019658730825e-05,
+      "loss": 1.8331,
+      "step": 366
+    },
+    {
+      "epoch": 0.6553571428571429,
+      "grad_norm": 0.2757863700389862,
+      "learning_rate": 5.485560570139061e-05,
+      "loss": 1.7759,
+      "step": 367
+    },
+    {
+      "epoch": 0.6571428571428571,
+      "grad_norm": 0.29774826765060425,
+      "learning_rate": 5.434666465678175e-05,
+      "loss": 1.334,
+      "step": 368
+    },
+    {
+      "epoch": 0.6589285714285714,
+      "grad_norm": 0.28664523363113403,
+      "learning_rate": 5.383921312997242e-05,
+      "loss": 1.6234,
+      "step": 369
+    },
+    {
+      "epoch": 0.6607142857142857,
+      "grad_norm": 0.30774933099746704,
+      "learning_rate": 5.333326767743263e-05,
+      "loss": 1.4743,
+      "step": 370
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.26094669103622437,
+      "learning_rate": 5.282884480649435e-05,
+      "loss": 1.1046,
+      "step": 371
+    },
+    {
+      "epoch": 0.6642857142857143,
+      "grad_norm": 0.2818247079849243,
+      "learning_rate": 5.232596097481251e-05,
+      "loss": 0.8417,
+      "step": 372
+    },
+    {
+      "epoch": 0.6660714285714285,
+      "grad_norm": 0.3089035749435425,
+      "learning_rate": 5.182463258982846e-05,
+      "loss": 1.3922,
+      "step": 373
+    },
+    {
+      "epoch": 0.6678571428571428,
+      "grad_norm": 0.2375502586364746,
+      "learning_rate": 5.132487600823438e-05,
+      "loss": 1.0855,
+      "step": 374
+    },
+    {
+      "epoch": 0.6696428571428571,
+      "grad_norm": 0.3417636454105377,
+      "learning_rate": 5.082670753543961e-05,
+      "loss": 1.0819,
+      "step": 375
+    },
+    {
+      "epoch": 0.6714285714285714,
+      "grad_norm": 0.2587840259075165,
+      "learning_rate": 5.033014342503889e-05,
+      "loss": 1.1154,
+      "step": 376
+    },
+    {
+      "epoch": 0.6732142857142858,
+      "grad_norm": 0.29829278588294983,
+      "learning_rate": 4.9835199878281765e-05,
+      "loss": 0.9634,
+      "step": 377
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.307190477848053,
+      "learning_rate": 4.9341893043544185e-05,
+      "loss": 1.1533,
+      "step": 378
+    },
+    {
+      "epoch": 0.6767857142857143,
+      "grad_norm": 0.3548847734928131,
+      "learning_rate": 4.8850239015801625e-05,
+      "loss": 1.2046,
+      "step": 379
+    },
+    {
+      "epoch": 0.6785714285714286,
+      "grad_norm": 0.3130282759666443,
+      "learning_rate": 4.836025383610382e-05,
+      "loss": 1.1391,
+      "step": 380
+    },
+    {
+      "epoch": 0.6803571428571429,
+      "grad_norm": 0.3400501012802124,
+      "learning_rate": 4.787195349105159e-05,
+      "loss": 1.0226,
+      "step": 381
+    },
+    {
+      "epoch": 0.6821428571428572,
+      "grad_norm": 0.3462565541267395,
+      "learning_rate": 4.7385353912275165e-05,
+      "loss": 1.0968,
+      "step": 382
+    },
+    {
+      "epoch": 0.6839285714285714,
+      "grad_norm": 0.331123948097229,
+      "learning_rate": 4.690047097591427e-05,
+      "loss": 1.1918,
+      "step": 383
+    },
+    {
+      "epoch": 0.6857142857142857,
+      "grad_norm": 0.354432612657547,
+      "learning_rate": 4.6417320502100316e-05,
+      "loss": 1.2261,
+      "step": 384
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.34844398498535156,
+      "learning_rate": 4.593591825444028e-05,
+      "loss": 0.7991,
+      "step": 385
+    },
+    {
+      "epoch": 0.6892857142857143,
+      "grad_norm": 0.3462367653846741,
+      "learning_rate": 4.545627993950201e-05,
+      "loss": 1.1343,
+      "step": 386
+    },
+    {
+      "epoch": 0.6910714285714286,
+      "grad_norm": 0.3352709114551544,
+      "learning_rate": 4.497842120630229e-05,
+      "loss": 1.1023,
+      "step": 387
+    },
+    {
+      "epoch": 0.6928571428571428,
+      "grad_norm": 0.3581717610359192,
+      "learning_rate": 4.4502357645795976e-05,
+      "loss": 0.9181,
+      "step": 388
+    },
+    {
+      "epoch": 0.6946428571428571,
+      "grad_norm": 0.35995498299598694,
+      "learning_rate": 4.402810479036725e-05,
+      "loss": 1.3445,
+      "step": 389
+    },
+    {
+      "epoch": 0.6964285714285714,
+      "grad_norm": 0.372935950756073,
+      "learning_rate": 4.355567811332311e-05,
+      "loss": 1.0725,
+      "step": 390
+    },
+    {
+      "epoch": 0.6982142857142857,
+      "grad_norm": 0.3759123980998993,
+      "learning_rate": 4.30850930283882e-05,
+      "loss": 1.1824,
+      "step": 391
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.391770601272583,
+      "learning_rate": 4.2616364889202254e-05,
+      "loss": 1.3516,
+      "step": 392
+    },
+    {
+      "epoch": 0.7017857142857142,
+      "grad_norm": 0.3785625696182251,
+      "learning_rate": 4.214950898881892e-05,
+      "loss": 1.1624,
+      "step": 393
+    },
+    {
+      "epoch": 0.7035714285714286,
+      "grad_norm": 0.4284125864505768,
+      "learning_rate": 4.168454055920681e-05,
+      "loss": 1.1318,
+      "step": 394
+    },
+    {
+      "epoch": 0.7053571428571429,
+      "grad_norm": 0.391161173582077,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 1.4543,
+      "step": 395
+    },
+    {
+      "epoch": 0.7071428571428572,
+      "grad_norm": 0.3723802864551544,
+      "learning_rate": 4.0760326731766374e-05,
+      "loss": 1.4265,
+      "step": 396
+    },
+    {
+      "epoch": 0.7089285714285715,
+      "grad_norm": 0.4321235418319702,
+      "learning_rate": 4.030111148798775e-05,
+      "loss": 1.4523,
+      "step": 397
+    },
+    {
+      "epoch": 0.7107142857142857,
+      "grad_norm": 0.43963977694511414,
+      "learning_rate": 3.9843844022096135e-05,
+      "loss": 1.7426,
+      "step": 398
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.5444914698600769,
+      "learning_rate": 3.938853925322118e-05,
+      "loss": 1.8907,
+      "step": 399
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.9059175252914429,
+      "learning_rate": 3.893521203645618e-05,
+      "loss": 2.2147,
+      "step": 400
+    },
+    {
+      "epoch": 0.7160714285714286,
+      "grad_norm": 0.20250235497951508,
+      "learning_rate": 3.848387716237353e-05,
+      "loss": 1.7341,
+      "step": 401
+    },
+    {
+      "epoch": 0.7178571428571429,
+      "grad_norm": 0.1863853931427002,
+      "learning_rate": 3.8034549356541894e-05,
+      "loss": 1.6956,
+      "step": 402
+    },
+    {
+      "epoch": 0.7196428571428571,
+      "grad_norm": 0.19317291676998138,
+      "learning_rate": 3.7587243279046056e-05,
+      "loss": 1.7165,
+      "step": 403
+    },
+    {
+      "epoch": 0.7214285714285714,
+      "grad_norm": 0.21101966500282288,
+      "learning_rate": 3.714197352400849e-05,
+      "loss": 1.8306,
+      "step": 404
+    },
+    {
+      "epoch": 0.7232142857142857,
+      "grad_norm": 0.22385361790657043,
+      "learning_rate": 3.669875461911297e-05,
+      "loss": 1.7104,
+      "step": 405
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.22555914521217346,
+      "learning_rate": 3.6257601025131026e-05,
+      "loss": 1.5668,
+      "step": 406
+    },
+    {
+      "epoch": 0.7267857142857143,
+      "grad_norm": 0.21916812658309937,
+      "learning_rate": 3.581852713544983e-05,
+      "loss": 1.7827,
+      "step": 407
+    },
+    {
+      "epoch": 0.7285714285714285,
+      "grad_norm": 0.23447498679161072,
+      "learning_rate": 3.538154727560259e-05,
+      "loss": 1.8308,
+      "step": 408
+    },
+    {
+      "epoch": 0.7303571428571428,
+      "grad_norm": 0.21024593710899353,
+      "learning_rate": 3.494667570280132e-05,
+      "loss": 1.613,
+      "step": 409
+    },
+    {
+      "epoch": 0.7321428571428571,
+      "grad_norm": 0.23882578313350677,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 1.6853,
+      "step": 410
+    },
+    {
+      "epoch": 0.7339285714285714,
+      "grad_norm": 0.23162604868412018,
+      "learning_rate": 3.408331410278929e-05,
+      "loss": 1.7371,
+      "step": 411
+    },
+    {
+      "epoch": 0.7357142857142858,
+      "grad_norm": 0.23150567710399628,
+      "learning_rate": 3.3654852244220826e-05,
+      "loss": 1.7505,
+      "step": 412
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.23027552664279938,
+      "learning_rate": 3.322855500906373e-05,
+      "loss": 1.7128,
+      "step": 413
+    },
+    {
+      "epoch": 0.7392857142857143,
+      "grad_norm": 0.22426114976406097,
+      "learning_rate": 3.2804436305991214e-05,
+      "loss": 1.7721,
+      "step": 414
+    },
+    {
+      "epoch": 0.7410714285714286,
+      "grad_norm": 0.22792723774909973,
+      "learning_rate": 3.238250997259808e-05,
+      "loss": 1.7089,
+      "step": 415
+    },
+    {
+      "epoch": 0.7428571428571429,
+      "grad_norm": 0.2450588494539261,
+      "learning_rate": 3.196278977494934e-05,
+      "loss": 1.744,
+      "step": 416
+    },
+    {
+      "epoch": 0.7446428571428572,
+      "grad_norm": 0.2348526120185852,
+      "learning_rate": 3.154528940713113e-05,
+      "loss": 1.8496,
+      "step": 417
+    },
+    {
+      "epoch": 0.7464285714285714,
+      "grad_norm": 0.2519124746322632,
+      "learning_rate": 3.113002249080386e-05,
+      "loss": 1.76,
+      "step": 418
+    },
+    {
+      "epoch": 0.7482142857142857,
+      "grad_norm": 0.27859431505203247,
+      "learning_rate": 3.071700257475768e-05,
+      "loss": 1.6493,
+      "step": 419
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.2539427876472473,
+      "learning_rate": 3.030624313447067e-05,
+      "loss": 1.7619,
+      "step": 420
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.4375773668289185,
+      "eval_runtime": 13.3809,
+      "eval_samples_per_second": 17.637,
+      "eval_steps_per_second": 8.819,
+      "step": 420
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.3697781125559091e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null