Training in progress, step 423, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state_0.pth +1 -1
last-checkpoint/rng_state_1.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +998 -3

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41c04db13401440bb120e3569a23dbda67cd78267d7c0b1c77f3d3b3cee4cdee
 size 101752088

 version https://git-lfs.github.com/spec/v1
+oid sha256:efedcd7712efe5df4242d40d0fc157567550dc57198de0fde11a067a253c3786
 size 101752088

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f1a032c7471714a5d4a253e904e854da99f9722e45c96bc0da82257681a15490
 size 203713238

 version https://git-lfs.github.com/spec/v1
+oid sha256:2a3093ef84d124bf4f3a388a3f58cedd89b5fbf3ec80a866e1189f65649a0f5e
 size 203713238

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:45144a3e80d33a7835b701c1b7b63faebde586b75158a47eb826cd0228136ec0
 size 14512

 version https://git-lfs.github.com/spec/v1
+oid sha256:9e52b4ddcd925a725a65812af6610fe4debc708c6e4fc1ee7e0e17160e2a6fc5
 size 14512

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:feb6925b0db33b6f02f0ccbd50be336d8d47178a933641d2c637051d854a6c60
 size 14512

 version https://git-lfs.github.com/spec/v1
+oid sha256:7d9aa8c4c4812086f9a0cd74c7d98dc727224f492c2c8deb8168a9fa04e2846e
 size 14512

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c28833a5c9fe2e108390575900c0ade8d470ff95484328f12052b199c28b6360
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:b2bb049f58262ac24b66ea8e4bbb35c588cda72b0f20c7495d16197e65e5d114
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.5007769145394007,
   "eval_steps": 141,
-  "global_step": 282,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2005,6 +2005,1001 @@
       "eval_samples_per_second": 5.989,
       "eval_steps_per_second": 1.502,
       "step": 282
     }
   ],
   "logging_steps": 1,
@@ -2024,7 +3019,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.7141566324945715e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.751165371809101,
   "eval_steps": 141,
+  "global_step": 423,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 5.989,
       "eval_steps_per_second": 1.502,
       "step": 282
+    },
+    {
+      "epoch": 0.502552719200888,
+      "grad_norm": 0.331061989068985,
+      "learning_rate": 0.00011434168642236964,
+      "loss": 0.8114,
+      "step": 283
+    },
+    {
+      "epoch": 0.5043285238623751,
+      "grad_norm": 0.3186919689178467,
+      "learning_rate": 0.00011373535578184082,
+      "loss": 0.7872,
+      "step": 284
+    },
+    {
+      "epoch": 0.5061043285238623,
+      "grad_norm": 0.3114188611507416,
+      "learning_rate": 0.00011312851002705383,
+      "loss": 0.7311,
+      "step": 285
+    },
+    {
+      "epoch": 0.5078801331853496,
+      "grad_norm": 0.3148879408836365,
+      "learning_rate": 0.00011252117191642175,
+      "loss": 0.7311,
+      "step": 286
+    },
+    {
+      "epoch": 0.5096559378468368,
+      "grad_norm": 0.3390887379646301,
+      "learning_rate": 0.00011191336422682237,
+      "loss": 0.7773,
+      "step": 287
+    },
+    {
+      "epoch": 0.5114317425083241,
+      "grad_norm": 0.31982842087745667,
+      "learning_rate": 0.00011130510975274409,
+      "loss": 0.7474,
+      "step": 288
+    },
+    {
+      "epoch": 0.5132075471698113,
+      "grad_norm": 0.31643104553222656,
+      "learning_rate": 0.00011069643130543084,
+      "loss": 0.7375,
+      "step": 289
+    },
+    {
+      "epoch": 0.5149833518312985,
+      "grad_norm": 0.33758479356765747,
+      "learning_rate": 0.00011008735171202684,
+      "loss": 0.7411,
+      "step": 290
+    },
+    {
+      "epoch": 0.5167591564927858,
+      "grad_norm": 0.324556440114975,
+      "learning_rate": 0.00010947789381472035,
+      "loss": 0.7235,
+      "step": 291
+    },
+    {
+      "epoch": 0.518534961154273,
+      "grad_norm": 0.3768496513366699,
+      "learning_rate": 0.00010886808046988717,
+      "loss": 0.7618,
+      "step": 292
+    },
+    {
+      "epoch": 0.5203107658157603,
+      "grad_norm": 0.34034618735313416,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 0.7426,
+      "step": 293
+    },
+    {
+      "epoch": 0.5220865704772475,
+      "grad_norm": 0.3409979045391083,
+      "learning_rate": 0.00010764747892893723,
+      "loss": 0.7327,
+      "step": 294
+    },
+    {
+      "epoch": 0.5238623751387348,
+      "grad_norm": 0.35839787125587463,
+      "learning_rate": 0.00010703673650879218,
+      "loss": 0.7057,
+      "step": 295
+    },
+    {
+      "epoch": 0.525638179800222,
+      "grad_norm": 0.3807874023914337,
+      "learning_rate": 0.00010642573019134703,
+      "loss": 0.7225,
+      "step": 296
+    },
+    {
+      "epoch": 0.5274139844617092,
+      "grad_norm": 0.4682140648365021,
+      "learning_rate": 0.00010581448289104758,
+      "loss": 0.715,
+      "step": 297
+    },
+    {
+      "epoch": 0.5291897891231965,
+      "grad_norm": 0.4261273145675659,
+      "learning_rate": 0.00010520301753137724,
+      "loss": 0.7239,
+      "step": 298
+    },
+    {
+      "epoch": 0.5309655937846837,
+      "grad_norm": 0.4854682981967926,
+      "learning_rate": 0.00010459135704399718,
+      "loss": 0.7304,
+      "step": 299
+    },
+    {
+      "epoch": 0.532741398446171,
+      "grad_norm": 0.6740989685058594,
+      "learning_rate": 0.00010397952436788642,
+      "loss": 0.8604,
+      "step": 300
+    },
+    {
+      "epoch": 0.5345172031076582,
+      "grad_norm": 0.2903907299041748,
+      "learning_rate": 0.00010336754244848157,
+      "loss": 0.8551,
+      "step": 301
+    },
+    {
+      "epoch": 0.5362930077691453,
+      "grad_norm": 0.28648582100868225,
+      "learning_rate": 0.00010275543423681621,
+      "loss": 0.7958,
+      "step": 302
+    },
+    {
+      "epoch": 0.5380688124306326,
+      "grad_norm": 0.33123767375946045,
+      "learning_rate": 0.00010214322268866032,
+      "loss": 0.7853,
+      "step": 303
+    },
+    {
+      "epoch": 0.5398446170921198,
+      "grad_norm": 0.31327784061431885,
+      "learning_rate": 0.00010153093076365923,
+      "loss": 0.7856,
+      "step": 304
+    },
+    {
+      "epoch": 0.5416204217536071,
+      "grad_norm": 0.3101854622364044,
+      "learning_rate": 0.00010091858142447265,
+      "loss": 0.7694,
+      "step": 305
+    },
+    {
+      "epoch": 0.5433962264150943,
+      "grad_norm": 0.3217926621437073,
+      "learning_rate": 0.00010030619763591347,
+      "loss": 0.7899,
+      "step": 306
+    },
+    {
+      "epoch": 0.5451720310765816,
+      "grad_norm": 0.33827194571495056,
+      "learning_rate": 9.969380236408656e-05,
+      "loss": 0.8088,
+      "step": 307
+    },
+    {
+      "epoch": 0.5469478357380688,
+      "grad_norm": 0.32632124423980713,
+      "learning_rate": 9.908141857552737e-05,
+      "loss": 0.769,
+      "step": 308
+    },
+    {
+      "epoch": 0.548723640399556,
+      "grad_norm": 0.3152617812156677,
+      "learning_rate": 9.846906923634079e-05,
+      "loss": 0.7804,
+      "step": 309
+    },
+    {
+      "epoch": 0.5504994450610433,
+      "grad_norm": 0.33337536454200745,
+      "learning_rate": 9.78567773113397e-05,
+      "loss": 0.7379,
+      "step": 310
+    },
+    {
+      "epoch": 0.5522752497225305,
+      "grad_norm": 0.3020349144935608,
+      "learning_rate": 9.724456576318381e-05,
+      "loss": 0.7146,
+      "step": 311
+    },
+    {
+      "epoch": 0.5540510543840178,
+      "grad_norm": 0.34656378626823425,
+      "learning_rate": 9.663245755151846e-05,
+      "loss": 0.7437,
+      "step": 312
+    },
+    {
+      "epoch": 0.555826859045505,
+      "grad_norm": 0.3417186737060547,
+      "learning_rate": 9.602047563211359e-05,
+      "loss": 0.7472,
+      "step": 313
+    },
+    {
+      "epoch": 0.5576026637069922,
+      "grad_norm": 0.34442222118377686,
+      "learning_rate": 9.540864295600283e-05,
+      "loss": 0.7426,
+      "step": 314
+    },
+    {
+      "epoch": 0.5593784683684795,
+      "grad_norm": 0.3521478772163391,
+      "learning_rate": 9.479698246862276e-05,
+      "loss": 0.7522,
+      "step": 315
+    },
+    {
+      "epoch": 0.5611542730299667,
+      "grad_norm": 0.3358227014541626,
+      "learning_rate": 9.418551710895243e-05,
+      "loss": 0.7454,
+      "step": 316
+    },
+    {
+      "epoch": 0.562930077691454,
+      "grad_norm": 0.343226820230484,
+      "learning_rate": 9.357426980865301e-05,
+      "loss": 0.7341,
+      "step": 317
+    },
+    {
+      "epoch": 0.5647058823529412,
+      "grad_norm": 0.3432699739933014,
+      "learning_rate": 9.296326349120785e-05,
+      "loss": 0.6836,
+      "step": 318
+    },
+    {
+      "epoch": 0.5664816870144284,
+      "grad_norm": 0.3710852265357971,
+      "learning_rate": 9.235252107106279e-05,
+      "loss": 0.6961,
+      "step": 319
+    },
+    {
+      "epoch": 0.5682574916759157,
+      "grad_norm": 0.351094514131546,
+      "learning_rate": 9.174206545276677e-05,
+      "loss": 0.6668,
+      "step": 320
+    },
+    {
+      "epoch": 0.5700332963374029,
+      "grad_norm": 0.4484163224697113,
+      "learning_rate": 9.113191953011287e-05,
+      "loss": 0.7427,
+      "step": 321
+    },
+    {
+      "epoch": 0.5718091009988902,
+      "grad_norm": 0.44636109471321106,
+      "learning_rate": 9.052210618527966e-05,
+      "loss": 0.8119,
+      "step": 322
+    },
+    {
+      "epoch": 0.5735849056603773,
+      "grad_norm": 0.43749314546585083,
+      "learning_rate": 8.991264828797319e-05,
+      "loss": 0.7846,
+      "step": 323
+    },
+    {
+      "epoch": 0.5753607103218646,
+      "grad_norm": 0.4471510350704193,
+      "learning_rate": 8.930356869456919e-05,
+      "loss": 0.7215,
+      "step": 324
+    },
+    {
+      "epoch": 0.5771365149833518,
+      "grad_norm": 0.5141078233718872,
+      "learning_rate": 8.869489024725595e-05,
+      "loss": 0.7492,
+      "step": 325
+    },
+    {
+      "epoch": 0.578912319644839,
+      "grad_norm": 0.2640296518802643,
+      "learning_rate": 8.808663577317764e-05,
+      "loss": 0.8625,
+      "step": 326
+    },
+    {
+      "epoch": 0.5806881243063263,
+      "grad_norm": 0.28867048025131226,
+      "learning_rate": 8.747882808357828e-05,
+      "loss": 0.8352,
+      "step": 327
+    },
+    {
+      "epoch": 0.5824639289678135,
+      "grad_norm": 0.2925030589103699,
+      "learning_rate": 8.687148997294621e-05,
+      "loss": 0.8091,
+      "step": 328
+    },
+    {
+      "epoch": 0.5842397336293008,
+      "grad_norm": 0.28383681178092957,
+      "learning_rate": 8.626464421815919e-05,
+      "loss": 0.784,
+      "step": 329
+    },
+    {
+      "epoch": 0.586015538290788,
+      "grad_norm": 0.3055633306503296,
+      "learning_rate": 8.565831357763039e-05,
+      "loss": 0.79,
+      "step": 330
+    },
+    {
+      "epoch": 0.5877913429522752,
+      "grad_norm": 0.30299943685531616,
+      "learning_rate": 8.505252079045458e-05,
+      "loss": 0.8105,
+      "step": 331
+    },
+    {
+      "epoch": 0.5895671476137625,
+      "grad_norm": 0.3154890239238739,
+      "learning_rate": 8.444728857555572e-05,
+      "loss": 0.7664,
+      "step": 332
+    },
+    {
+      "epoch": 0.5913429522752497,
+      "grad_norm": 0.31844133138656616,
+      "learning_rate": 8.384263963083453e-05,
+      "loss": 0.7709,
+      "step": 333
+    },
+    {
+      "epoch": 0.593118756936737,
+      "grad_norm": 0.31844353675842285,
+      "learning_rate": 8.323859663231768e-05,
+      "loss": 0.7426,
+      "step": 334
+    },
+    {
+      "epoch": 0.5948945615982242,
+      "grad_norm": 0.31527841091156006,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.7441,
+      "step": 335
+    },
+    {
+      "epoch": 0.5966703662597115,
+      "grad_norm": 0.32145699858665466,
+      "learning_rate": 8.203241906353014e-05,
+      "loss": 0.7333,
+      "step": 336
+    },
+    {
+      "epoch": 0.5984461709211987,
+      "grad_norm": 0.3175109922885895,
+      "learning_rate": 8.143032972829183e-05,
+      "loss": 0.7488,
+      "step": 337
+    },
+    {
+      "epoch": 0.6002219755826859,
+      "grad_norm": 0.3342651128768921,
+      "learning_rate": 8.082893680762619e-05,
+      "loss": 0.7265,
+      "step": 338
+    },
+    {
+      "epoch": 0.6019977802441732,
+      "grad_norm": 0.339743971824646,
+      "learning_rate": 8.022826285544968e-05,
+      "loss": 0.7005,
+      "step": 339
+    },
+    {
+      "epoch": 0.6037735849056604,
+      "grad_norm": 0.35757359862327576,
+      "learning_rate": 7.96283303987156e-05,
+      "loss": 0.7806,
+      "step": 340
+    },
+    {
+      "epoch": 0.6055493895671477,
+      "grad_norm": 0.4024328291416168,
+      "learning_rate": 7.902916193656898e-05,
+      "loss": 0.6895,
+      "step": 341
+    },
+    {
+      "epoch": 0.6073251942286348,
+      "grad_norm": 0.3628247380256653,
+      "learning_rate": 7.843077993950302e-05,
+      "loss": 0.7285,
+      "step": 342
+    },
+    {
+      "epoch": 0.609100998890122,
+      "grad_norm": 0.3793889582157135,
+      "learning_rate": 7.783320684851614e-05,
+      "loss": 0.729,
+      "step": 343
+    },
+    {
+      "epoch": 0.6108768035516093,
+      "grad_norm": 0.37614578008651733,
+      "learning_rate": 7.72364650742707e-05,
+      "loss": 0.6869,
+      "step": 344
+    },
+    {
+      "epoch": 0.6126526082130965,
+      "grad_norm": 0.3737132251262665,
+      "learning_rate": 7.664057699625214e-05,
+      "loss": 0.7373,
+      "step": 345
+    },
+    {
+      "epoch": 0.6144284128745838,
+      "grad_norm": 0.40523961186408997,
+      "learning_rate": 7.604556496193015e-05,
+      "loss": 0.729,
+      "step": 346
+    },
+    {
+      "epoch": 0.616204217536071,
+      "grad_norm": 0.3903469145298004,
+      "learning_rate": 7.54514512859201e-05,
+      "loss": 0.7063,
+      "step": 347
+    },
+    {
+      "epoch": 0.6179800221975583,
+      "grad_norm": 0.43782973289489746,
+      "learning_rate": 7.485825824914659e-05,
+      "loss": 0.6763,
+      "step": 348
+    },
+    {
+      "epoch": 0.6197558268590455,
+      "grad_norm": 0.4907206594944,
+      "learning_rate": 7.426600809800752e-05,
+      "loss": 0.7405,
+      "step": 349
+    },
+    {
+      "epoch": 0.6215316315205327,
+      "grad_norm": 0.5378274917602539,
+      "learning_rate": 7.36747230435401e-05,
+      "loss": 0.7417,
+      "step": 350
+    },
+    {
+      "epoch": 0.62330743618202,
+      "grad_norm": 0.266481876373291,
+      "learning_rate": 7.308442526058756e-05,
+      "loss": 0.8434,
+      "step": 351
+    },
+    {
+      "epoch": 0.6250832408435072,
+      "grad_norm": 0.28670433163642883,
+      "learning_rate": 7.249513688696786e-05,
+      "loss": 0.8049,
+      "step": 352
+    },
+    {
+      "epoch": 0.6268590455049945,
+      "grad_norm": 0.29961690306663513,
+      "learning_rate": 7.190688002264308e-05,
+      "loss": 0.762,
+      "step": 353
+    },
+    {
+      "epoch": 0.6286348501664817,
+      "grad_norm": 0.2873949706554413,
+      "learning_rate": 7.131967672889101e-05,
+      "loss": 0.7389,
+      "step": 354
+    },
+    {
+      "epoch": 0.6304106548279689,
+      "grad_norm": 0.3315136730670929,
+      "learning_rate": 7.073354902747741e-05,
+      "loss": 0.7719,
+      "step": 355
+    },
+    {
+      "epoch": 0.6321864594894562,
+      "grad_norm": 0.31057095527648926,
+      "learning_rate": 7.014851889983057e-05,
+      "loss": 0.7407,
+      "step": 356
+    },
+    {
+      "epoch": 0.6339622641509434,
+      "grad_norm": 0.345838725566864,
+      "learning_rate": 6.95646082862164e-05,
+      "loss": 0.7838,
+      "step": 357
+    },
+    {
+      "epoch": 0.6357380688124307,
+      "grad_norm": 0.31915196776390076,
+      "learning_rate": 6.898183908491617e-05,
+      "loss": 0.7591,
+      "step": 358
+    },
+    {
+      "epoch": 0.6375138734739179,
+      "grad_norm": 0.3124110698699951,
+      "learning_rate": 6.840023315140475e-05,
+      "loss": 0.7222,
+      "step": 359
+    },
+    {
+      "epoch": 0.6392896781354052,
+      "grad_norm": 0.3307512104511261,
+      "learning_rate": 6.781981229753145e-05,
+      "loss": 0.7472,
+      "step": 360
+    },
+    {
+      "epoch": 0.6410654827968923,
+      "grad_norm": 0.3425205945968628,
+      "learning_rate": 6.724059829070158e-05,
+      "loss": 0.764,
+      "step": 361
+    },
+    {
+      "epoch": 0.6428412874583795,
+      "grad_norm": 0.33861225843429565,
+      "learning_rate": 6.666261285306047e-05,
+      "loss": 0.7396,
+      "step": 362
+    },
+    {
+      "epoch": 0.6446170921198668,
+      "grad_norm": 0.3248923420906067,
+      "learning_rate": 6.608587766067852e-05,
+      "loss": 0.7158,
+      "step": 363
+    },
+    {
+      "epoch": 0.646392896781354,
+      "grad_norm": 0.349185049533844,
+      "learning_rate": 6.551041434273861e-05,
+      "loss": 0.7415,
+      "step": 364
+    },
+    {
+      "epoch": 0.6481687014428413,
+      "grad_norm": 0.33934569358825684,
+      "learning_rate": 6.493624448072457e-05,
+      "loss": 0.744,
+      "step": 365
+    },
+    {
+      "epoch": 0.6499445061043285,
+      "grad_norm": 0.3628052771091461,
+      "learning_rate": 6.43633896076122e-05,
+      "loss": 0.7328,
+      "step": 366
+    },
+    {
+      "epoch": 0.6517203107658157,
+      "grad_norm": 0.348979115486145,
+      "learning_rate": 6.379187120706138e-05,
+      "loss": 0.6755,
+      "step": 367
+    },
+    {
+      "epoch": 0.653496115427303,
+      "grad_norm": 0.38474076986312866,
+      "learning_rate": 6.322171071261071e-05,
+      "loss": 0.711,
+      "step": 368
+    },
+    {
+      "epoch": 0.6552719200887902,
+      "grad_norm": 0.34556257724761963,
+      "learning_rate": 6.26529295068733e-05,
+      "loss": 0.6995,
+      "step": 369
+    },
+    {
+      "epoch": 0.6570477247502775,
+      "grad_norm": 0.4337230622768402,
+      "learning_rate": 6.208554892073528e-05,
+      "loss": 0.7412,
+      "step": 370
+    },
+    {
+      "epoch": 0.6588235294117647,
+      "grad_norm": 0.37804853916168213,
+      "learning_rate": 6.151959023255545e-05,
+      "loss": 0.6724,
+      "step": 371
+    },
+    {
+      "epoch": 0.6605993340732519,
+      "grad_norm": 0.40870919823646545,
+      "learning_rate": 6.095507466736763e-05,
+      "loss": 0.7243,
+      "step": 372
+    },
+    {
+      "epoch": 0.6623751387347392,
+      "grad_norm": 0.45504140853881836,
+      "learning_rate": 6.039202339608432e-05,
+      "loss": 0.7373,
+      "step": 373
+    },
+    {
+      "epoch": 0.6641509433962264,
+      "grad_norm": 0.46973538398742676,
+      "learning_rate": 5.983045753470308e-05,
+      "loss": 0.7101,
+      "step": 374
+    },
+    {
+      "epoch": 0.6659267480577137,
+      "grad_norm": 0.5572993755340576,
+      "learning_rate": 5.927039814351426e-05,
+      "loss": 0.7393,
+      "step": 375
+    },
+    {
+      "epoch": 0.6677025527192009,
+      "grad_norm": 0.2691468596458435,
+      "learning_rate": 5.8711866226311553e-05,
+      "loss": 0.8102,
+      "step": 376
+    },
+    {
+      "epoch": 0.6694783573806882,
+      "grad_norm": 0.2898322641849518,
+      "learning_rate": 5.8154882729603876e-05,
+      "loss": 0.7968,
+      "step": 377
+    },
+    {
+      "epoch": 0.6712541620421754,
+      "grad_norm": 0.3048444092273712,
+      "learning_rate": 5.7599468541830356e-05,
+      "loss": 0.775,
+      "step": 378
+    },
+    {
+      "epoch": 0.6730299667036626,
+      "grad_norm": 0.3111611604690552,
+      "learning_rate": 5.7045644492576346e-05,
+      "loss": 0.7742,
+      "step": 379
+    },
+    {
+      "epoch": 0.6748057713651499,
+      "grad_norm": 0.31889772415161133,
+      "learning_rate": 5.64934313517927e-05,
+      "loss": 0.7304,
+      "step": 380
+    },
+    {
+      "epoch": 0.676581576026637,
+      "grad_norm": 0.3219664692878723,
+      "learning_rate": 5.5942849829016695e-05,
+      "loss": 0.7679,
+      "step": 381
+    },
+    {
+      "epoch": 0.6783573806881243,
+      "grad_norm": 0.30955034494400024,
+      "learning_rate": 5.5393920572595356e-05,
+      "loss": 0.7443,
+      "step": 382
+    },
+    {
+      "epoch": 0.6801331853496115,
+      "grad_norm": 0.344043105840683,
+      "learning_rate": 5.484666416891109e-05,
+      "loss": 0.7272,
+      "step": 383
+    },
+    {
+      "epoch": 0.6819089900110987,
+      "grad_norm": 0.33895599842071533,
+      "learning_rate": 5.430110114160964e-05,
+      "loss": 0.7585,
+      "step": 384
+    },
+    {
+      "epoch": 0.683684794672586,
+      "grad_norm": 0.37816834449768066,
+      "learning_rate": 5.375725195083046e-05,
+      "loss": 0.7749,
+      "step": 385
+    },
+    {
+      "epoch": 0.6854605993340732,
+      "grad_norm": 0.3477395176887512,
+      "learning_rate": 5.321513699243924e-05,
+      "loss": 0.7022,
+      "step": 386
+    },
+    {
+      "epoch": 0.6872364039955605,
+      "grad_norm": 0.3380398154258728,
+      "learning_rate": 5.2674776597263186e-05,
+      "loss": 0.7266,
+      "step": 387
+    },
+    {
+      "epoch": 0.6890122086570477,
+      "grad_norm": 0.35505762696266174,
+      "learning_rate": 5.2136191030328455e-05,
+      "loss": 0.7411,
+      "step": 388
+    },
+    {
+      "epoch": 0.690788013318535,
+      "grad_norm": 0.38739171624183655,
+      "learning_rate": 5.159940049010015e-05,
+      "loss": 0.7666,
+      "step": 389
+    },
+    {
+      "epoch": 0.6925638179800222,
+      "grad_norm": 0.38473132252693176,
+      "learning_rate": 5.106442510772489e-05,
+      "loss": 0.7038,
+      "step": 390
+    },
+    {
+      "epoch": 0.6943396226415094,
+      "grad_norm": 0.37635302543640137,
+      "learning_rate": 5.0531284946275784e-05,
+      "loss": 0.7488,
+      "step": 391
+    },
+    {
+      "epoch": 0.6961154273029967,
+      "grad_norm": 0.37422046065330505,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.693,
+      "step": 392
+    },
+    {
+      "epoch": 0.6978912319644839,
+      "grad_norm": 0.3987278342247009,
+      "learning_rate": 4.9470590193569044e-05,
+      "loss": 0.6965,
+      "step": 393
+    },
+    {
+      "epoch": 0.6996670366259712,
+      "grad_norm": 0.34372609853744507,
+      "learning_rate": 4.894307538133129e-05,
+      "loss": 0.6632,
+      "step": 394
+    },
+    {
+      "epoch": 0.7014428412874584,
+      "grad_norm": 0.4215118885040283,
+      "learning_rate": 4.841747534656763e-05,
+      "loss": 0.7081,
+      "step": 395
+    },
+    {
+      "epoch": 0.7032186459489456,
+      "grad_norm": 0.4211183488368988,
+      "learning_rate": 4.7893809800749403e-05,
+      "loss": 0.687,
+      "step": 396
+    },
+    {
+      "epoch": 0.7049944506104329,
+      "grad_norm": 0.44248080253601074,
+      "learning_rate": 4.737209838279922e-05,
+      "loss": 0.7118,
+      "step": 397
+    },
+    {
+      "epoch": 0.7067702552719201,
+      "grad_norm": 0.38100606203079224,
+      "learning_rate": 4.685236065835443e-05,
+      "loss": 0.6259,
+      "step": 398
+    },
+    {
+      "epoch": 0.7085460599334074,
+      "grad_norm": 0.46482354402542114,
+      "learning_rate": 4.6334616119033356e-05,
+      "loss": 0.6668,
+      "step": 399
+    },
+    {
+      "epoch": 0.7103218645948945,
+      "grad_norm": 0.5484885573387146,
+      "learning_rate": 4.5818884181704294e-05,
+      "loss": 0.7973,
+      "step": 400
+    },
+    {
+      "epoch": 0.7120976692563818,
+      "grad_norm": 0.2660059928894043,
+      "learning_rate": 4.530518418775733e-05,
+      "loss": 0.7845,
+      "step": 401
+    },
+    {
+      "epoch": 0.713873473917869,
+      "grad_norm": 0.30005505681037903,
+      "learning_rate": 4.479353540237903e-05,
+      "loss": 0.8141,
+      "step": 402
+    },
+    {
+      "epoch": 0.7156492785793562,
+      "grad_norm": 0.3031437397003174,
+      "learning_rate": 4.4283957013829846e-05,
+      "loss": 0.7505,
+      "step": 403
+    },
+    {
+      "epoch": 0.7174250832408435,
+      "grad_norm": 0.3152884542942047,
+      "learning_rate": 4.3776468132724604e-05,
+      "loss": 0.8191,
+      "step": 404
+    },
+    {
+      "epoch": 0.7192008879023307,
+      "grad_norm": 0.3122805058956146,
+      "learning_rate": 4.3271087791315734e-05,
+      "loss": 0.7732,
+      "step": 405
+    },
+    {
+      "epoch": 0.720976692563818,
+      "grad_norm": 0.3241139054298401,
+      "learning_rate": 4.276783494277954e-05,
+      "loss": 0.7652,
+      "step": 406
+    },
+    {
+      "epoch": 0.7227524972253052,
+      "grad_norm": 0.3523857295513153,
+      "learning_rate": 4.2266728460505375e-05,
+      "loss": 0.7923,
+      "step": 407
+    },
+    {
+      "epoch": 0.7245283018867924,
+      "grad_norm": 0.3518478274345398,
+      "learning_rate": 4.176778713738787e-05,
+      "loss": 0.8046,
+      "step": 408
+    },
+    {
+      "epoch": 0.7263041065482797,
+      "grad_norm": 0.35740435123443604,
+      "learning_rate": 4.127102968512214e-05,
+      "loss": 0.741,
+      "step": 409
+    },
+    {
+      "epoch": 0.7280799112097669,
+      "grad_norm": 0.3561273217201233,
+      "learning_rate": 4.077647473350201e-05,
+      "loss": 0.7304,
+      "step": 410
+    },
+    {
+      "epoch": 0.7298557158712542,
+      "grad_norm": 0.3595544397830963,
+      "learning_rate": 4.028414082972141e-05,
+      "loss": 0.7601,
+      "step": 411
+    },
+    {
+      "epoch": 0.7316315205327414,
+      "grad_norm": 0.38603028655052185,
+      "learning_rate": 3.97940464376787e-05,
+      "loss": 0.768,
+      "step": 412
+    },
+    {
+      "epoch": 0.7334073251942287,
+      "grad_norm": 0.347781240940094,
+      "learning_rate": 3.9306209937284346e-05,
+      "loss": 0.7255,
+      "step": 413
+    },
+    {
+      "epoch": 0.7351831298557159,
+      "grad_norm": 0.3760242462158203,
+      "learning_rate": 3.882064962377154e-05,
+      "loss": 0.7371,
+      "step": 414
+    },
+    {
+      "epoch": 0.7369589345172031,
+      "grad_norm": 0.359371542930603,
+      "learning_rate": 3.83373837070101e-05,
+      "loss": 0.7422,
+      "step": 415
+    },
+    {
+      "epoch": 0.7387347391786904,
+      "grad_norm": 0.3574449419975281,
+      "learning_rate": 3.7856430310823545e-05,
+      "loss": 0.6915,
+      "step": 416
+    },
+    {
+      "epoch": 0.7405105438401776,
+      "grad_norm": 0.3730245530605316,
+      "learning_rate": 3.737780747230941e-05,
+      "loss": 0.7309,
+      "step": 417
+    },
+    {
+      "epoch": 0.7422863485016649,
+      "grad_norm": 0.36496400833129883,
+      "learning_rate": 3.69015331411628e-05,
+      "loss": 0.7245,
+      "step": 418
+    },
+    {
+      "epoch": 0.744062153163152,
+      "grad_norm": 0.3593985140323639,
+      "learning_rate": 3.642762517900322e-05,
+      "loss": 0.6389,
+      "step": 419
+    },
+    {
+      "epoch": 0.7458379578246392,
+      "grad_norm": 0.3603939116001129,
+      "learning_rate": 3.595610135870472e-05,
+      "loss": 0.703,
+      "step": 420
+    },
+    {
+      "epoch": 0.7476137624861265,
+      "grad_norm": 0.397124320268631,
+      "learning_rate": 3.548697936372937e-05,
+      "loss": 0.7265,
+      "step": 421
+    },
+    {
+      "epoch": 0.7493895671476137,
+      "grad_norm": 0.4071907103061676,
+      "learning_rate": 3.5020276787464056e-05,
+      "loss": 0.6752,
+      "step": 422
+    },
+    {
+      "epoch": 0.751165371809101,
+      "grad_norm": 0.3834024965763092,
+      "learning_rate": 3.455601113256073e-05,
+      "loss": 0.6297,
+      "step": 423
+    },
+    {
+      "epoch": 0.751165371809101,
+      "eval_loss": 0.7374839186668396,
+      "eval_runtime": 156.6123,
+      "eval_samples_per_second": 6.06,
+      "eval_steps_per_second": 1.52,
+      "step": 423
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 5.571234948741857e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null