cocolalala
/

asset-generation-sft-qlora

@@ -17,6 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
 # asset-generation-sft-qlora
 This model was trained from scratch on the generator dataset.
 ## Model description
@@ -40,15 +42,20 @@ The following hyperparameters were used during training:
 - eval_batch_size: 32
 - seed: 42
 - distributed_type: multi-GPU
 - gradient_accumulation_steps: 2
-- total_train_batch_size: 32
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 0
 ### Training results
 ### Framework versions

 # asset-generation-sft-qlora
 This model was trained from scratch on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.7983
 ## Model description
 - eval_batch_size: 32
 - seed: 42
 - distributed_type: multi-GPU
+- num_devices: 2
 - gradient_accumulation_steps: 2
+- total_train_batch_size: 64
+- total_eval_batch_size: 64
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.8288        | 1.0   | 5088 | 0.7983          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.8844339622641509,
-    "total_flos": 1.393076937424896e+19,
-    "train_loss": 0.0,
-    "train_runtime": 0.0091,
-    "train_samples": 100,
-    "train_samples_per_second": 0.0,
-    "train_steps_per_second": 0.0
 }

 {
+    "epoch": 1.0,
+    "total_flos": 1.5751056572484157e+19,
+    "train_loss": 0.09284130807192821,
+    "train_runtime": 20560.1048,
+    "train_samples": 1055292,
+    "train_samples_per_second": 15.837,
+    "train_steps_per_second": 0.247
 }

runs/May25_13-55-16_br1t43-s3-25/events.out.tfevents.1716645331.br1t43-s3-25.187086.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:779d91afd5197d0f286a91734150e54d762ff168e7a61a2096656d93bb1907a8
-size 26553

 version https://git-lfs.github.com/spec/v1
+oid sha256:346fafda02c3a931a24d9615d6f5811e351bbaabbefbf31491855a578e6ea4f9
+size 30765

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.8844339622641509,
-    "total_flos": 1.393076937424896e+19,
-    "train_loss": 0.0,
-    "train_runtime": 0.0091,
-    "train_samples": 100,
-    "train_samples_per_second": 0.0,
-    "train_steps_per_second": 0.0
 }

 {
+    "epoch": 1.0,
+    "total_flos": 1.5751056572484157e+19,
+    "train_loss": 0.09284130807192821,
+    "train_runtime": 20560.1048,
+    "train_samples": 1055292,
+    "train_samples_per_second": 15.837,
+    "train_steps_per_second": 0.247
 }

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.8844339622641509,
   "eval_steps": 500,
-  "global_step": 4500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -6316,19 +6316,846 @@
       "step": 4500
     },
     {
-      "epoch": 0.8844339622641509,
-      "step": 4500,
-      "total_flos": 1.393076937424896e+19,
-      "train_loss": 0.0,
-      "train_runtime": 0.0091,
-      "train_samples_per_second": 0.0,
-      "train_steps_per_second": 0.0
     }
   ],
   "logging_steps": 5,
-  "max_steps": 0,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 0,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -6342,7 +7169,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.393076937424896e+19,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 5088,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "step": 4500
     },
     {
+      "epoch": 0.8854166666666666,
+      "grad_norm": 0.43359375,
+      "learning_rate": 7.893462771773996e-06,
+      "loss": 0.782,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8863993710691824,
+      "grad_norm": 0.337890625,
+      "learning_rate": 7.760421092313152e-06,
+      "loss": 0.7891,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8873820754716981,
+      "grad_norm": 0.326171875,
+      "learning_rate": 7.628464876673202e-06,
+      "loss": 0.8201,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8883647798742138,
+      "grad_norm": 0.3203125,
+      "learning_rate": 7.497595677698388e-06,
+      "loss": 0.8031,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8893474842767296,
+      "grad_norm": 0.32421875,
+      "learning_rate": 7.3678150354410615e-06,
+      "loss": 0.8013,
+      "step": 4525
+    },
+    {
+      "epoch": 0.8903301886792453,
+      "grad_norm": 0.326171875,
+      "learning_rate": 7.239124477143578e-06,
+      "loss": 0.8075,
+      "step": 4530
+    },
+    {
+      "epoch": 0.891312893081761,
+      "grad_norm": 0.310546875,
+      "learning_rate": 7.111525517220308e-06,
+      "loss": 0.7919,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8922955974842768,
+      "grad_norm": 0.298828125,
+      "learning_rate": 6.985019657239867e-06,
+      "loss": 0.8074,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8932783018867925,
+      "grad_norm": 0.314453125,
+      "learning_rate": 6.859608385907379e-06,
+      "loss": 0.8009,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8942610062893082,
+      "grad_norm": 0.310546875,
+      "learning_rate": 6.735293179046975e-06,
+      "loss": 0.8081,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8952437106918238,
+      "grad_norm": 0.3046875,
+      "learning_rate": 6.612075499584458e-06,
+      "loss": 0.8067,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8962264150943396,
+      "grad_norm": 0.3125,
+      "learning_rate": 6.489956797530084e-06,
+      "loss": 0.811,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8972091194968553,
+      "grad_norm": 0.30078125,
+      "learning_rate": 6.368938509961398e-06,
+      "loss": 0.7966,
+      "step": 4565
+    },
+    {
+      "epoch": 0.898191823899371,
+      "grad_norm": 0.328125,
+      "learning_rate": 6.2490220610065155e-06,
+      "loss": 0.8123,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8991745283018868,
+      "grad_norm": 0.30859375,
+      "learning_rate": 6.130208861827202e-06,
+      "loss": 0.8045,
+      "step": 4575
+    },
+    {
+      "epoch": 0.9001572327044025,
+      "grad_norm": 0.302734375,
+      "learning_rate": 6.012500310602254e-06,
+      "loss": 0.7923,
+      "step": 4580
+    },
+    {
+      "epoch": 0.9011399371069182,
+      "grad_norm": 0.30859375,
+      "learning_rate": 5.8958977925112405e-06,
+      "loss": 0.7986,
+      "step": 4585
+    },
+    {
+      "epoch": 0.902122641509434,
+      "grad_norm": 0.322265625,
+      "learning_rate": 5.780402679717989e-06,
+      "loss": 0.8166,
+      "step": 4590
+    },
+    {
+      "epoch": 0.9031053459119497,
+      "grad_norm": 0.298828125,
+      "learning_rate": 5.666016331354485e-06,
+      "loss": 0.7845,
+      "step": 4595
+    },
+    {
+      "epoch": 0.9040880503144654,
+      "grad_norm": 0.330078125,
+      "learning_rate": 5.552740093505015e-06,
+      "loss": 0.7865,
+      "step": 4600
+    },
+    {
+      "epoch": 0.9050707547169812,
+      "grad_norm": 0.30859375,
+      "learning_rate": 5.440575299190165e-06,
+      "loss": 0.8243,
+      "step": 4605
+    },
+    {
+      "epoch": 0.9060534591194969,
+      "grad_norm": 0.31640625,
+      "learning_rate": 5.329523268351155e-06,
+      "loss": 0.8041,
+      "step": 4610
+    },
+    {
+      "epoch": 0.9070361635220126,
+      "grad_norm": 0.310546875,
+      "learning_rate": 5.219585307834407e-06,
+      "loss": 0.8057,
+      "step": 4615
+    },
+    {
+      "epoch": 0.9080188679245284,
+      "grad_norm": 0.294921875,
+      "learning_rate": 5.110762711376116e-06,
+      "loss": 0.7987,
+      "step": 4620
+    },
+    {
+      "epoch": 0.909001572327044,
+      "grad_norm": 0.326171875,
+      "learning_rate": 5.003056759586944e-06,
+      "loss": 0.7983,
+      "step": 4625
+    },
+    {
+      "epoch": 0.9099842767295597,
+      "grad_norm": 0.3203125,
+      "learning_rate": 4.89646871993703e-06,
+      "loss": 0.7872,
+      "step": 4630
+    },
+    {
+      "epoch": 0.9109669811320755,
+      "grad_norm": 0.31640625,
+      "learning_rate": 4.79099984674114e-06,
+      "loss": 0.8203,
+      "step": 4635
+    },
+    {
+      "epoch": 0.9119496855345912,
+      "grad_norm": 0.318359375,
+      "learning_rate": 4.6866513811437475e-06,
+      "loss": 0.7816,
+      "step": 4640
+    },
+    {
+      "epoch": 0.9129323899371069,
+      "grad_norm": 0.30078125,
+      "learning_rate": 4.58342455110452e-06,
+      "loss": 0.8151,
+      "step": 4645
+    },
+    {
+      "epoch": 0.9139150943396226,
+      "grad_norm": 0.29296875,
+      "learning_rate": 4.481320571383907e-06,
+      "loss": 0.8052,
+      "step": 4650
+    },
+    {
+      "epoch": 0.9148977987421384,
+      "grad_norm": 0.31640625,
+      "learning_rate": 4.380340643528735e-06,
+      "loss": 0.8069,
+      "step": 4655
+    },
+    {
+      "epoch": 0.9158805031446541,
+      "grad_norm": 0.328125,
+      "learning_rate": 4.280485955858171e-06,
+      "loss": 0.7986,
+      "step": 4660
+    },
+    {
+      "epoch": 0.9168632075471698,
+      "grad_norm": 0.310546875,
+      "learning_rate": 4.181757683449694e-06,
+      "loss": 0.8219,
+      "step": 4665
+    },
+    {
+      "epoch": 0.9178459119496856,
+      "grad_norm": 0.30859375,
+      "learning_rate": 4.084156988125231e-06,
+      "loss": 0.8162,
+      "step": 4670
+    },
+    {
+      "epoch": 0.9188286163522013,
+      "grad_norm": 0.314453125,
+      "learning_rate": 3.987685018437581e-06,
+      "loss": 0.7972,
+      "step": 4675
+    },
+    {
+      "epoch": 0.9198113207547169,
+      "grad_norm": 0.30859375,
+      "learning_rate": 3.892342909656776e-06,
+      "loss": 0.8163,
+      "step": 4680
+    },
+    {
+      "epoch": 0.9207940251572327,
+      "grad_norm": 0.310546875,
+      "learning_rate": 3.798131783756853e-06,
+      "loss": 0.8151,
+      "step": 4685
+    },
+    {
+      "epoch": 0.9217767295597484,
+      "grad_norm": 0.310546875,
+      "learning_rate": 3.7050527494025265e-06,
+      "loss": 0.8023,
+      "step": 4690
+    },
+    {
+      "epoch": 0.9227594339622641,
+      "grad_norm": 0.322265625,
+      "learning_rate": 3.6131069019362362e-06,
+      "loss": 0.8229,
+      "step": 4695
+    },
+    {
+      "epoch": 0.9237421383647799,
+      "grad_norm": 0.302734375,
+      "learning_rate": 3.52229532336521e-06,
+      "loss": 0.7951,
+      "step": 4700
+    },
+    {
+      "epoch": 0.9247248427672956,
+      "grad_norm": 0.314453125,
+      "learning_rate": 3.4326190823487315e-06,
+      "loss": 0.8034,
+      "step": 4705
+    },
+    {
+      "epoch": 0.9257075471698113,
+      "grad_norm": 0.30859375,
+      "learning_rate": 3.344079234185604e-06,
+      "loss": 0.807,
+      "step": 4710
+    },
+    {
+      "epoch": 0.9266902515723271,
+      "grad_norm": 0.306640625,
+      "learning_rate": 3.2566768208016297e-06,
+      "loss": 0.8122,
+      "step": 4715
+    },
+    {
+      "epoch": 0.9276729559748428,
+      "grad_norm": 0.30859375,
+      "learning_rate": 3.170412870737516e-06,
+      "loss": 0.8023,
+      "step": 4720
+    },
+    {
+      "epoch": 0.9286556603773585,
+      "grad_norm": 0.3046875,
+      "learning_rate": 3.0852883991366322e-06,
+      "loss": 0.7757,
+      "step": 4725
+    },
+    {
+      "epoch": 0.9296383647798742,
+      "grad_norm": 0.306640625,
+      "learning_rate": 3.0013044077330744e-06,
+      "loss": 0.7709,
+      "step": 4730
+    },
+    {
+      "epoch": 0.93062106918239,
+      "grad_norm": 0.322265625,
+      "learning_rate": 2.9184618848399627e-06,
+      "loss": 0.8331,
+      "step": 4735
+    },
+    {
+      "epoch": 0.9316037735849056,
+      "grad_norm": 0.3125,
+      "learning_rate": 2.836761805337762e-06,
+      "loss": 0.7819,
+      "step": 4740
+    },
+    {
+      "epoch": 0.9325864779874213,
+      "grad_norm": 0.33984375,
+      "learning_rate": 2.756205130662737e-06,
+      "loss": 0.7949,
+      "step": 4745
+    },
+    {
+      "epoch": 0.9335691823899371,
+      "grad_norm": 0.31640625,
+      "learning_rate": 2.6767928087957693e-06,
+      "loss": 0.8147,
+      "step": 4750
+    },
+    {
+      "epoch": 0.9345518867924528,
+      "grad_norm": 0.30078125,
+      "learning_rate": 2.598525774251159e-06,
+      "loss": 0.7786,
+      "step": 4755
+    },
+    {
+      "epoch": 0.9355345911949685,
+      "grad_norm": 0.302734375,
+      "learning_rate": 2.52140494806552e-06,
+      "loss": 0.7954,
+      "step": 4760
+    },
+    {
+      "epoch": 0.9365172955974843,
+      "grad_norm": 0.30859375,
+      "learning_rate": 2.44543123778711e-06,
+      "loss": 0.7851,
+      "step": 4765
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 0.3046875,
+      "learning_rate": 2.370605537465065e-06,
+      "loss": 0.81,
+      "step": 4770
+    },
+    {
+      "epoch": 0.9384827044025157,
+      "grad_norm": 0.302734375,
+      "learning_rate": 2.296928727638814e-06,
+      "loss": 0.8305,
+      "step": 4775
+    },
+    {
+      "epoch": 0.9394654088050315,
+      "grad_norm": 0.3046875,
+      "learning_rate": 2.2244016753278586e-06,
+      "loss": 0.7896,
+      "step": 4780
+    },
+    {
+      "epoch": 0.9404481132075472,
+      "grad_norm": 0.3046875,
+      "learning_rate": 2.1530252340214996e-06,
+      "loss": 0.8101,
+      "step": 4785
+    },
+    {
+      "epoch": 0.9414308176100629,
+      "grad_norm": 0.31640625,
+      "learning_rate": 2.0828002436687257e-06,
+      "loss": 0.805,
+      "step": 4790
+    },
+    {
+      "epoch": 0.9424135220125787,
+      "grad_norm": 0.310546875,
+      "learning_rate": 2.013727530668452e-06,
+      "loss": 0.804,
+      "step": 4795
+    },
+    {
+      "epoch": 0.9433962264150944,
+      "grad_norm": 0.314453125,
+      "learning_rate": 1.9458079078597203e-06,
+      "loss": 0.825,
+      "step": 4800
+    },
+    {
+      "epoch": 0.94437893081761,
+      "grad_norm": 0.3046875,
+      "learning_rate": 1.8790421745121356e-06,
+      "loss": 0.821,
+      "step": 4805
+    },
+    {
+      "epoch": 0.9453616352201258,
+      "grad_norm": 0.310546875,
+      "learning_rate": 1.813431116316522e-06,
+      "loss": 0.8101,
+      "step": 4810
+    },
+    {
+      "epoch": 0.9463443396226415,
+      "grad_norm": 0.30859375,
+      "learning_rate": 1.748975505375583e-06,
+      "loss": 0.8016,
+      "step": 4815
+    },
+    {
+      "epoch": 0.9473270440251572,
+      "grad_norm": 0.296875,
+      "learning_rate": 1.6856761001948772e-06,
+      "loss": 0.7847,
+      "step": 4820
+    },
+    {
+      "epoch": 0.9483097484276729,
+      "grad_norm": 0.3203125,
+      "learning_rate": 1.6235336456739026e-06,
+      "loss": 0.8007,
+      "step": 4825
+    },
+    {
+      "epoch": 0.9492924528301887,
+      "grad_norm": 0.310546875,
+      "learning_rate": 1.5625488730972693e-06,
+      "loss": 0.7891,
+      "step": 4830
+    },
+    {
+      "epoch": 0.9502751572327044,
+      "grad_norm": 0.30859375,
+      "learning_rate": 1.5027225001261525e-06,
+      "loss": 0.8244,
+      "step": 4835
+    },
+    {
+      "epoch": 0.9512578616352201,
+      "grad_norm": 0.298828125,
+      "learning_rate": 1.4440552307898202e-06,
+      "loss": 0.7962,
+      "step": 4840
+    },
+    {
+      "epoch": 0.9522405660377359,
+      "grad_norm": 0.306640625,
+      "learning_rate": 1.386547755477363e-06,
+      "loss": 0.7982,
+      "step": 4845
+    },
+    {
+      "epoch": 0.9532232704402516,
+      "grad_norm": 0.318359375,
+      "learning_rate": 1.3302007509295445e-06,
+      "loss": 0.7896,
+      "step": 4850
+    },
+    {
+      "epoch": 0.9542059748427673,
+      "grad_norm": 0.310546875,
+      "learning_rate": 1.2750148802308737e-06,
+      "loss": 0.8158,
+      "step": 4855
+    },
+    {
+      "epoch": 0.9551886792452831,
+      "grad_norm": 0.3125,
+      "learning_rate": 1.2209907928017795e-06,
+      "loss": 0.8012,
+      "step": 4860
+    },
+    {
+      "epoch": 0.9561713836477987,
+      "grad_norm": 0.310546875,
+      "learning_rate": 1.1681291243909153e-06,
+      "loss": 0.8146,
+      "step": 4865
+    },
+    {
+      "epoch": 0.9571540880503144,
+      "grad_norm": 0.330078125,
+      "learning_rate": 1.116430497067833e-06,
+      "loss": 0.8175,
+      "step": 4870
+    },
+    {
+      "epoch": 0.9581367924528302,
+      "grad_norm": 0.3125,
+      "learning_rate": 1.0658955192154763e-06,
+      "loss": 0.7937,
+      "step": 4875
+    },
+    {
+      "epoch": 0.9591194968553459,
+      "grad_norm": 0.3125,
+      "learning_rate": 1.0165247855231542e-06,
+      "loss": 0.8,
+      "step": 4880
+    },
+    {
+      "epoch": 0.9601022012578616,
+      "grad_norm": 0.314453125,
+      "learning_rate": 9.683188769794792e-07,
+      "loss": 0.8042,
+      "step": 4885
+    },
+    {
+      "epoch": 0.9610849056603774,
+      "grad_norm": 0.298828125,
+      "learning_rate": 9.212783608655518e-07,
+      "loss": 0.8078,
+      "step": 4890
+    },
+    {
+      "epoch": 0.9620676100628931,
+      "grad_norm": 0.31640625,
+      "learning_rate": 8.754037907482748e-07,
+      "loss": 0.7992,
+      "step": 4895
+    },
+    {
+      "epoch": 0.9630503144654088,
+      "grad_norm": 0.306640625,
+      "learning_rate": 8.306957064738385e-07,
+      "loss": 0.806,
+      "step": 4900
+    },
+    {
+      "epoch": 0.9640330188679245,
+      "grad_norm": 0.31640625,
+      "learning_rate": 7.871546341614023e-07,
+      "loss": 0.7803,
+      "step": 4905
+    },
+    {
+      "epoch": 0.9650157232704403,
+      "grad_norm": 0.3046875,
+      "learning_rate": 7.447810861968552e-07,
+      "loss": 0.7864,
+      "step": 4910
+    },
+    {
+      "epoch": 0.965998427672956,
+      "grad_norm": 0.30859375,
+      "learning_rate": 7.03575561226788e-07,
+      "loss": 0.7837,
+      "step": 4915
+    },
+    {
+      "epoch": 0.9669811320754716,
+      "grad_norm": 0.302734375,
+      "learning_rate": 6.635385441526754e-07,
+      "loss": 0.7935,
+      "step": 4920
+    },
+    {
+      "epoch": 0.9679638364779874,
+      "grad_norm": 0.314453125,
+      "learning_rate": 6.246705061251245e-07,
+      "loss": 0.8074,
+      "step": 4925
+    },
+    {
+      "epoch": 0.9689465408805031,
+      "grad_norm": 0.298828125,
+      "learning_rate": 5.86971904538347e-07,
+      "loss": 0.8082,
+      "step": 4930
+    },
+    {
+      "epoch": 0.9699292452830188,
+      "grad_norm": 0.3125,
+      "learning_rate": 5.504431830247514e-07,
+      "loss": 0.7889,
+      "step": 4935
+    },
+    {
+      "epoch": 0.9709119496855346,
+      "grad_norm": 0.306640625,
+      "learning_rate": 5.150847714497697e-07,
+      "loss": 0.7924,
+      "step": 4940
+    },
+    {
+      "epoch": 0.9718946540880503,
+      "grad_norm": 0.296875,
+      "learning_rate": 4.80897085906773e-07,
+      "loss": 0.81,
+      "step": 4945
+    },
+    {
+      "epoch": 0.972877358490566,
+      "grad_norm": 0.294921875,
+      "learning_rate": 4.4788052871215234e-07,
+      "loss": 0.805,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9738600628930818,
+      "grad_norm": 0.30078125,
+      "learning_rate": 4.1603548840062345e-07,
+      "loss": 0.8101,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9748427672955975,
+      "grad_norm": 0.3046875,
+      "learning_rate": 3.853623397206407e-07,
+      "loss": 0.7909,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9758254716981132,
+      "grad_norm": 0.302734375,
+      "learning_rate": 3.5586144362997896e-07,
+      "loss": 0.7972,
+      "step": 4965
+    },
+    {
+      "epoch": 0.976808176100629,
+      "grad_norm": 0.314453125,
+      "learning_rate": 3.275331472914922e-07,
+      "loss": 0.8101,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9777908805031447,
+      "grad_norm": 0.3125,
+      "learning_rate": 3.0037778406902805e-07,
+      "loss": 0.8184,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9787735849056604,
+      "grad_norm": 0.3125,
+      "learning_rate": 2.743956735234865e-07,
+      "loss": 0.782,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9797562893081762,
+      "grad_norm": 0.322265625,
+      "learning_rate": 2.4958712140911166e-07,
+      "loss": 0.7905,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9807389937106918,
+      "grad_norm": 0.310546875,
+      "learning_rate": 2.2595241966982817e-07,
+      "loss": 0.8163,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9817216981132075,
+      "grad_norm": 0.3125,
+      "learning_rate": 2.0349184643586595e-07,
+      "loss": 0.8266,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9827044025157232,
+      "grad_norm": 0.30859375,
+      "learning_rate": 1.8220566602040745e-07,
+      "loss": 0.8174,
+      "step": 5000
+    },
+    {
+      "epoch": 0.983687106918239,
+      "grad_norm": 0.302734375,
+      "learning_rate": 1.6209412891659003e-07,
+      "loss": 0.8052,
+      "step": 5005
+    },
+    {
+      "epoch": 0.9846698113207547,
+      "grad_norm": 0.302734375,
+      "learning_rate": 1.4315747179446392e-07,
+      "loss": 0.7871,
+      "step": 5010
+    },
+    {
+      "epoch": 0.9856525157232704,
+      "grad_norm": 0.31640625,
+      "learning_rate": 1.2539591749821666e-07,
+      "loss": 0.7973,
+      "step": 5015
+    },
+    {
+      "epoch": 0.9866352201257862,
+      "grad_norm": 0.33203125,
+      "learning_rate": 1.088096750436085e-07,
+      "loss": 0.7972,
+      "step": 5020
+    },
+    {
+      "epoch": 0.9876179245283019,
+      "grad_norm": 0.31640625,
+      "learning_rate": 9.339893961548551e-08,
+      "loss": 0.8152,
+      "step": 5025
+    },
+    {
+      "epoch": 0.9886006289308176,
+      "grad_norm": 0.310546875,
+      "learning_rate": 7.916389256541479e-08,
+      "loss": 0.8147,
+      "step": 5030
+    },
+    {
+      "epoch": 0.9895833333333334,
+      "grad_norm": 0.302734375,
+      "learning_rate": 6.610470140967495e-08,
+      "loss": 0.81,
+      "step": 5035
+    },
+    {
+      "epoch": 0.9905660377358491,
+      "grad_norm": 0.310546875,
+      "learning_rate": 5.422151982719115e-08,
+      "loss": 0.8167,
+      "step": 5040
+    },
+    {
+      "epoch": 0.9915487421383647,
+      "grad_norm": 0.330078125,
+      "learning_rate": 4.351448765775867e-08,
+      "loss": 0.8175,
+      "step": 5045
+    },
+    {
+      "epoch": 0.9925314465408805,
+      "grad_norm": 0.310546875,
+      "learning_rate": 3.3983730900377655e-08,
+      "loss": 0.8009,
+      "step": 5050
+    },
+    {
+      "epoch": 0.9935141509433962,
+      "grad_norm": 0.30078125,
+      "learning_rate": 2.5629361711809742e-08,
+      "loss": 0.8025,
+      "step": 5055
+    },
+    {
+      "epoch": 0.9944968553459119,
+      "grad_norm": 0.30078125,
+      "learning_rate": 1.8451478405223653e-08,
+      "loss": 0.7953,
+      "step": 5060
+    },
+    {
+      "epoch": 0.9954795597484277,
+      "grad_norm": 0.314453125,
+      "learning_rate": 1.2450165449062744e-08,
+      "loss": 0.7893,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9964622641509434,
+      "grad_norm": 0.3046875,
+      "learning_rate": 7.62549346601249e-09,
+      "loss": 0.8113,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9974449685534591,
+      "grad_norm": 0.302734375,
+      "learning_rate": 3.977519232223337e-09,
+      "loss": 0.8174,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9984276729559748,
+      "grad_norm": 0.302734375,
+      "learning_rate": 1.5062856765779565e-09,
+      "loss": 0.8089,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9994103773584906,
+      "grad_norm": 0.3125,
+      "learning_rate": 2.118218802582561e-10,
+      "loss": 0.8288,
+      "step": 5085
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.7983009815216064,
+      "eval_runtime": 7962.7938,
+      "eval_samples_per_second": 10.22,
+      "eval_steps_per_second": 0.16,
+      "step": 5088
+    },
+    {
+      "epoch": 1.0,
+      "step": 5088,
+      "total_flos": 1.5751056572484157e+19,
+      "train_loss": 0.09284130807192821,
+      "train_runtime": 20560.1048,
+      "train_samples_per_second": 15.837,
+      "train_steps_per_second": 0.247
     }
   ],
   "logging_steps": 5,
+  "max_steps": 5088,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 1.5751056572484157e+19,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null