End of training

Browse files

Files changed (13) hide show

README.md +7 -2
all_results.json +15 -0
merges.txt +0 -0
preprocessor_config.json +28 -0
runs/Oct14_22-43-32_workload-ai-workshop/events.out.tfevents.1728946985.workload-ai-workshop.3947.1 +3 -0
special_tokens_map.json +15 -0
test_results.json +10 -0
tokenizer.json +0 -0
tokenizer_config.json +57 -0
train_results.json +11 -0
trainer_state.json +1265 -0
validation_results.json +10 -0
vocab.json +0 -0

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ base_model: BridgeTower/bridgetower-large-itm-mlm-itc
 tags:
 - generated_from_trainer
 datasets:
-- newyorker_caption_contest
 model-index:
 - name: bridgetower
   results: []
@@ -15,7 +15,12 @@ should probably proofread and complete it, then remove this comment. -->
 # bridgetower
-This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the newyorker_caption_contest dataset.
 ## Model description

 tags:
 - generated_from_trainer
 datasets:
+- jmhessel/newyorker_caption_contest
 model-index:
 - name: bridgetower
   results: []
 # bridgetower
+This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the `matching` configuration of the jmhessel/newyorker_caption_contest dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.1215
+- Memory Allocated (GB): 51.27
+- Max Memory Allocated (GB): 60.52
+- Total Memory Available (GB): 94.62
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "epoch": 5.0,
+    "eval_loss": 0.12152472138404846,
+    "eval_runtime": 3.7753,
+    "eval_samples_per_second": 141.222,
+    "eval_steps_per_second": 8.826,
+    "max_memory_allocated (GB)": 60.52,
+    "memory_allocated (GB)": 51.27,
+    "total_flos": 3.0598946525952e+16,
+    "total_memory_available (GB)": 94.62,
+    "train_loss": 0.06085505417415074,
+    "train_runtime": 1020.8061,
+    "train_samples_per_second": 55.51,
+    "train_steps_per_second": 1.389
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": null,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "BridgeTowerImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "max_text_len": 50,
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 294
+  },
+  "size_divisor": 32,
+  "tokenizer": "roberta-large",
+  "vocab_size": 50265
+}

runs/Oct14_22-43-32_workload-ai-workshop/events.out.tfevents.1728946985.workload-ai-workshop.3947.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:074b81b769f81184541b5745477cfc1ab677173c8e33b12714f97e43ea66a9fb
+size 998

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

test_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 5.0,
+    "eval_loss": 0.12152472138404846,
+    "eval_runtime": 3.7753,
+    "eval_samples_per_second": 141.222,
+    "eval_steps_per_second": 8.826,
+    "max_memory_allocated (GB)": 60.52,
+    "memory_allocated (GB)": 51.27,
+    "total_memory_available (GB)": 94.62
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "epoch": 5.0,
+    "max_memory_allocated (GB)": 60.52,
+    "memory_allocated (GB)": 50.57,
+    "total_flos": 3.0598946525952e+16,
+    "total_memory_available (GB)": 94.62,
+    "train_loss": 0.06085505417415074,
+    "train_runtime": 1020.8061,
+    "train_samples_per_second": 55.51,
+    "train_steps_per_second": 1.389
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1265 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 1225,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.04081632653061224,
+      "grad_norm": 9.589848518371582,
+      "learning_rate": 9.918367346938776e-06,
+      "loss": 0.2612,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 10,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.08163265306122448,
+      "grad_norm": 6.701302528381348,
+      "learning_rate": 9.836734693877552e-06,
+      "loss": 0.154,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 20,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.12244897959183673,
+      "grad_norm": 5.337311267852783,
+      "learning_rate": 9.755102040816327e-06,
+      "loss": 0.1235,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 30,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 4.5042338371276855,
+      "learning_rate": 9.673469387755103e-06,
+      "loss": 0.1096,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 40,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 4.461822032928467,
+      "learning_rate": 9.591836734693878e-06,
+      "loss": 0.1196,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 50,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.24489795918367346,
+      "grad_norm": 2.2825701236724854,
+      "learning_rate": 9.510204081632653e-06,
+      "loss": 0.0805,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 60,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 3.725268602371216,
+      "learning_rate": 9.42857142857143e-06,
+      "loss": 0.1026,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 70,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 1.707739233970642,
+      "learning_rate": 9.346938775510204e-06,
+      "loss": 0.1111,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 80,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.3673469387755102,
+      "grad_norm": 4.5863938331604,
+      "learning_rate": 9.26530612244898e-06,
+      "loss": 0.0856,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 90,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 11.972647666931152,
+      "learning_rate": 9.183673469387756e-06,
+      "loss": 0.0759,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 100,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.4489795918367347,
+      "grad_norm": 4.550654888153076,
+      "learning_rate": 9.102040816326532e-06,
+      "loss": 0.0717,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 110,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 4.418276786804199,
+      "learning_rate": 9.020408163265307e-06,
+      "loss": 0.0717,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 120,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "grad_norm": 1.651443600654602,
+      "learning_rate": 8.938775510204082e-06,
+      "loss": 0.0581,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 130,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.5251814126968384,
+      "learning_rate": 8.857142857142858e-06,
+      "loss": 0.0481,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 140,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.6122448979591837,
+      "grad_norm": 1.7455183267593384,
+      "learning_rate": 8.775510204081633e-06,
+      "loss": 0.0625,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 150,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 1.7588891983032227,
+      "learning_rate": 8.69387755102041e-06,
+      "loss": 0.0711,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 160,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.6938775510204082,
+      "grad_norm": 2.7675328254699707,
+      "learning_rate": 8.612244897959184e-06,
+      "loss": 0.0747,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 170,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.7346938775510204,
+      "grad_norm": 1.781469464302063,
+      "learning_rate": 8.530612244897961e-06,
+      "loss": 0.061,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 180,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.7755102040816326,
+      "grad_norm": 2.3728435039520264,
+      "learning_rate": 8.448979591836736e-06,
+      "loss": 0.0588,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 190,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.8711996674537659,
+      "learning_rate": 8.36734693877551e-06,
+      "loss": 0.062,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 200,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 1.1986733675003052,
+      "learning_rate": 8.285714285714287e-06,
+      "loss": 0.0627,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 210,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.8979591836734694,
+      "grad_norm": 2.8968520164489746,
+      "learning_rate": 8.204081632653062e-06,
+      "loss": 0.0604,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 220,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.9387755102040817,
+      "grad_norm": 0.8414793610572815,
+      "learning_rate": 8.122448979591837e-06,
+      "loss": 0.0559,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 230,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.7434167861938477,
+      "learning_rate": 8.040816326530613e-06,
+      "loss": 0.0498,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 240,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.8703041076660156,
+      "learning_rate": 7.959183673469388e-06,
+      "loss": 0.0618,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 250,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.0612244897959184,
+      "grad_norm": 1.0856379270553589,
+      "learning_rate": 7.877551020408164e-06,
+      "loss": 0.056,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 260,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.1020408163265305,
+      "grad_norm": 0.8847401142120361,
+      "learning_rate": 7.79591836734694e-06,
+      "loss": 0.0625,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 270,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 1.5929882526397705,
+      "learning_rate": 7.714285714285716e-06,
+      "loss": 0.0571,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 280,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.183673469387755,
+      "grad_norm": 0.8007532954216003,
+      "learning_rate": 7.63265306122449e-06,
+      "loss": 0.0511,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 290,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 1.2002859115600586,
+      "learning_rate": 7.551020408163265e-06,
+      "loss": 0.065,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 300,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.2653061224489797,
+      "grad_norm": 12.871713638305664,
+      "learning_rate": 7.469387755102041e-06,
+      "loss": 0.0664,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 310,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.306122448979592,
+      "grad_norm": 2.46173357963562,
+      "learning_rate": 7.387755102040817e-06,
+      "loss": 0.0495,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 320,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.346938775510204,
+      "grad_norm": 0.860598087310791,
+      "learning_rate": 7.306122448979592e-06,
+      "loss": 0.0603,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 330,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.3877551020408163,
+      "grad_norm": 2.5583598613739014,
+      "learning_rate": 7.224489795918368e-06,
+      "loss": 0.0547,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 340,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.37155964970588684,
+      "learning_rate": 7.1428571428571436e-06,
+      "loss": 0.048,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 350,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.469387755102041,
+      "grad_norm": 1.808316707611084,
+      "learning_rate": 7.061224489795919e-06,
+      "loss": 0.0462,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 360,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.510204081632653,
+      "grad_norm": 1.0183931589126587,
+      "learning_rate": 6.979591836734695e-06,
+      "loss": 0.0594,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 370,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.5510204081632653,
+      "grad_norm": 0.5249583721160889,
+      "learning_rate": 6.8979591836734705e-06,
+      "loss": 0.0479,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 380,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.5918367346938775,
+      "grad_norm": 1.1005572080612183,
+      "learning_rate": 6.816326530612245e-06,
+      "loss": 0.0649,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 390,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 0.6047573089599609,
+      "learning_rate": 6.734693877551021e-06,
+      "loss": 0.0607,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 400,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.6734693877551021,
+      "grad_norm": 0.7261654734611511,
+      "learning_rate": 6.653061224489797e-06,
+      "loss": 0.0606,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 410,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.848527193069458,
+      "learning_rate": 6.571428571428572e-06,
+      "loss": 0.0532,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 420,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.7551020408163265,
+      "grad_norm": 0.23483288288116455,
+      "learning_rate": 6.489795918367348e-06,
+      "loss": 0.068,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 430,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.7959183673469388,
+      "grad_norm": 2.0767459869384766,
+      "learning_rate": 6.408163265306124e-06,
+      "loss": 0.0617,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 440,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.836734693877551,
+      "grad_norm": 0.5654011368751526,
+      "learning_rate": 6.326530612244899e-06,
+      "loss": 0.044,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 450,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.8775510204081631,
+      "grad_norm": 0.7382919788360596,
+      "learning_rate": 6.244897959183675e-06,
+      "loss": 0.0537,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 460,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.9183673469387754,
+      "grad_norm": 1.3547204732894897,
+      "learning_rate": 6.163265306122449e-06,
+      "loss": 0.0432,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 470,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 1.9591836734693877,
+      "grad_norm": 0.19681082665920258,
+      "learning_rate": 6.0816326530612245e-06,
+      "loss": 0.0498,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 480,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.109737515449524,
+      "learning_rate": 6e-06,
+      "loss": 0.0639,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 490,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.0408163265306123,
+      "grad_norm": 0.5894625782966614,
+      "learning_rate": 5.918367346938776e-06,
+      "loss": 0.0593,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 500,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.0816326530612246,
+      "grad_norm": 0.7122555375099182,
+      "learning_rate": 5.8367346938775515e-06,
+      "loss": 0.0498,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 510,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.122448979591837,
+      "grad_norm": 0.8958902955055237,
+      "learning_rate": 5.755102040816327e-06,
+      "loss": 0.0457,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 520,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.163265306122449,
+      "grad_norm": 11.620415687561035,
+      "learning_rate": 5.673469387755103e-06,
+      "loss": 0.0626,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 530,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.204081632653061,
+      "grad_norm": 0.3538230061531067,
+      "learning_rate": 5.591836734693878e-06,
+      "loss": 0.0584,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 540,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.2448979591836733,
+      "grad_norm": 1.5313146114349365,
+      "learning_rate": 5.510204081632653e-06,
+      "loss": 0.0627,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 550,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 1.3519809246063232,
+      "learning_rate": 5.428571428571429e-06,
+      "loss": 0.0572,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 560,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.326530612244898,
+      "grad_norm": 1.0263270139694214,
+      "learning_rate": 5.3469387755102045e-06,
+      "loss": 0.0585,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 570,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.36734693877551,
+      "grad_norm": 0.8926671147346497,
+      "learning_rate": 5.26530612244898e-06,
+      "loss": 0.0673,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 580,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.4081632653061225,
+      "grad_norm": 0.3185974955558777,
+      "learning_rate": 5.183673469387756e-06,
+      "loss": 0.0537,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 590,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.4489795918367347,
+      "grad_norm": 0.944624662399292,
+      "learning_rate": 5.1020408163265315e-06,
+      "loss": 0.0442,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 600,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.489795918367347,
+      "grad_norm": 0.32796111702919006,
+      "learning_rate": 5.020408163265307e-06,
+      "loss": 0.0413,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 610,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.5306122448979593,
+      "grad_norm": 0.7929801940917969,
+      "learning_rate": 4.938775510204082e-06,
+      "loss": 0.0428,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 620,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.571428571428571,
+      "grad_norm": 0.910254955291748,
+      "learning_rate": 4.857142857142858e-06,
+      "loss": 0.0813,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 630,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.612244897959184,
+      "grad_norm": 1.101942539215088,
+      "learning_rate": 4.775510204081633e-06,
+      "loss": 0.0495,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 640,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.6530612244897958,
+      "grad_norm": 0.7182526588439941,
+      "learning_rate": 4.693877551020409e-06,
+      "loss": 0.0471,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 650,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.693877551020408,
+      "grad_norm": 0.8068158626556396,
+      "learning_rate": 4.612244897959184e-06,
+      "loss": 0.0469,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 660,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.7346938775510203,
+      "grad_norm": 1.2375913858413696,
+      "learning_rate": 4.530612244897959e-06,
+      "loss": 0.0857,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 670,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.7755102040816326,
+      "grad_norm": 1.1524357795715332,
+      "learning_rate": 4.448979591836735e-06,
+      "loss": 0.0488,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 680,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.816326530612245,
+      "grad_norm": 0.3913586437702179,
+      "learning_rate": 4.367346938775511e-06,
+      "loss": 0.0451,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 690,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.47935113310813904,
+      "learning_rate": 4.2857142857142855e-06,
+      "loss": 0.0433,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 700,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.8979591836734695,
+      "grad_norm": 0.8084143996238708,
+      "learning_rate": 4.204081632653061e-06,
+      "loss": 0.0548,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 710,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.938775510204082,
+      "grad_norm": 1.7315497398376465,
+      "learning_rate": 4.122448979591837e-06,
+      "loss": 0.0587,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 720,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 2.979591836734694,
+      "grad_norm": 0.20743349194526672,
+      "learning_rate": 4.040816326530612e-06,
+      "loss": 0.0342,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 730,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.020408163265306,
+      "grad_norm": 0.8024761080741882,
+      "learning_rate": 3.959183673469388e-06,
+      "loss": 0.053,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 740,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.061224489795918,
+      "grad_norm": 0.45326006412506104,
+      "learning_rate": 3.877551020408164e-06,
+      "loss": 0.0619,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 750,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.1020408163265305,
+      "grad_norm": 0.6953087449073792,
+      "learning_rate": 3.795918367346939e-06,
+      "loss": 0.0527,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 760,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.142857142857143,
+      "grad_norm": 1.2290390729904175,
+      "learning_rate": 3.7142857142857146e-06,
+      "loss": 0.0689,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 770,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.183673469387755,
+      "grad_norm": 0.6281890869140625,
+      "learning_rate": 3.6326530612244903e-06,
+      "loss": 0.0647,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 780,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.2244897959183674,
+      "grad_norm": 0.3096281588077545,
+      "learning_rate": 3.5510204081632655e-06,
+      "loss": 0.0522,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 790,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.2653061224489797,
+      "grad_norm": 0.9390127062797546,
+      "learning_rate": 3.469387755102041e-06,
+      "loss": 0.0432,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 800,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.306122448979592,
+      "grad_norm": 0.87565016746521,
+      "learning_rate": 3.3877551020408168e-06,
+      "loss": 0.0555,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 810,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.3469387755102042,
+      "grad_norm": 1.0797837972640991,
+      "learning_rate": 3.3061224489795924e-06,
+      "loss": 0.0455,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 820,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.387755102040816,
+      "grad_norm": 0.3658354878425598,
+      "learning_rate": 3.2244897959183672e-06,
+      "loss": 0.0487,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 830,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.4285714285714284,
+      "grad_norm": 0.4766336977481842,
+      "learning_rate": 3.142857142857143e-06,
+      "loss": 0.053,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 840,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.4693877551020407,
+      "grad_norm": 0.49318933486938477,
+      "learning_rate": 3.0612244897959185e-06,
+      "loss": 0.0812,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 850,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.510204081632653,
+      "grad_norm": 1.3475311994552612,
+      "learning_rate": 2.979591836734694e-06,
+      "loss": 0.0451,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 860,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.5510204081632653,
+      "grad_norm": 0.36763882637023926,
+      "learning_rate": 2.8979591836734694e-06,
+      "loss": 0.0646,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 870,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.5918367346938775,
+      "grad_norm": 3.085198402404785,
+      "learning_rate": 2.816326530612245e-06,
+      "loss": 0.0439,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 880,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.63265306122449,
+      "grad_norm": 0.17229312658309937,
+      "learning_rate": 2.7346938775510207e-06,
+      "loss": 0.0288,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 890,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.673469387755102,
+      "grad_norm": 1.0760900974273682,
+      "learning_rate": 2.6530612244897964e-06,
+      "loss": 0.0514,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 900,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.7142857142857144,
+      "grad_norm": 0.45855164527893066,
+      "learning_rate": 2.571428571428571e-06,
+      "loss": 0.0602,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 910,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.7551020408163263,
+      "grad_norm": 0.15575875341892242,
+      "learning_rate": 2.489795918367347e-06,
+      "loss": 0.0543,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 920,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.795918367346939,
+      "grad_norm": 0.779755175113678,
+      "learning_rate": 2.4081632653061225e-06,
+      "loss": 0.0497,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 930,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.836734693877551,
+      "grad_norm": 0.7307060956954956,
+      "learning_rate": 2.326530612244898e-06,
+      "loss": 0.0486,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 940,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.877551020408163,
+      "grad_norm": 1.062565803527832,
+      "learning_rate": 2.244897959183674e-06,
+      "loss": 0.0594,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 950,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.9183673469387754,
+      "grad_norm": 0.3031039535999298,
+      "learning_rate": 2.1632653061224495e-06,
+      "loss": 0.0497,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 960,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 3.9591836734693877,
+      "grad_norm": 2.310593843460083,
+      "learning_rate": 2.0816326530612247e-06,
+      "loss": 0.0746,
+      "max_memory_allocated (GB)": 57.18,
+      "memory_allocated (GB)": 50.57,
+      "step": 970,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.6998704075813293,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.0703,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 980,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.040816326530612,
+      "grad_norm": 0.7492395639419556,
+      "learning_rate": 1.9183673469387756e-06,
+      "loss": 0.0486,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 990,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.081632653061225,
+      "grad_norm": 0.7633445858955383,
+      "learning_rate": 1.8367346938775512e-06,
+      "loss": 0.0625,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1000,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.122448979591836,
+      "grad_norm": 0.6911561489105225,
+      "learning_rate": 1.7551020408163267e-06,
+      "loss": 0.0632,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1010,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.163265306122449,
+      "grad_norm": 0.33521902561187744,
+      "learning_rate": 1.6734693877551023e-06,
+      "loss": 0.0406,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1020,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.204081632653061,
+      "grad_norm": 0.7509037852287292,
+      "learning_rate": 1.5918367346938775e-06,
+      "loss": 0.0531,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1030,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.244897959183674,
+      "grad_norm": 0.5234070420265198,
+      "learning_rate": 1.5102040816326532e-06,
+      "loss": 0.0396,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1040,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 0.7997304797172546,
+      "learning_rate": 1.4285714285714286e-06,
+      "loss": 0.05,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1050,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.326530612244898,
+      "grad_norm": 0.2255077213048935,
+      "learning_rate": 1.3469387755102043e-06,
+      "loss": 0.0457,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1060,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.36734693877551,
+      "grad_norm": 0.5182124376296997,
+      "learning_rate": 1.2653061224489795e-06,
+      "loss": 0.0485,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1070,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.408163265306122,
+      "grad_norm": 0.35046374797821045,
+      "learning_rate": 1.1836734693877552e-06,
+      "loss": 0.0519,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1080,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.448979591836735,
+      "grad_norm": 0.3923434615135193,
+      "learning_rate": 1.1020408163265308e-06,
+      "loss": 0.0507,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1090,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.489795918367347,
+      "grad_norm": 0.23866137862205505,
+      "learning_rate": 1.0204081632653063e-06,
+      "loss": 0.0362,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1100,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.530612244897959,
+      "grad_norm": 0.15117916464805603,
+      "learning_rate": 9.387755102040817e-07,
+      "loss": 0.0464,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1110,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.571428571428571,
+      "grad_norm": 0.5993088483810425,
+      "learning_rate": 8.571428571428572e-07,
+      "loss": 0.0404,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1120,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.612244897959184,
+      "grad_norm": 0.30265432596206665,
+      "learning_rate": 7.755102040816327e-07,
+      "loss": 0.0545,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1130,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.653061224489796,
+      "grad_norm": 0.6385183334350586,
+      "learning_rate": 6.938775510204082e-07,
+      "loss": 0.0731,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1140,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.6938775510204085,
+      "grad_norm": 1.128566026687622,
+      "learning_rate": 6.122448979591837e-07,
+      "loss": 0.0516,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1150,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.73469387755102,
+      "grad_norm": 1.1660116910934448,
+      "learning_rate": 5.306122448979592e-07,
+      "loss": 0.0611,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1160,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.775510204081632,
+      "grad_norm": 0.5327439904212952,
+      "learning_rate": 4.489795918367347e-07,
+      "loss": 0.0549,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1170,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.816326530612245,
+      "grad_norm": 0.8764423131942749,
+      "learning_rate": 3.6734693877551025e-07,
+      "loss": 0.0441,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1180,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.857142857142857,
+      "grad_norm": 0.47835007309913635,
+      "learning_rate": 2.8571428571428575e-07,
+      "loss": 0.0541,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1190,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.8979591836734695,
+      "grad_norm": 1.048047661781311,
+      "learning_rate": 2.0408163265306121e-07,
+      "loss": 0.0731,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1200,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.938775510204081,
+      "grad_norm": 0.3101171851158142,
+      "learning_rate": 1.2244897959183673e-07,
+      "loss": 0.0648,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1210,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 4.979591836734694,
+      "grad_norm": 0.76802659034729,
+      "learning_rate": 4.0816326530612253e-08,
+      "loss": 0.0418,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1220,
+      "total_memory_available (GB)": 94.62
+    },
+    {
+      "epoch": 5.0,
+      "max_memory_allocated (GB)": 60.52,
+      "memory_allocated (GB)": 50.57,
+      "step": 1225,
+      "total_flos": 3.0598946525952e+16,
+      "total_memory_available (GB)": 94.62,
+      "train_loss": 0.06085505417415074,
+      "train_runtime": 1020.8061,
+      "train_samples_per_second": 55.51,
+      "train_steps_per_second": 1.389
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1225,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.0598946525952e+16,
+  "train_batch_size": 40,
+  "trial_name": null,
+  "trial_params": null
+}

validation_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 5.0,
+    "eval_loss": 0.1272137612104416,
+    "eval_runtime": 30.5176,
+    "eval_samples_per_second": 31.06,
+    "eval_steps_per_second": 1.994,
+    "max_memory_allocated (GB)": 60.52,
+    "memory_allocated (GB)": 51.27,
+    "total_memory_available (GB)": 94.62
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff