Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

README.md +64 -0
checkpoint-700/config.json +43 -0
checkpoint-700/generation_config.json +7 -0
checkpoint-700/merges.txt +0 -0
checkpoint-700/model.safetensors +3 -0
checkpoint-700/special_tokens_map.json +23 -0
checkpoint-700/tokenizer.json +0 -0
checkpoint-700/tokenizer_config.json +51 -0
checkpoint-700/trainer_state.json +1064 -0
checkpoint-700/training_args.bin +3 -0
checkpoint-700/vocab.json +0 -0
generation_config.json +7 -0
model.safetensors +1 -1

README.md ADDED Viewed

	@@ -0,0 +1,64 @@

+---
+tags:
+- generated_from_trainer
+metrics:
+- accuracy
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# griffin-1024-c3t-8layer-simple_wikipedia_LM-vN
+This model is a fine-tuned version of the local checkpoint `./griffin-1024-c3t-8layer` on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Loss: 4.1928
+- Accuracy: 0.4084
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 80085
+- gradient_accumulation_steps: 32
+- total_train_batch_size: 128
+- optimizer: Adam with betas=(0.9,0.99) and epsilon=1e-07
+- lr_scheduler_type: constant_with_warmup
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 2.0
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss | Accuracy |
+|:-------------:|:------:|:----:|:---------------:|:--------:|
+| 13.2525       | 0.2548 | 100  | 11.9768         | 0.0131   |
+| 8.8873        | 0.5095 | 200  | 8.0127          | 0.0357   |
+| 7.2457        | 0.7643 | 300  | 6.4508          | 0.0512   |
+| 6.3152        | 1.0190 | 400  | 5.6163          | 0.0460   |
+| 5.5586        | 1.2738 | 500  | 4.7645          | 0.3650   |
+| 5.2936        | 1.5285 | 600  | 4.3919          | 0.3934   |
+| 4.8839        | 1.7833 | 700  | 4.1928          | 0.4084   |
+### Framework versions
+- Transformers 4.40.1
+- Pytorch 2.2.0+cu121
+- Datasets 2.19.0
+- Tokenizers 0.19.1

checkpoint-700/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_block_types": [
+    "recurrent",
+    "recurrent",
+    "attention"
+  ],
+  "_name_or_path": "./griffin-1024-c3t-8layer",
+  "architectures": [
+    "RecurrentGemmaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_window_size": 2048,
+  "block_types": [
+    "recurrent",
+    "recurrent",
+    "attention"
+  ],
+  "bos_token_id": 0,
+  "conv1d_width": 4,
+  "embeddings_scale_by_sqrt_dim": true,
+  "eos_token_id": 0,
+  "final_w_init_variance_scale": 0.25,
+  "head_dim": 128,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 1024,
+  "intermediate_size": 6144,
+  "logits_soft_cap": 30.0,
+  "lru_width": 1024,
+  "model_type": "recurrent_gemma",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 2,
+  "pad_token_id": 0,
+  "partial_rotary_factor": 0.5,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
+  "use_cache": true,
+  "vocab_size": 65024,
+  "w_init_variance_scale": 0.01
+}

checkpoint-700/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.1"
+}

checkpoint-700/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-700/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:293c092a8f8799e6300d498d53a2ca8e779bea567aebb76936ccee19e8207577
+size 671684224

checkpoint-700/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<EOT>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<EOT>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<EOT>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-700/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-700/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<EOT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<META>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<META_START>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<META_END>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<SOS>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<EOT>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<EOT>",
+  "model_max_length": 200000,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<EOT>"
+}

checkpoint-700/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1064 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.7832975081601783,
+  "eval_steps": 100,
+  "global_step": 700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012737839344001274,
+      "grad_norm": 7.432073593139648,
+      "learning_rate": 3.75e-05,
+      "loss": 37.7163,
+      "step": 5
+    },
+    {
+      "epoch": 0.02547567868800255,
+      "grad_norm": 3.020324945449829,
+      "learning_rate": 7.5e-05,
+      "loss": 33.9598,
+      "step": 10
+    },
+    {
+      "epoch": 0.03821351803200382,
+      "grad_norm": 1.8713821172714233,
+      "learning_rate": 0.0001125,
+      "loss": 31.1454,
+      "step": 15
+    },
+    {
+      "epoch": 0.0509513573760051,
+      "grad_norm": 1.4051786661148071,
+      "learning_rate": 0.00015,
+      "loss": 28.2185,
+      "step": 20
+    },
+    {
+      "epoch": 0.06368919672000636,
+      "grad_norm": 1.191188931465149,
+      "learning_rate": 0.00018749999999999998,
+      "loss": 26.7731,
+      "step": 25
+    },
+    {
+      "epoch": 0.07642703606400764,
+      "grad_norm": 1.0205295085906982,
+      "learning_rate": 0.000225,
+      "loss": 24.2433,
+      "step": 30
+    },
+    {
+      "epoch": 0.08916487540800892,
+      "grad_norm": 0.9422863125801086,
+      "learning_rate": 0.0002625,
+      "loss": 22.2937,
+      "step": 35
+    },
+    {
+      "epoch": 0.1019027147520102,
+      "grad_norm": 0.789577305316925,
+      "learning_rate": 0.0003,
+      "loss": 20.3004,
+      "step": 40
+    },
+    {
+      "epoch": 0.11464055409601147,
+      "grad_norm": 0.74051433801651,
+      "learning_rate": 0.0003,
+      "loss": 18.6344,
+      "step": 45
+    },
+    {
+      "epoch": 0.12737839344001273,
+      "grad_norm": 0.6285977959632874,
+      "learning_rate": 0.0003,
+      "loss": 17.597,
+      "step": 50
+    },
+    {
+      "epoch": 0.140116232784014,
+      "grad_norm": 0.5593317747116089,
+      "learning_rate": 0.0003,
+      "loss": 16.7403,
+      "step": 55
+    },
+    {
+      "epoch": 0.15285407212801527,
+      "grad_norm": 0.49293985962867737,
+      "learning_rate": 0.0003,
+      "loss": 16.0871,
+      "step": 60
+    },
+    {
+      "epoch": 0.16559191147201657,
+      "grad_norm": 0.4442497789859772,
+      "learning_rate": 0.0003,
+      "loss": 15.8307,
+      "step": 65
+    },
+    {
+      "epoch": 0.17832975081601785,
+      "grad_norm": 0.4205341637134552,
+      "learning_rate": 0.0003,
+      "loss": 15.15,
+      "step": 70
+    },
+    {
+      "epoch": 0.19106759016001912,
+      "grad_norm": 0.4151783883571625,
+      "learning_rate": 0.0003,
+      "loss": 15.0298,
+      "step": 75
+    },
+    {
+      "epoch": 0.2038054295040204,
+      "grad_norm": 0.407552033662796,
+      "learning_rate": 0.0003,
+      "loss": 14.694,
+      "step": 80
+    },
+    {
+      "epoch": 0.21654326884802166,
+      "grad_norm": 0.39507508277893066,
+      "learning_rate": 0.0003,
+      "loss": 14.1493,
+      "step": 85
+    },
+    {
+      "epoch": 0.22928110819202294,
+      "grad_norm": 0.4005606770515442,
+      "learning_rate": 0.0003,
+      "loss": 13.8652,
+      "step": 90
+    },
+    {
+      "epoch": 0.2420189475360242,
+      "grad_norm": 0.38723692297935486,
+      "learning_rate": 0.0003,
+      "loss": 13.4051,
+      "step": 95
+    },
+    {
+      "epoch": 0.25475678688002545,
+      "grad_norm": 0.36756670475006104,
+      "learning_rate": 0.0003,
+      "loss": 13.2525,
+      "step": 100
+    },
+    {
+      "epoch": 0.25475678688002545,
+      "eval_accuracy": 0.013114369501466275,
+      "eval_loss": 11.976838111877441,
+      "eval_runtime": 14.6659,
+      "eval_samples_per_second": 17.046,
+      "eval_steps_per_second": 4.296,
+      "step": 100
+    },
+    {
+      "epoch": 0.26749462622402675,
+      "grad_norm": 0.3957444727420807,
+      "learning_rate": 0.0003,
+      "loss": 13.0266,
+      "step": 105
+    },
+    {
+      "epoch": 0.280232465568028,
+      "grad_norm": 0.3462829887866974,
+      "learning_rate": 0.0003,
+      "loss": 12.5453,
+      "step": 110
+    },
+    {
+      "epoch": 0.2929703049120293,
+      "grad_norm": 0.33832067251205444,
+      "learning_rate": 0.0003,
+      "loss": 12.4192,
+      "step": 115
+    },
+    {
+      "epoch": 0.30570814425603055,
+      "grad_norm": 0.3296329975128174,
+      "learning_rate": 0.0003,
+      "loss": 12.16,
+      "step": 120
+    },
+    {
+      "epoch": 0.31844598360003185,
+      "grad_norm": 0.34072279930114746,
+      "learning_rate": 0.0003,
+      "loss": 11.9492,
+      "step": 125
+    },
+    {
+      "epoch": 0.33118382294403315,
+      "grad_norm": 0.317059725522995,
+      "learning_rate": 0.0003,
+      "loss": 11.7652,
+      "step": 130
+    },
+    {
+      "epoch": 0.3439216622880344,
+      "grad_norm": 0.32677391171455383,
+      "learning_rate": 0.0003,
+      "loss": 11.4164,
+      "step": 135
+    },
+    {
+      "epoch": 0.3566595016320357,
+      "grad_norm": 0.34426912665367126,
+      "learning_rate": 0.0003,
+      "loss": 11.2546,
+      "step": 140
+    },
+    {
+      "epoch": 0.36939734097603694,
+      "grad_norm": 0.3237800896167755,
+      "learning_rate": 0.0003,
+      "loss": 10.9715,
+      "step": 145
+    },
+    {
+      "epoch": 0.38213518032003824,
+      "grad_norm": 0.3214079439640045,
+      "learning_rate": 0.0003,
+      "loss": 10.612,
+      "step": 150
+    },
+    {
+      "epoch": 0.3948730196640395,
+      "grad_norm": 0.3305688500404358,
+      "learning_rate": 0.0003,
+      "loss": 10.4704,
+      "step": 155
+    },
+    {
+      "epoch": 0.4076108590080408,
+      "grad_norm": 0.3146178722381592,
+      "learning_rate": 0.0003,
+      "loss": 10.1443,
+      "step": 160
+    },
+    {
+      "epoch": 0.420348698352042,
+      "grad_norm": 0.3035760521888733,
+      "learning_rate": 0.0003,
+      "loss": 10.1132,
+      "step": 165
+    },
+    {
+      "epoch": 0.4330865376960433,
+      "grad_norm": 0.3146466910839081,
+      "learning_rate": 0.0003,
+      "loss": 9.7969,
+      "step": 170
+    },
+    {
+      "epoch": 0.4458243770400446,
+      "grad_norm": 0.3030209243297577,
+      "learning_rate": 0.0003,
+      "loss": 9.6162,
+      "step": 175
+    },
+    {
+      "epoch": 0.4585622163840459,
+      "grad_norm": 0.3140353560447693,
+      "learning_rate": 0.0003,
+      "loss": 9.4377,
+      "step": 180
+    },
+    {
+      "epoch": 0.4713000557280471,
+      "grad_norm": 0.2986455261707306,
+      "learning_rate": 0.0003,
+      "loss": 9.1966,
+      "step": 185
+    },
+    {
+      "epoch": 0.4840378950720484,
+      "grad_norm": 0.30883148312568665,
+      "learning_rate": 0.0003,
+      "loss": 9.1145,
+      "step": 190
+    },
+    {
+      "epoch": 0.49677573441604966,
+      "grad_norm": 0.29901790618896484,
+      "learning_rate": 0.0003,
+      "loss": 8.9888,
+      "step": 195
+    },
+    {
+      "epoch": 0.5095135737600509,
+      "grad_norm": 0.30676862597465515,
+      "learning_rate": 0.0003,
+      "loss": 8.8873,
+      "step": 200
+    },
+    {
+      "epoch": 0.5095135737600509,
+      "eval_accuracy": 0.0356871945259042,
+      "eval_loss": 8.012660026550293,
+      "eval_runtime": 14.5542,
+      "eval_samples_per_second": 17.177,
+      "eval_steps_per_second": 4.329,
+      "step": 200
+    },
+    {
+      "epoch": 0.5222514131040522,
+      "grad_norm": 0.3067306876182556,
+      "learning_rate": 0.0003,
+      "loss": 8.788,
+      "step": 205
+    },
+    {
+      "epoch": 0.5349892524480535,
+      "grad_norm": 0.30200693011283875,
+      "learning_rate": 0.0003,
+      "loss": 8.7241,
+      "step": 210
+    },
+    {
+      "epoch": 0.5477270917920548,
+      "grad_norm": 0.3111984431743622,
+      "learning_rate": 0.0003,
+      "loss": 8.4474,
+      "step": 215
+    },
+    {
+      "epoch": 0.560464931136056,
+      "grad_norm": 0.3096470534801483,
+      "learning_rate": 0.0003,
+      "loss": 8.4588,
+      "step": 220
+    },
+    {
+      "epoch": 0.5732027704800573,
+      "grad_norm": 0.28584450483322144,
+      "learning_rate": 0.0003,
+      "loss": 8.1215,
+      "step": 225
+    },
+    {
+      "epoch": 0.5859406098240586,
+      "grad_norm": 0.2976541817188263,
+      "learning_rate": 0.0003,
+      "loss": 8.2402,
+      "step": 230
+    },
+    {
+      "epoch": 0.5986784491680599,
+      "grad_norm": 0.28990888595581055,
+      "learning_rate": 0.0003,
+      "loss": 8.0817,
+      "step": 235
+    },
+    {
+      "epoch": 0.6114162885120611,
+      "grad_norm": 0.30657345056533813,
+      "learning_rate": 0.0003,
+      "loss": 8.1059,
+      "step": 240
+    },
+    {
+      "epoch": 0.6241541278560624,
+      "grad_norm": 0.2960628569126129,
+      "learning_rate": 0.0003,
+      "loss": 7.7854,
+      "step": 245
+    },
+    {
+      "epoch": 0.6368919672000637,
+      "grad_norm": 0.28521808981895447,
+      "learning_rate": 0.0003,
+      "loss": 7.9146,
+      "step": 250
+    },
+    {
+      "epoch": 0.649629806544065,
+      "grad_norm": 0.3004601001739502,
+      "learning_rate": 0.0003,
+      "loss": 7.7238,
+      "step": 255
+    },
+    {
+      "epoch": 0.6623676458880663,
+      "grad_norm": 0.2811897099018097,
+      "learning_rate": 0.0003,
+      "loss": 7.7869,
+      "step": 260
+    },
+    {
+      "epoch": 0.6751054852320675,
+      "grad_norm": 0.31247615814208984,
+      "learning_rate": 0.0003,
+      "loss": 7.6116,
+      "step": 265
+    },
+    {
+      "epoch": 0.6878433245760688,
+      "grad_norm": 0.28785058856010437,
+      "learning_rate": 0.0003,
+      "loss": 7.6421,
+      "step": 270
+    },
+    {
+      "epoch": 0.7005811639200701,
+      "grad_norm": 0.3141111731529236,
+      "learning_rate": 0.0003,
+      "loss": 7.452,
+      "step": 275
+    },
+    {
+      "epoch": 0.7133190032640714,
+      "grad_norm": 0.2942105233669281,
+      "learning_rate": 0.0003,
+      "loss": 7.4557,
+      "step": 280
+    },
+    {
+      "epoch": 0.7260568426080726,
+      "grad_norm": 0.2928450107574463,
+      "learning_rate": 0.0003,
+      "loss": 7.5162,
+      "step": 285
+    },
+    {
+      "epoch": 0.7387946819520739,
+      "grad_norm": 0.28676503896713257,
+      "learning_rate": 0.0003,
+      "loss": 7.3625,
+      "step": 290
+    },
+    {
+      "epoch": 0.7515325212960752,
+      "grad_norm": 0.32866013050079346,
+      "learning_rate": 0.0003,
+      "loss": 7.2396,
+      "step": 295
+    },
+    {
+      "epoch": 0.7642703606400765,
+      "grad_norm": 0.2969712018966675,
+      "learning_rate": 0.0003,
+      "loss": 7.2457,
+      "step": 300
+    },
+    {
+      "epoch": 0.7642703606400765,
+      "eval_accuracy": 0.051225806451612906,
+      "eval_loss": 6.45078706741333,
+      "eval_runtime": 14.5738,
+      "eval_samples_per_second": 17.154,
+      "eval_steps_per_second": 4.323,
+      "step": 300
+    },
+    {
+      "epoch": 0.7770081999840777,
+      "grad_norm": 0.3022700548171997,
+      "learning_rate": 0.0003,
+      "loss": 7.0441,
+      "step": 305
+    },
+    {
+      "epoch": 0.789746039328079,
+      "grad_norm": 0.29694968461990356,
+      "learning_rate": 0.0003,
+      "loss": 7.0164,
+      "step": 310
+    },
+    {
+      "epoch": 0.8024838786720803,
+      "grad_norm": 0.33195170760154724,
+      "learning_rate": 0.0003,
+      "loss": 7.0554,
+      "step": 315
+    },
+    {
+      "epoch": 0.8152217180160816,
+      "grad_norm": 0.3369743824005127,
+      "learning_rate": 0.0003,
+      "loss": 6.9671,
+      "step": 320
+    },
+    {
+      "epoch": 0.8279595573600828,
+      "grad_norm": 0.3308833837509155,
+      "learning_rate": 0.0003,
+      "loss": 6.9425,
+      "step": 325
+    },
+    {
+      "epoch": 0.840697396704084,
+      "grad_norm": 0.33486923575401306,
+      "learning_rate": 0.0003,
+      "loss": 6.7649,
+      "step": 330
+    },
+    {
+      "epoch": 0.8534352360480854,
+      "grad_norm": 0.33329740166664124,
+      "learning_rate": 0.0003,
+      "loss": 6.7606,
+      "step": 335
+    },
+    {
+      "epoch": 0.8661730753920867,
+      "grad_norm": 0.29600027203559875,
+      "learning_rate": 0.0003,
+      "loss": 6.6892,
+      "step": 340
+    },
+    {
+      "epoch": 0.8789109147360878,
+      "grad_norm": 0.31691107153892517,
+      "learning_rate": 0.0003,
+      "loss": 6.6104,
+      "step": 345
+    },
+    {
+      "epoch": 0.8916487540800891,
+      "grad_norm": 0.31995242834091187,
+      "learning_rate": 0.0003,
+      "loss": 6.6987,
+      "step": 350
+    },
+    {
+      "epoch": 0.9043865934240904,
+      "grad_norm": 0.3355189859867096,
+      "learning_rate": 0.0003,
+      "loss": 6.5496,
+      "step": 355
+    },
+    {
+      "epoch": 0.9171244327680917,
+      "grad_norm": 0.34733301401138306,
+      "learning_rate": 0.0003,
+      "loss": 6.6299,
+      "step": 360
+    },
+    {
+      "epoch": 0.9298622721120929,
+      "grad_norm": 0.3098255693912506,
+      "learning_rate": 0.0003,
+      "loss": 6.5739,
+      "step": 365
+    },
+    {
+      "epoch": 0.9426001114560942,
+      "grad_norm": 0.38446882367134094,
+      "learning_rate": 0.0003,
+      "loss": 6.5487,
+      "step": 370
+    },
+    {
+      "epoch": 0.9553379508000955,
+      "grad_norm": 0.33057910203933716,
+      "learning_rate": 0.0003,
+      "loss": 6.4514,
+      "step": 375
+    },
+    {
+      "epoch": 0.9680757901440968,
+      "grad_norm": 0.3184017539024353,
+      "learning_rate": 0.0003,
+      "loss": 6.4215,
+      "step": 380
+    },
+    {
+      "epoch": 0.980813629488098,
+      "grad_norm": 0.33589789271354675,
+      "learning_rate": 0.0003,
+      "loss": 6.5542,
+      "step": 385
+    },
+    {
+      "epoch": 0.9935514688320993,
+      "grad_norm": 0.2836326062679291,
+      "learning_rate": 0.0003,
+      "loss": 6.3827,
+      "step": 390
+    },
+    {
+      "epoch": 1.0062893081761006,
+      "grad_norm": 0.3056611120700836,
+      "learning_rate": 0.0003,
+      "loss": 6.387,
+      "step": 395
+    },
+    {
+      "epoch": 1.0190271475201018,
+      "grad_norm": 0.2845809757709503,
+      "learning_rate": 0.0003,
+      "loss": 6.3152,
+      "step": 400
+    },
+    {
+      "epoch": 1.0190271475201018,
+      "eval_accuracy": 0.04596676441837732,
+      "eval_loss": 5.616324424743652,
+      "eval_runtime": 14.8427,
+      "eval_samples_per_second": 16.843,
+      "eval_steps_per_second": 4.245,
+      "step": 400
+    },
+    {
+      "epoch": 1.0317649868641032,
+      "grad_norm": 0.2871081531047821,
+      "learning_rate": 0.0003,
+      "loss": 6.3239,
+      "step": 405
+    },
+    {
+      "epoch": 1.0445028262081044,
+      "grad_norm": 0.37001457810401917,
+      "learning_rate": 0.0003,
+      "loss": 6.3937,
+      "step": 410
+    },
+    {
+      "epoch": 1.0572406655521058,
+      "grad_norm": 0.32665711641311646,
+      "learning_rate": 0.0003,
+      "loss": 6.2438,
+      "step": 415
+    },
+    {
+      "epoch": 1.069978504896107,
+      "grad_norm": 0.37014567852020264,
+      "learning_rate": 0.0003,
+      "loss": 6.1073,
+      "step": 420
+    },
+    {
+      "epoch": 1.0827163442401082,
+      "grad_norm": 0.42654964327812195,
+      "learning_rate": 0.0003,
+      "loss": 6.2657,
+      "step": 425
+    },
+    {
+      "epoch": 1.0954541835841096,
+      "grad_norm": 0.43892917037010193,
+      "learning_rate": 0.0003,
+      "loss": 5.9774,
+      "step": 430
+    },
+    {
+      "epoch": 1.1081920229281108,
+      "grad_norm": 0.40710192918777466,
+      "learning_rate": 0.0003,
+      "loss": 6.0797,
+      "step": 435
+    },
+    {
+      "epoch": 1.120929862272112,
+      "grad_norm": 0.3674974739551544,
+      "learning_rate": 0.0003,
+      "loss": 5.9008,
+      "step": 440
+    },
+    {
+      "epoch": 1.1336677016161134,
+      "grad_norm": 0.41214117407798767,
+      "learning_rate": 0.0003,
+      "loss": 5.9841,
+      "step": 445
+    },
+    {
+      "epoch": 1.1464055409601146,
+      "grad_norm": 0.7298715114593506,
+      "learning_rate": 0.0003,
+      "loss": 5.9017,
+      "step": 450
+    },
+    {
+      "epoch": 1.159143380304116,
+      "grad_norm": 0.4723041355609894,
+      "learning_rate": 0.0003,
+      "loss": 5.8628,
+      "step": 455
+    },
+    {
+      "epoch": 1.1718812196481172,
+      "grad_norm": 0.758711576461792,
+      "learning_rate": 0.0003,
+      "loss": 5.7863,
+      "step": 460
+    },
+    {
+      "epoch": 1.1846190589921184,
+      "grad_norm": 0.4319106936454773,
+      "learning_rate": 0.0003,
+      "loss": 5.7691,
+      "step": 465
+    },
+    {
+      "epoch": 1.1973568983361198,
+      "grad_norm": 0.43299469351768494,
+      "learning_rate": 0.0003,
+      "loss": 5.667,
+      "step": 470
+    },
+    {
+      "epoch": 1.210094737680121,
+      "grad_norm": 0.48413950204849243,
+      "learning_rate": 0.0003,
+      "loss": 5.5798,
+      "step": 475
+    },
+    {
+      "epoch": 1.2228325770241222,
+      "grad_norm": 0.41688182950019836,
+      "learning_rate": 0.0003,
+      "loss": 5.6484,
+      "step": 480
+    },
+    {
+      "epoch": 1.2355704163681236,
+      "grad_norm": 0.9052969813346863,
+      "learning_rate": 0.0003,
+      "loss": 5.5806,
+      "step": 485
+    },
+    {
+      "epoch": 1.2483082557121248,
+      "grad_norm": 0.9680259227752686,
+      "learning_rate": 0.0003,
+      "loss": 5.6531,
+      "step": 490
+    },
+    {
+      "epoch": 1.261046095056126,
+      "grad_norm": 0.5839616656303406,
+      "learning_rate": 0.0003,
+      "loss": 5.5656,
+      "step": 495
+    },
+    {
+      "epoch": 1.2737839344001274,
+      "grad_norm": 0.48688173294067383,
+      "learning_rate": 0.0003,
+      "loss": 5.5586,
+      "step": 500
+    },
+    {
+      "epoch": 1.2737839344001274,
+      "eval_accuracy": 0.3649931573802542,
+      "eval_loss": 4.76446008682251,
+      "eval_runtime": 14.7002,
+      "eval_samples_per_second": 17.007,
+      "eval_steps_per_second": 4.286,
+      "step": 500
+    },
+    {
+      "epoch": 1.2865217737441286,
+      "grad_norm": 0.4973750412464142,
+      "learning_rate": 0.0003,
+      "loss": 5.5474,
+      "step": 505
+    },
+    {
+      "epoch": 1.29925961308813,
+      "grad_norm": 0.4334980845451355,
+      "learning_rate": 0.0003,
+      "loss": 5.4498,
+      "step": 510
+    },
+    {
+      "epoch": 1.3119974524321312,
+      "grad_norm": 0.4760842025279999,
+      "learning_rate": 0.0003,
+      "loss": 5.4224,
+      "step": 515
+    },
+    {
+      "epoch": 1.3247352917761326,
+      "grad_norm": 0.5825368762016296,
+      "learning_rate": 0.0003,
+      "loss": 5.5087,
+      "step": 520
+    },
+    {
+      "epoch": 1.3374731311201338,
+      "grad_norm": 0.651641309261322,
+      "learning_rate": 0.0003,
+      "loss": 5.3186,
+      "step": 525
+    },
+    {
+      "epoch": 1.350210970464135,
+      "grad_norm": 0.5380859375,
+      "learning_rate": 0.0003,
+      "loss": 5.3928,
+      "step": 530
+    },
+    {
+      "epoch": 1.3629488098081364,
+      "grad_norm": 0.5173642635345459,
+      "learning_rate": 0.0003,
+      "loss": 5.351,
+      "step": 535
+    },
+    {
+      "epoch": 1.3756866491521376,
+      "grad_norm": 0.4927425682544708,
+      "learning_rate": 0.0003,
+      "loss": 5.2646,
+      "step": 540
+    },
+    {
+      "epoch": 1.3884244884961388,
+      "grad_norm": 0.6876756548881531,
+      "learning_rate": 0.0003,
+      "loss": 5.4045,
+      "step": 545
+    },
+    {
+      "epoch": 1.4011623278401402,
+      "grad_norm": 0.7293450832366943,
+      "learning_rate": 0.0003,
+      "loss": 5.4399,
+      "step": 550
+    },
+    {
+      "epoch": 1.4139001671841414,
+      "grad_norm": 0.4836059808731079,
+      "learning_rate": 0.0003,
+      "loss": 5.3522,
+      "step": 555
+    },
+    {
+      "epoch": 1.4266380065281425,
+      "grad_norm": 0.5378084778785706,
+      "learning_rate": 0.0003,
+      "loss": 5.1109,
+      "step": 560
+    },
+    {
+      "epoch": 1.439375845872144,
+      "grad_norm": 0.5663474202156067,
+      "learning_rate": 0.0003,
+      "loss": 5.3989,
+      "step": 565
+    },
+    {
+      "epoch": 1.4521136852161451,
+      "grad_norm": 0.6027519702911377,
+      "learning_rate": 0.0003,
+      "loss": 5.3082,
+      "step": 570
+    },
+    {
+      "epoch": 1.4648515245601466,
+      "grad_norm": 0.5912690758705139,
+      "learning_rate": 0.0003,
+      "loss": 5.3432,
+      "step": 575
+    },
+    {
+      "epoch": 1.4775893639041477,
+      "grad_norm": 0.5942875742912292,
+      "learning_rate": 0.0003,
+      "loss": 5.2603,
+      "step": 580
+    },
+    {
+      "epoch": 1.4903272032481492,
+      "grad_norm": 0.45755377411842346,
+      "learning_rate": 0.0003,
+      "loss": 5.1047,
+      "step": 585
+    },
+    {
+      "epoch": 1.5030650425921503,
+      "grad_norm": 0.6130331754684448,
+      "learning_rate": 0.0003,
+      "loss": 5.1628,
+      "step": 590
+    },
+    {
+      "epoch": 1.5158028819361515,
+      "grad_norm": 0.6434487700462341,
+      "learning_rate": 0.0003,
+      "loss": 5.164,
+      "step": 595
+    },
+    {
+      "epoch": 1.528540721280153,
+      "grad_norm": 0.919582724571228,
+      "learning_rate": 0.0003,
+      "loss": 5.2936,
+      "step": 600
+    },
+    {
+      "epoch": 1.528540721280153,
+      "eval_accuracy": 0.39341935483870966,
+      "eval_loss": 4.391851425170898,
+      "eval_runtime": 14.8627,
+      "eval_samples_per_second": 16.821,
+      "eval_steps_per_second": 4.239,
+      "step": 600
+    },
+    {
+      "epoch": 1.5412785606241541,
+      "grad_norm": 0.6150545477867126,
+      "learning_rate": 0.0003,
+      "loss": 5.0455,
+      "step": 605
+    },
+    {
+      "epoch": 1.5540163999681553,
+      "grad_norm": 0.5225240588188171,
+      "learning_rate": 0.0003,
+      "loss": 5.1175,
+      "step": 610
+    },
+    {
+      "epoch": 1.5667542393121567,
+      "grad_norm": 0.8378353714942932,
+      "learning_rate": 0.0003,
+      "loss": 5.1146,
+      "step": 615
+    },
+    {
+      "epoch": 1.579492078656158,
+      "grad_norm": 0.5006564855575562,
+      "learning_rate": 0.0003,
+      "loss": 4.9924,
+      "step": 620
+    },
+    {
+      "epoch": 1.5922299180001591,
+      "grad_norm": 0.7312870621681213,
+      "learning_rate": 0.0003,
+      "loss": 5.0733,
+      "step": 625
+    },
+    {
+      "epoch": 1.6049677573441605,
+      "grad_norm": 0.6706296801567078,
+      "learning_rate": 0.0003,
+      "loss": 4.9791,
+      "step": 630
+    },
+    {
+      "epoch": 1.6177055966881617,
+      "grad_norm": 0.5874515175819397,
+      "learning_rate": 0.0003,
+      "loss": 5.0827,
+      "step": 635
+    },
+    {
+      "epoch": 1.630443436032163,
+      "grad_norm": 0.6047885417938232,
+      "learning_rate": 0.0003,
+      "loss": 5.1284,
+      "step": 640
+    },
+    {
+      "epoch": 1.6431812753761643,
+      "grad_norm": 0.8195576667785645,
+      "learning_rate": 0.0003,
+      "loss": 5.0817,
+      "step": 645
+    },
+    {
+      "epoch": 1.6559191147201657,
+      "grad_norm": 0.8390661478042603,
+      "learning_rate": 0.0003,
+      "loss": 4.9869,
+      "step": 650
+    },
+    {
+      "epoch": 1.668656954064167,
+      "grad_norm": 0.6308897733688354,
+      "learning_rate": 0.0003,
+      "loss": 5.0157,
+      "step": 655
+    },
+    {
+      "epoch": 1.681394793408168,
+      "grad_norm": 0.9929732084274292,
+      "learning_rate": 0.0003,
+      "loss": 5.0349,
+      "step": 660
+    },
+    {
+      "epoch": 1.6941326327521695,
+      "grad_norm": 0.660764753818512,
+      "learning_rate": 0.0003,
+      "loss": 5.1106,
+      "step": 665
+    },
+    {
+      "epoch": 1.7068704720961707,
+      "grad_norm": 0.7146616578102112,
+      "learning_rate": 0.0003,
+      "loss": 5.0494,
+      "step": 670
+    },
+    {
+      "epoch": 1.719608311440172,
+      "grad_norm": 0.8408402800559998,
+      "learning_rate": 0.0003,
+      "loss": 4.9912,
+      "step": 675
+    },
+    {
+      "epoch": 1.7323461507841733,
+      "grad_norm": 0.7403599619865417,
+      "learning_rate": 0.0003,
+      "loss": 4.8969,
+      "step": 680
+    },
+    {
+      "epoch": 1.7450839901281745,
+      "grad_norm": 0.9758443832397461,
+      "learning_rate": 0.0003,
+      "loss": 4.9444,
+      "step": 685
+    },
+    {
+      "epoch": 1.7578218294721757,
+      "grad_norm": 0.551741898059845,
+      "learning_rate": 0.0003,
+      "loss": 4.9441,
+      "step": 690
+    },
+    {
+      "epoch": 1.770559668816177,
+      "grad_norm": 0.6962785720825195,
+      "learning_rate": 0.0003,
+      "loss": 4.9756,
+      "step": 695
+    },
+    {
+      "epoch": 1.7832975081601783,
+      "grad_norm": 0.5543167591094971,
+      "learning_rate": 0.0003,
+      "loss": 4.8839,
+      "step": 700
+    },
+    {
+      "epoch": 1.7832975081601783,
+      "eval_accuracy": 0.40842619745845554,
+      "eval_loss": 4.192822456359863,
+      "eval_runtime": 14.7931,
+      "eval_samples_per_second": 16.9,
+      "eval_steps_per_second": 4.259,
+      "step": 700
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 784,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "total_flos": 5.578286899711181e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-700/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:698e3be45e384522b83865d39a95c98084e8feccafe7e17ac91b0853d9a956a4
+size 5176

checkpoint-700/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.1"
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:293c092a8f8799e6300d498d53a2ca8e779bea567aebb76936ccee19e8207577
 size 671684224

 version https://git-lfs.github.com/spec/v1
+oid sha256:e0f05c2c379e62ba1f6aefd3101cab184a44bf05d79ac51787305f79f74706cb
 size 671684224