diff --git a/attention_2_only_emb/final_model/pytorch_model.bin b/attention_2_only_emb/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..95671e5b221e307dfd8b43fd774ce1a8e341cf9b
--- /dev/null
+++ b/attention_2_only_emb/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b
+size 2533545094
diff --git a/attention_2_only_emb/model_1000/pytorch_model.bin b/attention_2_only_emb/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..378e8f4dfc77739b9b7a3041fada61ea4a3811f2
--- /dev/null
+++ b/attention_2_only_emb/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d48e11d99e74e3ac615d5f6bd78b0f524e0ad1951fca5b223c9aec52ebb23596
+size 2533545094
diff --git a/attention_2_only_emb/model_10000/pytorch_model.bin b/attention_2_only_emb/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03697e58d7b3d79c041b33f8cf1a36cc2bdcc3a1
--- /dev/null
+++ b/attention_2_only_emb/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e190ea29dd8119871d88bc6c120b49a4c3b6e9e979286dac7b464f9c42fb5454
+size 2533545094
diff --git a/attention_2_only_emb/model_11000/pytorch_model.bin b/attention_2_only_emb/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f6a3d29ecb67e69c8b18ad5eba25236b3eb48800
--- /dev/null
+++ b/attention_2_only_emb/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e8a06de57e06be1e59112b19c52985b6e1e1905645464df7d675c61f8a62a67
+size 2533545094
diff --git a/attention_2_only_emb/model_13000/pytorch_model.bin b/attention_2_only_emb/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..13ad192ebe0e8705f80b5cc28f1daf5d0c51ade5
--- /dev/null
+++ b/attention_2_only_emb/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33dfd88af702c854bab079669a8dde4157c14ab4dbbea412e8382b1157fed607
+size 2533545094
diff --git a/attention_2_only_emb/model_14000/pytorch_model.bin b/attention_2_only_emb/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..51e3b9125da682e52a3dabf3e86f4d910e2d8b57
--- /dev/null
+++ b/attention_2_only_emb/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:288665e9230d153d7a863e7a8e66cc3c48d5133ba629a788f8c8302f62bcf508
+size 2533545094
diff --git a/attention_2_only_emb/model_15000/pytorch_model.bin b/attention_2_only_emb/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..95671e5b221e307dfd8b43fd774ce1a8e341cf9b
--- /dev/null
+++ b/attention_2_only_emb/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b
+size 2533545094
diff --git a/attention_2_only_emb/model_4000/pytorch_model.bin b/attention_2_only_emb/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b1416b0bab8868541ea4571a456a5e63423a241
--- /dev/null
+++ b/attention_2_only_emb/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f831b13a1e989d024050ac81156cca927d96d40afd492847292ea167b8f97d8e
+size 2533545094
diff --git a/attention_2_only_emb/model_5000/pytorch_model.bin b/attention_2_only_emb/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17d01f228b890cc161410230468fad2793b9b850
--- /dev/null
+++ b/attention_2_only_emb/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a72b2ed53bbf3dbc3fc4711abcd9d661d0064760380d4c3f2ce7c6b59db76dbf
+size 2533545094
diff --git a/attention_2_only_emb/model_9000/pytorch_model.bin b/attention_2_only_emb/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..854b13c5fe697f8de1eba0c8474e6b371e273a18
--- /dev/null
+++ b/attention_2_only_emb/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4851e88488581a963da7b13d25b32cf82e0512d7a393bfbf957b3671ff379c7a
+size 2533545094
diff --git a/baseline/model_1000/pytorch_model.bin b/baseline/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cd0cac02a4f88d9f645ff007ef5f596a434d5edc
--- /dev/null
+++ b/baseline/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00edd9bd916b1d6059044c1e496b6c21775ba6e8cfc08fb54ea38a112cefeb4e
+size 2533545094
diff --git a/baseline/model_11000/pytorch_model.bin b/baseline/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03dfcbce61dd13f29230fde15f02811e200e0dba
--- /dev/null
+++ b/baseline/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef2caffc4e634a9b5cc17988d11913c72e3a6eeede9a45f38257db62bff5f661
+size 2533545094
diff --git a/baseline/model_12000/pytorch_model.bin b/baseline/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f7c0e4d4150892b1a3ea7a22dd5d0c92557ca872
--- /dev/null
+++ b/baseline/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e719eae49624f71ba9389bd9764c1cb5d07c0acf33a1220eb4f8dda73240b9b
+size 2533545094
diff --git a/baseline/model_13000/pytorch_model.bin b/baseline/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e828cb6f6046020e30081a5b903a54f1e087f3df
--- /dev/null
+++ b/baseline/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:672bb2ac05b1618a7d748e23fa392f5197065cef854f22b6dce109b472470517
+size 2533545094
diff --git a/baseline/model_14000/pytorch_model.bin b/baseline/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..57ed073b2f9d4c0a39a4b0ce8f91c5016e4bb101
--- /dev/null
+++ b/baseline/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4739b4a78103197ee542a1010ba48e38824f60f5ec00ce70a9ae33ca5f2e4343
+size 2533545094
diff --git a/baseline/model_2000/pytorch_model.bin b/baseline/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c33f206a54f0dac0d693f3a0ba40c054203c8bab
--- /dev/null
+++ b/baseline/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc43549f54dd1e2edc06ec8ea811fc48c5b51a83830ec22027266878f3657b91
+size 2533545094
diff --git a/baseline/model_3000/pytorch_model.bin b/baseline/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..41c403d50726d881633aab9dd1ff78e7459096ad
--- /dev/null
+++ b/baseline/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83671facca5fb99530b5c52bbd111dda3383639f4ef50f90bd7379aa1b37fae2
+size 2533545094
diff --git a/baseline/model_4000/pytorch_model.bin b/baseline/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..013f0b0f3896454305be7ecd36202cf677723804
--- /dev/null
+++ b/baseline/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebb2010e8f368ae6fe61f89a9d71b8ee6a821a59727301688097911f2337d544
+size 2533545094
diff --git a/baseline/model_5000/pytorch_model.bin b/baseline/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e0e57f5d7b997b27d32aded45ef22a03993eddce
--- /dev/null
+++ b/baseline/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d88bc7652172e2291ad1e16d2326344a1f9651dec698d2f14ff2b8a909f82dc
+size 2533545094
diff --git a/baseline/model_6000/pytorch_model.bin b/baseline/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1e6820276ba5468f0adab2d82c0fa2faf7e6b5b9
--- /dev/null
+++ b/baseline/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e8417cded07b68977443fb14688e0a5d1fdfe3727a5a58da05dd5df10423795
+size 2533545094
diff --git a/baseline/model_7000/pytorch_model.bin b/baseline/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ffd06ffe06db37eaf8209c0afc616d1a91ab2c5
--- /dev/null
+++ b/baseline/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bedb26968104d1fe2d880f5b4e0ebb006fbea3dde5ca3603f7606c07e0a4bed8
+size 2533545094
diff --git a/baseline/model_8000/pytorch_model.bin b/baseline/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4e065d99cda8d1e005df97f758c6cfe717c15156
--- /dev/null
+++ b/baseline/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7d8381c12bd8d5c961d08860dfeac2a8cfbea2af8400b6d62bd2623c90258f5
+size 2533545094
diff --git a/baseline/model_9000/pytorch_model.bin b/baseline/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b71c5d2f55e8e54d275d20ac8d787344f940aab
--- /dev/null
+++ b/baseline/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d62dca90201b94b66ead46c71e3b2036bbe678cd38ab831997b6ce49b858773f
+size 2533545094
diff --git a/bigram_2/final_model/config.json b/bigram_2/final_model/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/final_model/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
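Note: every pytorch_model.bin above is committed as a three-line Git LFS pointer stub, not the ~2.5 GB weight file itself. A minimal sketch of reading such a stub (the helper name read_lfs_pointer is illustrative, not part of this repo):

    def read_lfs_pointer(path):
        # Each pointer stub holds three "key value" lines:
        # version, oid (sha256 of the real blob), and size in bytes.
        fields = {}
        with open(path) as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields

    # read_lfs_pointer("baseline/model_1000/pytorch_model.bin")
    # -> {"version": "https://git-lfs.github.com/spec/v1",
    #     "oid": "sha256:00edd9bd...", "size": "2533545094"}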
diff --git a/bigram_2/model_1000/model_config.json b/bigram_2/model_1000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_1000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_1000/training_state.json b/bigram_2/model_1000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..81cd0d80404a5e37372412f6a02ed258d564ca89
--- /dev/null
+++ b/bigram_2/model_1000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 2.892822504043579,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_10000/model_config.json b/bigram_2/model_10000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_10000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_10000/training_state.json b/bigram_2/model_10000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fcc39d0302aca08bc8dd8de528073aefb462059
--- /dev/null
+++ b/bigram_2/model_10000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 2.8910531997680664,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_11000/model_config.json b/bigram_2/model_11000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_11000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_11000/training_state.json b/bigram_2/model_11000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3424a3084540e7fcc2928dc45d77e68af62afa0e
--- /dev/null
+++ b/bigram_2/model_11000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 2.88981556892395,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_12000/model_config.json b/bigram_2/model_12000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_12000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_12000/training_state.json b/bigram_2/model_12000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b9a5e45ee6f4d159ad10d6465b8d5d3d7f04282
--- /dev/null
+++ b/bigram_2/model_12000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 2.8912298679351807,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_14000/model_config.json b/bigram_2/model_14000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_14000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_14000/training_state.json b/bigram_2/model_14000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..022a95a57b3f716cdedd9cf9d11fe05e285e47a7
--- /dev/null
+++ b/bigram_2/model_14000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 2.8920021057128906,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_15000/model_config.json b/bigram_2/model_15000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_15000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_15000/training_state.json b/bigram_2/model_15000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c898806ef419c4f1ededd28aaf71a03240351f02
--- /dev/null
+++ b/bigram_2/model_15000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 2.89182710647583,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_4000/model_config.json b/bigram_2/model_4000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_4000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_5000/model_config.json b/bigram_2/model_5000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_5000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_5000/training_state.json b/bigram_2/model_5000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bdc22f88f77c3f3f5e88f9efee1591baa9acb72
--- /dev/null
+++ b/bigram_2/model_5000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 2.8917791843414307,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_6000/model_config.json b/bigram_2/model_6000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_6000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_6000/training_state.json b/bigram_2/model_6000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..388967265b0e371f66582c6662e629e7cd031941
--- /dev/null
+++ b/bigram_2/model_6000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 2.892122983932495,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_7000/model_config.json b/bigram_2/model_7000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_7000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_7000/training_state.json b/bigram_2/model_7000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7f71315234af27e3e7ac92e9dc7a68e1e61972ca
--- /dev/null
+++ b/bigram_2/model_7000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 2.8900256156921387,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_8000/model_config.json b/bigram_2/model_8000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_8000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_8000/training_state.json b/bigram_2/model_8000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b142ba0aeeea8a30e710428701b65c91cc00b48
--- /dev/null
+++ b/bigram_2/model_8000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 2.8911120891571045,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_9000/model_config.json b/bigram_2/model_9000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_9000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_9000/training_state.json b/bigram_2/model_9000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5f5b9a718929823c4233ee0407ecd8eff0d632a
--- /dev/null
+++ b/bigram_2/model_9000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 2.890213966369629,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
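Note: in every training_state.json above, tokens_seen - tokens_seen_before = 638976, which is exactly total_batch_size (624) x max_length (1024) from the training configs later in this diff. A minimal sketch of that check (the helper name tokens_per_update is illustrative):

    import json

    def tokens_per_update(path):
        # Tokens consumed by the optimizer update that produced this checkpoint.
        with open(path) as f:
            state = json.load(f)
        return state["tokens_seen"] - state["tokens_seen_before"]

    # e.g. tokens_per_update("bigram_2/model_1000/training_state.json")
    # -> 639025152 - 638386176 = 638976 == 624 * 1024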
diff --git a/bigram_2_full/final_model/pytorch_model.bin b/bigram_2_full/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d2c7868988799fa9135d1ee3c14ea7f1357b0d5
--- /dev/null
+++ b/bigram_2_full/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:477b0b02d1fa370dd03a88359bf7784e17681740c07a6952b75b5bab9f5e333f
+size 2533545094
diff --git a/bigram_2_full/model_1000/pytorch_model.bin b/bigram_2_full/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d9946f12cb9210a4dc458bfc123703dba656497
--- /dev/null
+++ b/bigram_2_full/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f696208f3842be6f4c26371224bc32cb8cb8f5a268c879be0dd18afef088b4f
+size 2533545094
diff --git a/bigram_2_full/model_10000/pytorch_model.bin b/bigram_2_full/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..941e96234d5b7e40485a2e8337148fc4cd6c7832
--- /dev/null
+++ b/bigram_2_full/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:808a7200d616e0ad374917803a0a3e5e53c4751e33bdf1761e5d421e603581cd
+size 2533545094
diff --git a/bigram_2_full/model_11000/pytorch_model.bin b/bigram_2_full/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..02d773600791e65e1ee32d30261135522400c82c
--- /dev/null
+++ b/bigram_2_full/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b2c8b1b152a92fcb299306899342ad9e07e2fc3b3c270141d5153b2da982088
+size 2533545094
diff --git a/bigram_2_full/model_12000/pytorch_model.bin b/bigram_2_full/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6ebae8cc7dec6c1ae08cc9dfc0414830136a3f8b
--- /dev/null
+++ b/bigram_2_full/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:062de95bd0fbe5d415395f0ef568fc9c5a047f5ece6ae39233004ac42ad5b2ec
+size 2533545094
diff --git a/bigram_2_full/model_13000/pytorch_model.bin b/bigram_2_full/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..31734c6a114e1f290b1e0c97d6bd037811bd0bff
--- /dev/null
+++ b/bigram_2_full/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:322b22a2bda58c621d02d89a01ec0f2532dc3ce28ad99ba73ff5b899173e8a38
+size 2533545094
diff --git a/bigram_2_full/model_14000/pytorch_model.bin b/bigram_2_full/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..755e1fe55c4f7996208cce7010cf511d50d3e146
--- /dev/null
+++ b/bigram_2_full/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1451295b36110a92129f88c001b929004b0ca76833c9c254578777aff9c0d7ba
+size 2533545094
diff --git a/bigram_2_full/model_15000/pytorch_model.bin b/bigram_2_full/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d2c7868988799fa9135d1ee3c14ea7f1357b0d5
--- /dev/null
+++ b/bigram_2_full/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:477b0b02d1fa370dd03a88359bf7784e17681740c07a6952b75b5bab9f5e333f
+size 2533545094
diff --git a/bigram_2_full/model_2000/pytorch_model.bin b/bigram_2_full/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c84717271e76a8b4a35c394502eefdbf3e1c9a78
--- /dev/null
+++ b/bigram_2_full/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fbef22f7891155e70e7c265b62b090caa50f0364de5a11b4adb0f6d4fbc6417
+size 2533545094
diff --git a/bigram_2_full/model_3000/pytorch_model.bin b/bigram_2_full/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..293e4118218f0dc98de29eee6820d0fb061ecf8a
--- /dev/null
+++ b/bigram_2_full/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed6b522ff8826748d3c9a799bd85be1d1825c64b80c388c15b00bf3838b24efc
+size 2533545094
diff --git a/bigram_2_full/model_4000/pytorch_model.bin b/bigram_2_full/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f00cdf477a754683d78c9bddbda101fa06097843
--- /dev/null
+++ b/bigram_2_full/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbb49985fad62c8da6673d6768bb3843c2678436167c6e503a90be5f5ad0e7cc
+size 2533545094
diff --git a/bigram_2_full/model_5000/pytorch_model.bin b/bigram_2_full/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fcc2841a14b685e0f195cc972ce208468c276a4a
--- /dev/null
+++ b/bigram_2_full/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e4d8c60a18da227bcf549ccb742861073d736f37887e7a60fd344342c17af79
+size 2533545094
diff --git a/bigram_2_full/model_6000/pytorch_model.bin b/bigram_2_full/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..37d8d1ec61a3c0504537927c5d1f0709fad2f52e
--- /dev/null
+++ b/bigram_2_full/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c65e3e9109eedadfcd0e30f45c3ce0f1b1674999b4bf92b1c3a5182ed7cbf27c
+size 2533545094
diff --git a/bigram_2_full/model_7000/pytorch_model.bin b/bigram_2_full/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..595a6f8e75df1442fe3bdd39ccb9d2082af58e7d
--- /dev/null
+++ b/bigram_2_full/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8031e8f5551f47b130bfd7cc58b8927d62516ecc78ac3b09927b025c72935c5d
+size 2533545094
diff --git a/bigram_2_full/model_8000/pytorch_model.bin b/bigram_2_full/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..132a06eb4c3c2a1d8f07f3d2d47b2a9b7e378c18
--- /dev/null
+++ b/bigram_2_full/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9edbd2567cbe48ae392443870d7b60fd3ac6970556059d45142a05161b713bc0
+size 2533545094
diff --git a/bigram_2_full/model_9000/pytorch_model.bin b/bigram_2_full/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c4ee7b61e9b7704fc7094e6d3c99508bf12d40e
--- /dev/null
+++ b/bigram_2_full/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:812b7b7e03b649ec9c67c3694b220076bbcf95816c4681fca07cde6fecc7c402
+size 2533545094
diff --git a/first_attention_1/final_model/config.json b/first_attention_1/final_model/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/final_model/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_1000/config.json b/first_attention_1/model_1000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_1000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_1000/training_state.json b/first_attention_1/model_1000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b25800c7ee1a6849329d986937842af1564ffbc
--- /dev/null
+++ b/first_attention_1/model_1000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 0.842038631439209,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_10000/config.json b/first_attention_1/model_10000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_10000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_10000/training_state.json b/first_attention_1/model_10000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..750dddae0916ac9388afdb5ed60f4c449b676932
--- /dev/null
+++ b/first_attention_1/model_10000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 0.8426082134246826,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_11000/config.json b/first_attention_1/model_11000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_11000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_11000/training_state.json b/first_attention_1/model_11000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d1181ecb49ed7a3067a6b2dc83f5ead852f5ed33
--- /dev/null
+++ b/first_attention_1/model_11000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 0.8413209915161133,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_12000/config.json b/first_attention_1/model_12000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_12000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_12000/training_state.json b/first_attention_1/model_12000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf5ada7befaa42a325142d48bb6f07cf982ccca5
--- /dev/null
+++ b/first_attention_1/model_12000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 0.8421485424041748,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_13000/config.json b/first_attention_1/model_13000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_13000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_13000/training_state.json b/first_attention_1/model_13000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0dcc3de2b7c8e2309734711edd9610d4e88d5a7
--- /dev/null
+++ b/first_attention_1/model_13000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 13000,
+  "update_step": 13000,
+  "tokens_seen": 8306737152,
+  "tokens_seen_before": 8306098176,
+  "update_time": 0.8420231342315674,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_14000/config.json b/first_attention_1/model_14000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_14000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_14000/training_state.json b/first_attention_1/model_14000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ece6a192ef3ed2d193d138d89a84be049c747b0c
--- /dev/null
+++ b/first_attention_1/model_14000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 0.842217206954956,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_15000/config.json b/first_attention_1/model_15000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_15000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_15000/training_state.json b/first_attention_1/model_15000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c659ce53030acf8f35ee7a02753a3e887dde2a8
--- /dev/null
+++ b/first_attention_1/model_15000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 0.8423118591308594,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_2000/config.json b/first_attention_1/model_2000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_2000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_2000/training_state.json b/first_attention_1/model_2000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e949d3c14077a2c7d8808443bc624ed92c532cb9
--- /dev/null
+++ b/first_attention_1/model_2000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 2000,
+  "update_step": 2000,
+  "tokens_seen": 1278001152,
+  "tokens_seen_before": 1277362176,
+  "update_time": 0.8418259620666504,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_3000/config.json b/first_attention_1/model_3000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_3000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_3000/training_state.json b/first_attention_1/model_3000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..22d9fd0f8d06ae5ea907b471390d579f74b31f2f
--- /dev/null
+++ b/first_attention_1/model_3000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 3000,
+  "update_step": 3000,
+  "tokens_seen": 1916977152,
+  "tokens_seen_before": 1916338176,
+  "update_time": 0.8426218032836914,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_4000/config.json b/first_attention_1/model_4000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_4000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_4000/training_state.json b/first_attention_1/model_4000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f7531629fd837bcadf944e3c4152cc592e87fb0
--- /dev/null
+++ b/first_attention_1/model_4000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 4000,
+  "update_step": 4000,
+  "tokens_seen": 2555953152,
+  "tokens_seen_before": 2555314176,
+  "update_time": 0.8424825668334961,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_5000/config.json b/first_attention_1/model_5000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_5000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_5000/training_state.json b/first_attention_1/model_5000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8b9691ae3278020f385c2379d2b39dee72e7a2e
--- /dev/null
+++ b/first_attention_1/model_5000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 0.8429272174835205,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_6000/config.json b/first_attention_1/model_6000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_6000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_6000/training_state.json b/first_attention_1/model_6000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..482bcc8208621ccc95e946706660a81a7e02dd73
--- /dev/null
+++ b/first_attention_1/model_6000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 0.8419299125671387,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_7000/config.json b/first_attention_1/model_7000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_7000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_7000/training_state.json b/first_attention_1/model_7000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8035a097abbef7f8267a0e0e9a578d5c6d30df44
--- /dev/null
+++ b/first_attention_1/model_7000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 0.8422131538391113,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_8000/config.json b/first_attention_1/model_8000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_8000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_8000/training_state.json b/first_attention_1/model_8000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6802144700637f9a9cddf43cece43551b75e0366
--- /dev/null
+++ b/first_attention_1/model_8000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 0.8415825366973877,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_9000/config.json b/first_attention_1/model_9000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_9000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_9000/training_state.json b/first_attention_1/model_9000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9728504f89a6ea4d09a789781431c5f63f9b75c6
--- /dev/null
+++ b/first_attention_1/model_9000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 0.8423662185668945,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/training_config.yaml b/first_attention_1/training_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c5a4ed41ddddeeb4b05a67405f0f69663b6feeaa
--- /dev/null
+++ b/first_attention_1/training_config.yaml
@@ -0,0 +1,45 @@
+adam_beta1: 0.9
+adam_beta2: 0.95
+adjust_step: 0
+autoresume: false
+batch_size: 6
+clip_grad_norm: 1.0
+comment: null
+cycle_length: null
+dtype: bfloat16
+emb_freeze: null
+eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+eval_every: 1000
+first_attention: true
+gradient_accumulation: 13
+keep_checkpoints: null
+layer_freeze: null
+load_optimizer_state_on_resume: true
+lr: 0.0004
+max_length: 1024
+max_train_tokens: null
+min_lr_ratio: 0.1
+model_config: model_config/478m.json
+model_name_or_path: null
+model_revision: null
+num_training_steps: 15000
+optimizer: Adam
+restart_warmup_steps: null
+resume_from: null
+run_name: robust-frost-63
+save_dir: checkpoints/robust-frost-63
+save_every: 1000
+scheduler: cosine
+seed: 0
+shuffle: true
+skip_batches: !!set {}
+tags:
+- 396m-for-680m
+total_batch_size: 624
+train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+training_config: training_config/two_stage/478m_first_attention.yaml
+wandb_watch: true
+warmed_up_model: null
+warmup_steps: 1500
+weight_decay: 0.0
+workers: 8
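Note: the training_config.yaml above specifies scheduler: cosine with lr 0.0004, warmup_steps 1500, num_training_steps 15000, and min_lr_ratio 0.1. A minimal sketch of the schedule those settings describe, assuming the conventional linear-warmup-then-cosine-decay form (the repo's own scheduler code may differ in detail):

    import math

    def lr_at(step, lr=4e-4, warmup_steps=1500, total_steps=15000, min_lr_ratio=0.1):
        # Linear warmup to the peak rate, then cosine decay to min_lr_ratio * lr.
        if step < warmup_steps:
            return lr * step / warmup_steps
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        min_lr = lr * min_lr_ratio
        return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

    # lr_at(1500) == 4e-4 (peak); lr_at(15000) == 4e-5 (floor).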
"num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_13000/config.json b/first_attention_2/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_13000/training_state.json b/first_attention_2/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba0ec05bea3d14cc42fe004080600a228a706756 --- /dev/null +++ b/first_attention_2/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.8824567794799805, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/model_2000/config.json b/first_attention_2/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_2000/training_state.json b/first_attention_2/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0439d03c83dc85f425962da1b0e6d790b524c57 --- /dev/null +++ b/first_attention_2/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.882199764251709, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/model_3000/config.json b/first_attention_2/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_3000/training_state.json b/first_attention_2/model_3000/training_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..cc38513332779e8f8b98cb5319689caaf80caed6 --- /dev/null +++ b/first_attention_2/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.884432792663574, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/model_4000/config.json b/first_attention_2/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_4000/training_state.json b/first_attention_2/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d88c1cb27931678dffba8cfffae8d879dd94f406 --- /dev/null +++ b/first_attention_2/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.8821418285369873, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/training_config.yaml b/first_attention_2/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c3aea981fe2aaff4b61175191bd57ddeadfb22f --- /dev/null +++ b/first_attention_2/training_config.yaml @@ -0,0 +1,47 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: true +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: first_attention_resume +save_dir: checkpoints/first_attention_resume +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_attention_resume.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..59d910b52deabbf6ed45d083a90fbcdec09b5df7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb83e315f2a4475df30c3eebbca80bf0db8e6a03378d80cb7173a584b2a573b6 +size 2533545094 diff 
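Note: the two training_config.yaml files in this diff are identical except for a few keys (first_attention flips to false, first_attention_resume and layer_freeze_2 appear, and run_name/save_dir/training_config change). A minimal sketch of comparing them with PyYAML (the helper name config_delta is illustrative):

    import yaml

    def config_delta(path_a, path_b):
        # Report keys whose values differ between two training_config.yaml files.
        with open(path_a) as f:
            a = yaml.safe_load(f)
        with open(path_b) as f:
            b = yaml.safe_load(f)
        keys = sorted(set(a) | set(b))
        return {k: (a.get(k), b.get(k)) for k in keys if a.get(k) != b.get(k)}

    # config_delta("first_attention_1/training_config.yaml",
    #              "first_attention_2/training_config.yaml")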
diff --git a/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59d910b52deabbf6ed45d083a90fbcdec09b5df7
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb83e315f2a4475df30c3eebbca80bf0db8e6a03378d80cb7173a584b2a573b6
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_1000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..88809f9e85a5fc1baf67f2491e144ed44bf69efe
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c45a8f3c4f4d021f27a4c0ff889f20655be5677f83819547be076ecd0e0230b
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_10000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5de6260137427fca68bf64751dfffc00be842477
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40dfcb9849e0275eeb11bad6fc60b4e556f528000f2e1540145df68c4775597e
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_11000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ca51914c6e828cabef24620d54840c1355c152f5
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fea72ecb9a3f26e092ae472e8fe7302e710ac3ce88459751f9b3356b77e07294
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_12000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d058f1b76259baa7d5c589b58425cc90f8cd4d6
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11bccb16f4e719b688af2354ef97f4ff742f05e8633b32537af3b4327b60fd63
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_13000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a7baaaa9513b6a19fd02f080f14a4db1a3d8b0f5
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8e9714cf45a316f1c6ce6c9175d953549c83672ac50097a08cff6c105b91efe
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_14000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fe1f237838ebc96c198f02d0283d63f6d389debd
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d68b08a96f0d9030740cb911b38932b1e06a4310c63ae20d3f45a1805eee3eeb
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_15000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59d910b52deabbf6ed45d083a90fbcdec09b5df7
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb83e315f2a4475df30c3eebbca80bf0db8e6a03378d80cb7173a584b2a573b6
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_2000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..291aaeb6e7a85264242c50de1be6a78933c2ba1c
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d6c4a4c83e346de97d4ff6cc8e8f4a74ac7b93f847439ad70ccef0ae2caf30c
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_3000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ac4292c9ace7ea5f84f99860e5bf8d21619a0fa9
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3709fc1c4ee2f0133442a25326d061a70010894151b7b1e4655e09389ebe750
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_4000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b22fd524b784ad384491033eb1ffa1d94cb2e54
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7757dd04377135669a447a5c81714fe7428081c5f0b5871cb5c5e4122a97f2ec
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_5000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b63f2c67ba18d39d5c1a4f3edd66a8eaaf678ffe
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99fdd0c33c6507a4e3de3908dbec1175e0be3eb1105c4bcdb6ff8015723caedf
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_6000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb75558e9892ce2e0694c67df0849be7b67d92ae
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a32c2a577168221f1c632ca81952072bf37f66585781189db476738be7ca868d
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_7000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45c476e264d46bd016760f9ddf1f07bced6bad51
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ebeb4fd972a34bd54e544f3a0d258e005c4b40d4e89d45c76a2276cb58d57d8
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_8000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..23960a03f0769236803acebdf6b0870e2b4e916a
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61289bace0732321741b3315a811dfda363ddc3591efcd441e37063797f28a2e
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_9000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..05bb65f7467be0e0b85f97dcf21223595dd561ea
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ec321d147eba726dae0a00e3d99d130cca93b78ca48e81b217a33d2df5cfa88
+size 2533545094
diff --git a/first_attention_resume_loss_convergence/model_3000/pytorch_model.bin b/first_attention_resume_loss_convergence/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..81816343f415ead9389705edf3a4e24bb34451a9
--- /dev/null
+++ b/first_attention_resume_loss_convergence/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41187b3bb73fe6be85771de00c84e4368d0566bc7e613d3630edb3d9149c890c
+size 2533545094
diff --git a/first_layer_1/final_model/pytorch_model.bin b/first_layer_1/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..76fee640f34bfb414450e656d172570fae64fdb4
--- /dev/null
+++ b/first_layer_1/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5cc8ca606b5c08b402979c6a187af04dad556c28212329f880e7c21ee9327e6
+size 754421397
diff --git a/first_layer_1/model_13000/pytorch_model.bin b/first_layer_1/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..21e3c0d51e3ba571fee6d3789c7b93fd34f42edc
--- /dev/null
+++ b/first_layer_1/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:475c20d0f1341b67290dfd842db09917f5317a0c6ff60f0f248ec68449ecd7b3
+size 754421397
diff --git a/first_layer_1/model_2000/pytorch_model.bin b/first_layer_1/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc9d5aa4e11016fb94793564a7e805d93e020b12
--- /dev/null
+++ b/first_layer_1/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63c4a6b76b99daddee34fe8ec21e07bcb4b59d808c5b376220ce401925245381
+size 754421397
diff --git a/first_layer_1/model_3000/pytorch_model.bin b/first_layer_1/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..812fc4e2b97431b873cdc6f3ceefba7af8bd6e92
--- /dev/null
+++ b/first_layer_1/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:882d77bc2a3e9850f0dbb3d2fef9c496c0295408294c0bff91c83fac149c2f8b
+size 754421397
diff --git a/first_layer_2/final_model/pytorch_model.bin b/first_layer_2/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fda57ad8105bc6e0a08f7d08fb0b03d08588253a
--- /dev/null
+++ b/first_layer_2/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1055488d44e419beec7f4dc8586960f627d130b2d9b3534af2dc0ae37b778b40
+size 2533545094
diff --git a/first_layer_2/model_1000/pytorch_model.bin b/first_layer_2/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..94011d8c2c15c19e146fed9eb36eccc5de7472d0
--- /dev/null
+++ b/first_layer_2/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef4918e831f53b9fdf98db08391b70d82efc7d4e0c7888cd688d46b141051790
+size 2533545094
diff --git a/first_layer_2/model_10000/pytorch_model.bin b/first_layer_2/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3814aa6ffdf30ec933169b2ded9aadc0c77417d0
--- /dev/null
+++ b/first_layer_2/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6205f6986915a368dca66aef15ea722192e30f3037ddd4034af6ee4377f2344c
+size 2533545094
diff --git a/first_layer_2/model_11000/pytorch_model.bin b/first_layer_2/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c83290c0f62817c588dae02e1a52b6bc74d3ef8
--- /dev/null
+++ b/first_layer_2/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caa3e0c36245f8997ebb120617e85d06a65d9c434e6f47e41533feb9daa35abb
+size 2533545094
diff --git a/first_layer_2/model_12000/pytorch_model.bin b/first_layer_2/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dd5a3afa547c0f39ec31712185848f3244017189
--- /dev/null
+++ b/first_layer_2/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d651ca79857ff137a36fffe59a3d6d1562cd6fea95cdd11b6ecf0da0de6353
+size 2533545094
diff --git a/first_layer_2/model_13000/pytorch_model.bin b/first_layer_2/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1654452eebea62746f65c0cc447a5f1a08d8ee40
--- /dev/null
+++ b/first_layer_2/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff95868727cab2b481b43497784642da1a323a82c5b184a3d935624ffca7c17e
+size 2533545094
diff --git a/first_layer_2/model_14000/pytorch_model.bin b/first_layer_2/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aedd3a15ffc28463d8b7f66e40fd90ac20605d6a
--- /dev/null
+++ b/first_layer_2/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bb9ba7f46f65527b44e164a8344b3b7a57c9d63ab1c6434a168ac53884c81ce
+size 2533545094
diff --git a/first_layer_2/model_15000/pytorch_model.bin b/first_layer_2/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fda57ad8105bc6e0a08f7d08fb0b03d08588253a
--- /dev/null
+++ b/first_layer_2/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1055488d44e419beec7f4dc8586960f627d130b2d9b3534af2dc0ae37b778b40
+size 2533545094
diff --git a/first_layer_2/model_2000/pytorch_model.bin b/first_layer_2/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4643a50ddc07725a68d5f43c0a830b0b94d8f141
--- /dev/null
+++ b/first_layer_2/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9226f224657e6bef6c1526a23710e30324f350e9f78614cf76dbe3efbeabba88
+size 2533545094
diff --git a/first_layer_2/model_3000/pytorch_model.bin b/first_layer_2/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..16e2b22085ca618e91c1d16c41627080121b4abf
--- /dev/null
+++ b/first_layer_2/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44264c7bf0d5f083455ee4d2635d3103026acf0640bb4061f057a17c00890149
+size 2533545094
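Two things stand out in the pointer runs above. The first_layer_1 checkpoints are 754,421,397 bytes (about 189M fp32 parameters) rather than the 2,533,545,094 bytes seen elsewhere, so that run trains a smaller model configuration. And final_model shares its oid with model_15000 (1055488d44e4... for first_layer_2, and likewise in the other runs), meaning final_model is a byte-for-byte copy of the last step-15000 checkpoint, as expected with num_training_steps: 15000 and save_every: 1000. A sketch that surfaces such duplicates from the pointer files themselves (it assumes the checkout still holds the three-line LFS pointers rather than the smudged binaries, and the run/checkpoint/pytorch_model.bin layout shown in this diff):

    # Group checkpoint files by LFS oid to find byte-identical copies,
    # e.g. final_model vs model_15000 pairs.
    import collections, pathlib

    by_oid = collections.defaultdict(list)
    for p in sorted(pathlib.Path(".").glob("*/*/pytorch_model.bin")):
        # Each pointer line is "key value"; parse into a dict.
        fields = dict(line.split(" ", 1) for line in p.read_text().splitlines())
        by_oid[fields["oid"], fields["size"]].append(str(p))

    for (oid, size), paths in by_oid.items():
        if len(paths) > 1:
            print(oid.split(":", 1)[1][:12], size, paths)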
diff --git a/first_layer_2/model_4000/pytorch_model.bin b/first_layer_2/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..47ad51fa9b742432ce389e11a0a630e9ae2aae5a
--- /dev/null
+++ b/first_layer_2/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77b49ed3050093f152650ea9c90d69a38bc14a9147f54f3a5ff90f5d2f4bf646
+size 2533545094
diff --git a/first_layer_2/model_5000/pytorch_model.bin b/first_layer_2/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d05868dfbfb725aa756f28b8fda9bad9d19e95a3
--- /dev/null
+++ b/first_layer_2/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08a6559722b6b81c85cdbed675563f997e7c9d3b3c79a8fdfcec9a0130635c23
+size 2533545094
diff --git a/first_layer_2/model_6000/pytorch_model.bin b/first_layer_2/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..11e53e755c16482bae099ea803916eef5f42e34b
--- /dev/null
+++ b/first_layer_2/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19174bee86c290274e86a3ad18746710f81714649969205d271a7d4f636bdcfd
+size 2533545094
diff --git a/first_layer_2/model_7000/pytorch_model.bin b/first_layer_2/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..259c25f8201e6a8a2e7908588ffee2a719ee88b7
--- /dev/null
+++ b/first_layer_2/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc19c9ffaa36a141c14208eabfb13a9ba2a4d88a6a14ba02075f934c8c1acc9a
+size 2533545094
diff --git a/first_layer_2/model_8000/pytorch_model.bin b/first_layer_2/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..11a29cd207731670d72a2a7e3c6eb7090fbaec9b
--- /dev/null
+++ b/first_layer_2/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aa26db628424d0c9512b9406e4c864071a76c4f6175947eef0a2827583d2151
+size 2533545094
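Each pytorch_model.bin entry in this diff is a Git LFS pointer (version, oid, size), not the tensor data itself, so a fetched checkpoint can be verified against the diff without any LFS tooling. A minimal sketch; the path is illustrative and the oid/size are taken from the model_8000 entry above:

    # Verify a downloaded checkpoint against its LFS pointer.
    import hashlib, os

    path = "first_layer_2/model_8000/pytorch_model.bin"
    expected_oid = "3aa26db628424d0c9512b9406e4c864071a76c4f6175947eef0a2827583d2151"
    expected_size = 2533545094

    assert os.path.getsize(path) == expected_size
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash 1 MiB at a time
            h.update(chunk)
    assert h.hexdigest() == expected_oid
    print("checkpoint matches its LFS pointer")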