diff --git a/baseline/final_model/pytorch_model.bin b/baseline/final_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..30e9e514722f3081c6e6f733260a5897aac57f5b --- /dev/null +++ b/baseline/final_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a68abbc43af60bd4c7f2d95c72f10740acbbb95df43e3fd9ba6ae48d8c02ccc +size 2533545094 diff --git a/bigram_2_full/final_model/config.json b/bigram_2_full/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_1000/training_state.json b/bigram_2_full/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a63a76e652283c529032274eeb90b1dfc801cff0 --- /dev/null +++ b/bigram_2_full/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 3.281445264816284, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_11000/model_config.json b/bigram_2_full/model_11000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_11000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_11000/training_state.json b/bigram_2_full/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4347ecc6da3f7dcaac6a2ca91f3c46af6e0eb120 --- /dev/null +++ b/bigram_2_full/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 3.2798073291778564, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_13000/model_config.json b/bigram_2_full/model_13000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_13000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + 
"transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_13000/training_state.json b/bigram_2_full/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a2cfe92d5e43ac87d568bc1b3bb40290e3051c0e --- /dev/null +++ b/bigram_2_full/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 3.2799124717712402, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_14000/training_state.json b/bigram_2_full/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00522f22657146e53c1216d61f6b78066a289bcc --- /dev/null +++ b/bigram_2_full/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 3.2842938899993896, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_2000/model_config.json b/bigram_2_full/model_2000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_2000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_2000/training_state.json b/bigram_2_full/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26b3e58fe6b85d725cb7da315aac61fdb41b9a93 --- /dev/null +++ b/bigram_2_full/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 3.285855293273926, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_3000/model_config.json b/bigram_2_full/model_3000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_3000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_3000/training_state.json b/bigram_2_full/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..664ba5299e7aaf0d3f383d90c5ba424847d154c9 --- /dev/null +++ b/bigram_2_full/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 
3.2819085121154785, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_4000/model_config.json b/bigram_2_full/model_4000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_4000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_4000/training_state.json b/bigram_2_full/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..05c99aac2b1fce84f47177137ebe9481fc64cc1e --- /dev/null +++ b/bigram_2_full/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 3.283513307571411, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_5000/model_config.json b/bigram_2_full/model_5000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_5000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_5000/training_state.json b/bigram_2_full/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..95d62d7f901bde00e39d691261219e5d97214397 --- /dev/null +++ b/bigram_2_full/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 3.27878475189209, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_9000/model_config.json b/bigram_2_full/model_9000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_9000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_9000/training_state.json b/bigram_2_full/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fafcbe9190a5ca4113929ac1b5f3deb3c2b8f878 --- /dev/null +++ 
b/bigram_2_full/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 3.2838311195373535, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/training_config.yaml b/bigram_2_full/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d562dcde7cc7b826303af3ea329d1a11ef3ad00 --- /dev/null +++ b/bigram_2_full/training_config.yaml @@ -0,0 +1,44 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: earnest-sky-22 +save_dir: checkpoints/earnest-sky-22 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_resume_unfreeze.yaml +wandb_watch: true +warmed_up_model: /lee_embedding/checkpoints/tough-snowflake-18/final_model/ +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_attention_2_attention_unfreeze/final_model/config.json b/first_attention_2_attention_unfreeze/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_1000/config.json b/first_attention_2_attention_unfreeze/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_1000/training_state.json b/first_attention_2_attention_unfreeze/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1c5bb53f98cb1d74770f02744e434ba910f21e5 --- /dev/null +++ 
b/first_attention_2_attention_unfreeze/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 2.9259252548217773, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_10000/config.json b/first_attention_2_attention_unfreeze/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_10000/training_state.json b/first_attention_2_attention_unfreeze/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..379803eb2ecf0ffa73eb6974b937570ac52e4d4d --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 2.925158977508545, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_11000/config.json b/first_attention_2_attention_unfreeze/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_11000/training_state.json b/first_attention_2_attention_unfreeze/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..924832b1f4f4ac690db6c14b405b8bcc1ec0bab2 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 2.925471067428589, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_12000/config.json b/first_attention_2_attention_unfreeze/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 
5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_12000/training_state.json b/first_attention_2_attention_unfreeze/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8b6beb40f4a8f4f504a7f2a0d09146ca22379fab --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 2.9245717525482178, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_13000/training_state.json b/first_attention_2_attention_unfreeze/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d542ed3b3e4256440b55ec1dc3cc34f80110d867 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.9278290271759033, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_14000/config.json b/first_attention_2_attention_unfreeze/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_14000/training_state.json b/first_attention_2_attention_unfreeze/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aecf7ece2424555a569f8f780bef5f66c0dc9077 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 2.924705743789673, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_15000/config.json b/first_attention_2_attention_unfreeze/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + 
"vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_15000/training_state.json b/first_attention_2_attention_unfreeze/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5ba896c58a04b68dd7cc30a7dab6c256e56ca855 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 2.924987554550171, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_2000/config.json b/first_attention_2_attention_unfreeze/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_2000/training_state.json b/first_attention_2_attention_unfreeze/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7bba5c2b38176089c28f85ae5813217e8b3644e2 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.923201322555542, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_3000/config.json b/first_attention_2_attention_unfreeze/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_3000/training_state.json b/first_attention_2_attention_unfreeze/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b3d36da0db922e09678af5b196449415ed85d45f --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.9246528148651123, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_4000/config.json b/first_attention_2_attention_unfreeze/model_4000/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_4000/training_state.json b/first_attention_2_attention_unfreeze/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d779f12d5024f1e305141bd4177e664206d18e0 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.9246957302093506, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_5000/config.json b/first_attention_2_attention_unfreeze/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_5000/training_state.json b/first_attention_2_attention_unfreeze/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c84388bbf82f6e9b5e1e835284eb859a0bbc32d --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 2.9258697032928467, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_6000/config.json b/first_attention_2_attention_unfreeze/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_6000/training_state.json b/first_attention_2_attention_unfreeze/model_6000/training_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..13f355fd82c4ec683ae12960de7eda1d608de39a --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 2.92516827583313, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_7000/config.json b/first_attention_2_attention_unfreeze/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_7000/training_state.json b/first_attention_2_attention_unfreeze/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..216bbf81ccfeed98d09061f2891300d21a5e5a1e --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 2.925605058670044, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_8000/config.json b/first_attention_2_attention_unfreeze/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_8000/training_state.json b/first_attention_2_attention_unfreeze/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd614cfe480ae2f3dfce4f5a7a789903266d1bbb --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 2.924616575241089, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_9000/config.json b/first_attention_2_attention_unfreeze/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + 
"eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_9000/training_state.json b/first_attention_2_attention_unfreeze/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..96b217c52574fee5f318afec9348c20eea3522b1 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 2.9271013736724854, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/training_config.yaml b/first_attention_2_attention_unfreeze/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe098423ebab4b90b7b8e23fb0da48c365c8378b --- /dev/null +++ b/first_attention_2_attention_unfreeze/training_config.yaml @@ -0,0 +1,47 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: true +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: first_attention_resume_unfreeze +save_dir: checkpoints/first_attention_resume_unfreeze +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_attention_resume_unfreeze.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_layer_2/model_1000/training_state.json b/first_layer_2/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1f5d2b3a7ef6d7cab491ae015aecd3fb595bf4 --- /dev/null +++ b/first_layer_2/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 2.883007764816284, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_10000/config.json b/first_layer_2/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + 
"rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_10000/training_state.json b/first_layer_2/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..613fce36c84b10e6c807cd3bb1b195e799e3ccff --- /dev/null +++ b/first_layer_2/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 2.878925085067749, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_12000/training_state.json b/first_layer_2/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4e6e0777e6ad2c7f208096b26aa6ecd7e2b9a584 --- /dev/null +++ b/first_layer_2/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 2.8791251182556152, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_15000/config.json b/first_layer_2/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_6000/config.json b/first_layer_2/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_6000/training_state.json b/first_layer_2/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..29f61e7467334859b4ace11d5ea1ebb0cdf32505 --- /dev/null +++ b/first_layer_2/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 2.875896453857422, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_9000/config.json b/first_layer_2/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + 
"initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/final_model/config.json b/silver-butterfly-62/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_1000/config.json b/silver-butterfly-62/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_10000/training_state.json b/silver-butterfly-62/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..62868a01bb9226acf12d87b5e3312bf7ae45240f --- /dev/null +++ b/silver-butterfly-62/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 0.7699310779571533, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_11000/config.json b/silver-butterfly-62/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_11000/training_state.json b/silver-butterfly-62/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d91623b956831438f3a0e7ed540b44a979984c7c --- /dev/null +++ b/silver-butterfly-62/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 0.7700908184051514, + "wandb_id": "e0h7cx93" +} \ No newline at end 
of file diff --git a/silver-butterfly-62/model_12000/training_state.json b/silver-butterfly-62/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f57cb6217e3637e0a0046d5c2a7bd942c4bc64a --- /dev/null +++ b/silver-butterfly-62/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 0.7706863880157471, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_13000/training_state.json b/silver-butterfly-62/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bad4e57cf91a65696ce74307be5d27a6d76d3317 --- /dev/null +++ b/silver-butterfly-62/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 0.7703430652618408, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_14000/training_state.json b/silver-butterfly-62/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97dabe821f7760ae6ff29486b0c69d4f4733ddcd --- /dev/null +++ b/silver-butterfly-62/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 0.7704954147338867, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_15000/training_state.json b/silver-butterfly-62/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aea0ba869bc468bdeef1ab99ca65f853990f20e1 --- /dev/null +++ b/silver-butterfly-62/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 0.7696325778961182, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_2000/config.json b/silver-butterfly-62/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_3000/config.json b/silver-butterfly-62/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + 
"vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_4000/training_state.json b/silver-butterfly-62/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9d1063c2b8a21f2b7cf07493b2fc8cb16ab7b466 --- /dev/null +++ b/silver-butterfly-62/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 0.7703680992126465, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_5000/config.json b/silver-butterfly-62/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_5000/training_state.json b/silver-butterfly-62/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..80d28e6c221ee63435a0aa1fb62056003895a3c2 --- /dev/null +++ b/silver-butterfly-62/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 0.7708892822265625, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_6000/training_state.json b/silver-butterfly-62/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a1d29e8228981ff16fe287fa04d69b4cb87c550 --- /dev/null +++ b/silver-butterfly-62/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 0.7694945335388184, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_8000/training_state.json b/silver-butterfly-62/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd0eadd4fb1c52dfc638aced882938f466ab6f15 --- /dev/null +++ b/silver-butterfly-62/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 0.7700819969177246, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_9000/config.json b/silver-butterfly-62/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + 
"use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_9000/training_state.json b/silver-butterfly-62/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2b3ec6b90f1513eab68064cbe19d2bc8dca081 --- /dev/null +++ b/silver-butterfly-62/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 0.7697172164916992, + "wandb_id": "e0h7cx93" +} \ No newline at end of file