gerou161 committed
Commit: 8f20568
1 Parent(s): dbcecc7

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full list.
Files changed (50):
  1. baseline/final_model/pytorch_model.bin +3 -0
  2. bigram_2_full/final_model/config.json +19 -0
  3. bigram_2_full/model_1000/training_state.json +8 -0
  4. bigram_2_full/model_11000/model_config.json +19 -0
  5. bigram_2_full/model_11000/training_state.json +8 -0
  6. bigram_2_full/model_13000/model_config.json +19 -0
  7. bigram_2_full/model_13000/training_state.json +8 -0
  8. bigram_2_full/model_14000/training_state.json +8 -0
  9. bigram_2_full/model_2000/model_config.json +19 -0
  10. bigram_2_full/model_2000/training_state.json +8 -0
  11. bigram_2_full/model_3000/model_config.json +19 -0
  12. bigram_2_full/model_3000/training_state.json +8 -0
  13. bigram_2_full/model_4000/model_config.json +19 -0
  14. bigram_2_full/model_4000/training_state.json +8 -0
  15. bigram_2_full/model_5000/model_config.json +19 -0
  16. bigram_2_full/model_5000/training_state.json +8 -0
  17. bigram_2_full/model_9000/model_config.json +19 -0
  18. bigram_2_full/model_9000/training_state.json +8 -0
  19. bigram_2_full/training_config.yaml +44 -0
  20. first_attention_2_attention_unfreeze/final_model/config.json +19 -0
  21. first_attention_2_attention_unfreeze/model_1000/config.json +19 -0
  22. first_attention_2_attention_unfreeze/model_1000/training_state.json +8 -0
  23. first_attention_2_attention_unfreeze/model_10000/config.json +19 -0
  24. first_attention_2_attention_unfreeze/model_10000/training_state.json +8 -0
  25. first_attention_2_attention_unfreeze/model_11000/config.json +19 -0
  26. first_attention_2_attention_unfreeze/model_11000/training_state.json +8 -0
  27. first_attention_2_attention_unfreeze/model_12000/config.json +19 -0
  28. first_attention_2_attention_unfreeze/model_12000/training_state.json +8 -0
  29. first_attention_2_attention_unfreeze/model_13000/training_state.json +8 -0
  30. first_attention_2_attention_unfreeze/model_14000/config.json +19 -0
  31. first_attention_2_attention_unfreeze/model_14000/training_state.json +8 -0
  32. first_attention_2_attention_unfreeze/model_15000/config.json +19 -0
  33. first_attention_2_attention_unfreeze/model_15000/training_state.json +8 -0
  34. first_attention_2_attention_unfreeze/model_2000/config.json +19 -0
  35. first_attention_2_attention_unfreeze/model_2000/training_state.json +8 -0
  36. first_attention_2_attention_unfreeze/model_3000/config.json +19 -0
  37. first_attention_2_attention_unfreeze/model_3000/training_state.json +8 -0
  38. first_attention_2_attention_unfreeze/model_4000/config.json +19 -0
  39. first_attention_2_attention_unfreeze/model_4000/training_state.json +8 -0
  40. first_attention_2_attention_unfreeze/model_5000/config.json +19 -0
  41. first_attention_2_attention_unfreeze/model_5000/training_state.json +8 -0
  42. first_attention_2_attention_unfreeze/model_6000/config.json +19 -0
  43. first_attention_2_attention_unfreeze/model_6000/training_state.json +8 -0
  44. first_attention_2_attention_unfreeze/model_7000/config.json +19 -0
  45. first_attention_2_attention_unfreeze/model_7000/training_state.json +8 -0
  46. first_attention_2_attention_unfreeze/model_8000/config.json +19 -0
  47. first_attention_2_attention_unfreeze/model_8000/training_state.json +8 -0
  48. first_attention_2_attention_unfreeze/model_9000/config.json +19 -0
  49. first_attention_2_attention_unfreeze/model_9000/training_state.json +8 -0
  50. first_attention_2_attention_unfreeze/training_config.yaml +47 -0
baseline/final_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a68abbc43af60bd4c7f2d95c72f10740acbbb95df43e3fd9ba6ae48d8c02ccc
+ size 2533545094
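The pytorch_model.bin entry above is only a Git LFS pointer (spec version, SHA-256 oid, byte size); the actual ~2.5 GB weight file lives in LFS storage. As a minimal sketch, assuming the repository is served through the Hugging Face Hub (the repo id below is a placeholder, not taken from this commit), the weights could be fetched and inspected like this:

# Sketch only: resolve the LFS pointer into the real checkpoint file.
# The repo_id is hypothetical; substitute the actual repository name.
from huggingface_hub import hf_hub_download
import torch

path = hf_hub_download(
    repo_id="<user>/<repo>",  # placeholder repo id
    filename="baseline/final_model/pytorch_model.bin",
)
state_dict = torch.load(path, map_location="cpu")  # ~2.5 GB of tensors
print(len(state_dict), "tensors loaded")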
bigram_2_full/final_model/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
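Every config.json / model_config.json in this commit is identical: a 14-layer LLaMA-style decoder with hidden size 1536, 24 attention heads, SwiGLU intermediate size 5376, and the GPT-2 vocabulary (50257 tokens, BOS/EOS 50256). As a sketch of what that implies, assuming the keys map onto the standard transformers LlamaConfig (the "LLaMAForCausalLM" spelling comes from the transformers 4.28.1 era noted in the file), one can instantiate an equivalent config and count parameters:

# Rough sketch, assuming the config maps onto transformers' LlamaConfig.
from transformers import LlamaConfig, LlamaForCausalLM

cfg = LlamaConfig(
    vocab_size=50257,
    hidden_size=1536,
    intermediate_size=5376,
    num_hidden_layers=14,
    num_attention_heads=24,
    max_position_embeddings=1024,
    rms_norm_eps=1e-05,
    bos_token_id=50256,
    eos_token_id=50256,
)
model = LlamaForCausalLM(cfg)
total = sum(p.numel() for p in model.parameters())
non_emb = total - model.model.embed_tokens.weight.numel() - model.lm_head.weight.numel()
print(f"total: {total/1e6:.0f}M, non-embedding: {non_emb/1e6:.0f}M")  # roughly 633M / 479M

The roughly 479M non-embedding parameters line up with the model_config/478m.json referenced in the training configs below, and ~633M total parameters at fp32 is consistent with the 2,533,545,094-byte pytorch_model.bin above.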
bigram_2_full/model_1000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 1000,
+ "update_step": 1000,
+ "tokens_seen": 639025152,
+ "tokens_seen_before": 638386176,
+ "update_time": 3.281445264816284,
+ "wandb_id": "7nopmkvs"
+ }
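The training_state.json files record the step counters, token counts, per-update wall-clock time, and the wandb run id (7nopmkvs for the bigram_2_full run). One internal consistency check, as a minimal sketch: the per-update token increment equals total_batch_size × max_length from the training_config.yaml further below, and the same delta appears at every checkpoint.

# Quick check that the checkpoint counters are self-consistent:
# tokens_seen - tokens_seen_before == total_batch_size * max_length.
state = {"tokens_seen": 639025152, "tokens_seen_before": 638386176}
per_update = state["tokens_seen"] - state["tokens_seen_before"]
assert per_update == 624 * 1024 == 638976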
bigram_2_full/model_11000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_11000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 11000,
+ "update_step": 11000,
+ "tokens_seen": 7028785152,
+ "tokens_seen_before": 7028146176,
+ "update_time": 3.2798073291778564,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_13000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_13000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 13000,
+ "update_step": 13000,
+ "tokens_seen": 8306737152,
+ "tokens_seen_before": 8306098176,
+ "update_time": 3.2799124717712402,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_14000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 14000,
+ "update_step": 14000,
+ "tokens_seen": 8945713152,
+ "tokens_seen_before": 8945074176,
+ "update_time": 3.2842938899993896,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_2000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_2000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 2000,
+ "update_step": 2000,
+ "tokens_seen": 1278001152,
+ "tokens_seen_before": 1277362176,
+ "update_time": 3.285855293273926,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_3000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_3000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 3000,
+ "update_step": 3000,
+ "tokens_seen": 1916977152,
+ "tokens_seen_before": 1916338176,
+ "update_time": 3.2819085121154785,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_4000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_4000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 4000,
+ "update_step": 4000,
+ "tokens_seen": 2555953152,
+ "tokens_seen_before": 2555314176,
+ "update_time": 3.283513307571411,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_5000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_5000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 5000,
+ "update_step": 5000,
+ "tokens_seen": 3194929152,
+ "tokens_seen_before": 3194290176,
+ "update_time": 3.27878475189209,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_9000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_9000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 9000,
+ "update_step": 9000,
+ "tokens_seen": 5750833152,
+ "tokens_seen_before": 5750194176,
+ "update_time": 3.2838311195373535,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/training_config.yaml ADDED
@@ -0,0 +1,44 @@
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adjust_step: 0
+ autoresume: false
+ batch_size: 6
+ clip_grad_norm: 1.0
+ comment: null
+ cycle_length: null
+ dtype: bfloat16
+ emb_freeze: null
+ eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+ eval_every: 1000
+ gradient_accumulation: 13
+ keep_checkpoints: null
+ layer_freeze: null
+ load_optimizer_state_on_resume: true
+ lr: 0.0004
+ max_length: 1024
+ max_train_tokens: null
+ min_lr_ratio: 0.1
+ model_config: model_config/478m.json
+ model_name_or_path: null
+ model_revision: null
+ num_training_steps: 15000
+ optimizer: Adam
+ restart_warmup_steps: null
+ resume_from: null
+ run_name: earnest-sky-22
+ save_dir: checkpoints/earnest-sky-22
+ save_every: 1000
+ scheduler: cosine
+ seed: 0
+ shuffle: true
+ skip_batches: !!set {}
+ tags:
+ - 396m-for-680m
+ total_batch_size: 624
+ train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+ training_config: training_config/two_stage/478m_resume_unfreeze.yaml
+ wandb_watch: true
+ warmed_up_model: /lee_embedding/checkpoints/tough-snowflake-18/final_model/
+ warmup_steps: 1500
+ weight_decay: 0.0
+ workers: 8
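This is the full training configuration for the bigram_2_full run (wandb run earnest-sky-22): the 478M model config, Adam with a cosine schedule and 1,500 warmup steps, bf16, 15,000 updates, resumed from the warmed-up tough-snowflake-18 checkpoint. A few derived quantities, as a sketch (the data-parallel world size is not stored in the file and is inferred here from the batch-size arithmetic):

# Derived quantities from the YAML above; world_size is an inference, not a config key.
batch_size, grad_accum, total_batch_size = 6, 13, 624
max_length, num_steps = 1024, 15000

world_size = total_batch_size // (batch_size * grad_accum)  # -> 8 (inferred)
tokens_per_update = total_batch_size * max_length           # 638,976
total_tokens = tokens_per_update * num_steps                 # ~9.58B, close to tokens_seen at step 15000
print(world_size, tokens_per_update, f"{total_tokens/1e9:.2f}B")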
first_attention_2_attention_unfreeze/final_model/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_1000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_1000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 1000,
+ "update_step": 1000,
+ "tokens_seen": 639025152,
+ "tokens_seen_before": 638386176,
+ "update_time": 2.9259252548217773,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_10000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_10000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 10000,
+ "update_step": 10000,
+ "tokens_seen": 6389809152,
+ "tokens_seen_before": 6389170176,
+ "update_time": 2.925158977508545,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_11000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_11000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 11000,
+ "update_step": 11000,
+ "tokens_seen": 7028785152,
+ "tokens_seen_before": 7028146176,
+ "update_time": 2.925471067428589,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_12000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_12000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 12000,
+ "update_step": 12000,
+ "tokens_seen": 7667761152,
+ "tokens_seen_before": 7667122176,
+ "update_time": 2.9245717525482178,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_13000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 13000,
+ "update_step": 13000,
+ "tokens_seen": 8306737152,
+ "tokens_seen_before": 8306098176,
+ "update_time": 2.9278290271759033,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_14000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_14000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 14000,
+ "update_step": 14000,
+ "tokens_seen": 8945713152,
+ "tokens_seen_before": 8945074176,
+ "update_time": 2.924705743789673,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_15000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_15000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 15000,
+ "update_step": 15000,
+ "tokens_seen": 9584689152,
+ "tokens_seen_before": 9584050176,
+ "update_time": 2.924987554550171,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_2000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_2000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 2000,
+ "update_step": 2000,
+ "tokens_seen": 1278001152,
+ "tokens_seen_before": 1277362176,
+ "update_time": 2.923201322555542,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_3000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_3000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 3000,
+ "update_step": 3000,
+ "tokens_seen": 1916977152,
+ "tokens_seen_before": 1916338176,
+ "update_time": 2.9246528148651123,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_4000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_4000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 4000,
+ "update_step": 4000,
+ "tokens_seen": 2555953152,
+ "tokens_seen_before": 2555314176,
+ "update_time": 2.9246957302093506,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_5000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_5000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 5000,
+ "update_step": 5000,
+ "tokens_seen": 3194929152,
+ "tokens_seen_before": 3194290176,
+ "update_time": 2.9258697032928467,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_6000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_6000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 6000,
+ "update_step": 6000,
+ "tokens_seen": 3833905152,
+ "tokens_seen_before": 3833266176,
+ "update_time": 2.92516827583313,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_7000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_7000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 7000,
+ "update_step": 7000,
+ "tokens_seen": 4472881152,
+ "tokens_seen_before": 4472242176,
+ "update_time": 2.925605058670044,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_8000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_8000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 8000,
+ "update_step": 8000,
+ "tokens_seen": 5111857152,
+ "tokens_seen_before": 5111218176,
+ "update_time": 2.924616575241089,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_9000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_9000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 9000,
+ "update_step": 9000,
+ "tokens_seen": 5750833152,
+ "tokens_seen_before": 5750194176,
+ "update_time": 2.9271013736724854,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/training_config.yaml ADDED
@@ -0,0 +1,47 @@
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adjust_step: 0
+ autoresume: false
+ batch_size: 6
+ clip_grad_norm: 1.0
+ comment: null
+ cycle_length: null
+ dtype: bfloat16
+ emb_freeze: null
+ eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+ eval_every: 1000
+ first_attention: false
+ first_attention_resume: true
+ gradient_accumulation: 13
+ keep_checkpoints: null
+ layer_freeze: null
+ layer_freeze_2: false
+ load_optimizer_state_on_resume: true
+ lr: 0.0004
+ max_length: 1024
+ max_train_tokens: null
+ min_lr_ratio: 0.1
+ model_config: model_config/478m.json
+ model_name_or_path: null
+ model_revision: null
+ num_training_steps: 15000
+ optimizer: Adam
+ restart_warmup_steps: null
+ resume_from: null
+ run_name: first_attention_resume_unfreeze
+ save_dir: checkpoints/first_attention_resume_unfreeze
+ save_every: 1000
+ scheduler: cosine
+ seed: 0
+ shuffle: true
+ skip_batches: !!set {}
+ tags:
+ - 396m-for-680m
+ total_batch_size: 624
+ train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+ training_config: training_config/two_stage/478m_first_attention_resume_unfreeze.yaml
+ wandb_watch: true
+ warmed_up_model: null
+ warmup_steps: 1500
+ weight_decay: 0.0
+ workers: 8
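For comparison, this second config (wandb run 0ab5p6ah) is nearly identical to the bigram_2_full one: same model config, optimizer, schedule, data paths, and batch arithmetic. The visible differences are the added first_attention, first_attention_resume, and layer_freeze_2 flags, the run name and save dir, the referenced training_config yaml, and warmed_up_model being null instead of the tough-snowflake-18 checkpoint. A small sketch to confirm that key by key, assuming both files are available locally at the paths used in this repo:

# Sketch: compare the two training configs to list what actually differs.
import yaml

with open("bigram_2_full/training_config.yaml") as f:
    a = yaml.safe_load(f)
with open("first_attention_2_attention_unfreeze/training_config.yaml") as f:
    b = yaml.safe_load(f)

for key in sorted(set(a) | set(b)):
    if a.get(key) != b.get(key):
        print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")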