diff --git a/attention_2_only_emb/final_model/config.json b/attention_2_only_emb/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_1000/config.json b/attention_2_only_emb/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_1000/training_state.json b/attention_2_only_emb/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4968bc890f917027c3449c8be8988b88958e61cc --- /dev/null +++ b/attention_2_only_emb/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 2.876735210418701, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_10000/config.json b/attention_2_only_emb/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_10000/training_state.json b/attention_2_only_emb/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..851522768fd0eb83a1b1c94529e163ec16ead906 --- /dev/null +++ b/attention_2_only_emb/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 2.879143476486206, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_11000/config.json b/attention_2_only_emb/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_11000/training_state.json b/attention_2_only_emb/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f79c0a8ba6c175edc2f074a8e6d675b2e00f2c0a --- /dev/null +++ b/attention_2_only_emb/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 2.8783798217773438, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_12000/config.json b/attention_2_only_emb/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_12000/training_state.json b/attention_2_only_emb/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bfcec46ae5b67b045073545c74c34959ab82c4ef --- /dev/null +++ b/attention_2_only_emb/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 2.8790109157562256, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_13000/config.json b/attention_2_only_emb/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_13000/training_state.json b/attention_2_only_emb/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..55ac5fa9a99428439eea43e698ed3426d8fc5725 --- /dev/null +++ b/attention_2_only_emb/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.877460241317749, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_14000/config.json b/attention_2_only_emb/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_14000/training_state.json b/attention_2_only_emb/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..09b75fcf6108fdf3856baa8212751bc6cac2da7c --- /dev/null +++ b/attention_2_only_emb/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 2.8788270950317383, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_15000/config.json b/attention_2_only_emb/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_15000/training_state.json b/attention_2_only_emb/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e31991ba60daf7dcaa65e2bc18416e1cb870d43 --- /dev/null +++ b/attention_2_only_emb/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 2.877842426300049, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_2000/config.json b/attention_2_only_emb/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_2000/training_state.json b/attention_2_only_emb/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe192ea66074e27e208cf0e2df3198146ed7a01 --- /dev/null +++ b/attention_2_only_emb/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.8781590461730957, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_3000/config.json b/attention_2_only_emb/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_3000/training_state.json b/attention_2_only_emb/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e7a1a864014f317b03ed99223a775b597b40882e --- /dev/null +++ b/attention_2_only_emb/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.876859426498413, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_4000/config.json b/attention_2_only_emb/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_4000/training_state.json b/attention_2_only_emb/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ec8749d8631abc684e84fda3b2cafef2cac46714 --- /dev/null +++ b/attention_2_only_emb/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.8781702518463135, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_5000/config.json b/attention_2_only_emb/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_5000/training_state.json b/attention_2_only_emb/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..adcdd3253138058a81ecd5cf61e8f1e203d9a5c8 --- /dev/null +++ b/attention_2_only_emb/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 2.8779265880584717, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_6000/config.json b/attention_2_only_emb/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_6000/training_state.json b/attention_2_only_emb/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d7abfeea0a7ee87b4247f9b1f01649e14cbe36d0 --- /dev/null +++ b/attention_2_only_emb/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 2.880322217941284, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_7000/config.json b/attention_2_only_emb/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_7000/training_state.json b/attention_2_only_emb/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f5fb2190bef586bc9917c90a1390a4d4018a0d63 --- /dev/null +++ b/attention_2_only_emb/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 2.8802781105041504, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_8000/config.json b/attention_2_only_emb/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_8000/training_state.json b/attention_2_only_emb/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8a1a06d40b7cba985d5fe281cd43975b4ed20955 --- /dev/null +++ b/attention_2_only_emb/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 2.8798861503601074, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/model_9000/config.json b/attention_2_only_emb/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/attention_2_only_emb/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/attention_2_only_emb/model_9000/training_state.json b/attention_2_only_emb/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a9293469406b8c7d1b262b58e4f534f10f1a235 --- /dev/null +++ b/attention_2_only_emb/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 2.881277322769165, + "wandb_id": "sgqffduo" +} \ No newline at end of file diff --git a/attention_2_only_emb/training_config.yaml b/attention_2_only_emb/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54d5526c2d5c90c21388341678c60cc1661f095d --- /dev/null +++ b/attention_2_only_emb/training_config.yaml @@ -0,0 +1,50 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_2: true +first_attention_resume: false +first_layer: false +first_layer_2: false +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: attention_2_only_emb +save_dir: checkpoints/attention_2_only_emb +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/attention_2_only_emb.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/baseline/model_15000/pytorch_model.bin b/baseline/model_15000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..30e9e514722f3081c6e6f733260a5897aac57f5b --- /dev/null +++ b/baseline/model_15000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a68abbc43af60bd4c7f2d95c72f10740acbbb95df43e3fd9ba6ae48d8c02ccc +size 2533545094 diff --git a/bigram_2_full/model_1000/model_config.json b/bigram_2_full/model_1000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_1000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_10000/model_config.json b/bigram_2_full/model_10000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_10000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_10000/training_state.json b/bigram_2_full/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d2fd369bda7a8cc70c850a25157734867f00a787 --- /dev/null +++ b/bigram_2_full/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 3.281773805618286, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_12000/model_config.json b/bigram_2_full/model_12000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_12000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_12000/training_state.json b/bigram_2_full/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5eea0cd4a39dedde2c1981aafcee656e3ab9abdf --- /dev/null +++ b/bigram_2_full/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 3.285931348800659, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_14000/model_config.json b/bigram_2_full/model_14000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_14000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_15000/model_config.json b/bigram_2_full/model_15000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_15000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_15000/training_state.json b/bigram_2_full/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ca150bacea06751d13b78770a74962f45fe1058b --- /dev/null +++ b/bigram_2_full/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 3.2829833030700684, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_6000/model_config.json b/bigram_2_full/model_6000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_6000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_6000/training_state.json b/bigram_2_full/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5d37b62625cd716ce446fdc930db0c36bc2e054e --- /dev/null +++ b/bigram_2_full/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 3.283099412918091, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_7000/model_config.json b/bigram_2_full/model_7000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_7000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_7000/training_state.json b/bigram_2_full/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ac56bd886be103cb6f8fcda4388fe009d9fa3191 --- /dev/null +++ b/bigram_2_full/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 3.2802579402923584, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_8000/model_config.json b/bigram_2_full/model_8000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_8000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_8000/training_state.json b/bigram_2_full/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d8392b58ed0005966732abdfe6bf54bac6a7d9e6 --- /dev/null +++ b/bigram_2_full/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 3.285356044769287, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/first_layer_1/final_model/config.json b/first_layer_1/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_1000/config.json b/first_layer_1/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_1000/training_state.json b/first_layer_1/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..11275d6596c8de3be42bdf5f657bc9ee1c62d5b3 --- /dev/null +++ b/first_layer_1/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 0.9770157337188721, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_10000/config.json b/first_layer_1/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_10000/training_state.json b/first_layer_1/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a0b2e5171a425ac84ff34770c23d443f92f971ef --- /dev/null +++ b/first_layer_1/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 0.9783554077148438, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_11000/config.json b/first_layer_1/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_11000/training_state.json b/first_layer_1/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..28cd0d9077af40cca2aef3d0a0329e7adac1a4ae --- /dev/null +++ b/first_layer_1/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 0.9776339530944824, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_13000/config.json b/first_layer_1/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_13000/training_state.json b/first_layer_1/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf8012db5bbc9e18bbf1cfb25f421bd6032ff376 --- /dev/null +++ b/first_layer_1/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 0.9766736030578613, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_14000/config.json b/first_layer_1/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_14000/training_state.json b/first_layer_1/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1d77c837d1303f3521e8dfe9d04b76b37a4b99af --- /dev/null +++ b/first_layer_1/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 0.9781618118286133, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_15000/config.json b/first_layer_1/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_15000/training_state.json b/first_layer_1/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c4650593160ea03a1d50720d7132e96c71808133 --- /dev/null +++ b/first_layer_1/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 0.9759185314178467, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_2000/config.json b/first_layer_1/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_2000/training_state.json b/first_layer_1/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e6eeb6461c316d67f4dd583de3067ceebb34dd43 --- /dev/null +++ b/first_layer_1/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 0.9753463268280029, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_3000/config.json b/first_layer_1/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_3000/training_state.json b/first_layer_1/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4d801afc9d944134d18a9656dbb57deaebaf2fec --- /dev/null +++ b/first_layer_1/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 0.976813793182373, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_4000/config.json b/first_layer_1/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_4000/training_state.json b/first_layer_1/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..09d7f382c7351082a87f88146ffb260b13bcddc2 --- /dev/null +++ b/first_layer_1/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 0.9765069484710693, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_5000/config.json b/first_layer_1/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_5000/training_state.json b/first_layer_1/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b32278240a355060f3fb2081bec19e9932c7d8a5 --- /dev/null +++ b/first_layer_1/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 0.9771957397460938, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_6000/config.json b/first_layer_1/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_7000/config.json b/first_layer_1/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_7000/training_state.json b/first_layer_1/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db67cc277175ea218bd348ef646b38aeee2306c9 --- /dev/null +++ b/first_layer_1/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 0.977327823638916, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_8000/config.json b/first_layer_1/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_8000/training_state.json b/first_layer_1/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..deaa585d3afcecca1b3ef7ebb37e629caf1f41cf --- /dev/null +++ b/first_layer_1/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 0.9763047695159912, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/model_9000/config.json b/first_layer_1/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_1/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_1/model_9000/training_state.json b/first_layer_1/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2fb08568c1c53d56bc8655960cc24953725fb1b4 --- /dev/null +++ b/first_layer_1/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 0.9778523445129395, + "wandb_id": "krzb2185" +} \ No newline at end of file diff --git a/first_layer_1/training_config.yaml b/first_layer_1/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cceafd684862ac9adb7abcb2124608f71e786ac7 --- /dev/null +++ b/first_layer_1/training_config.yaml @@ -0,0 +1,48 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: false +first_layer: true +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: first_layer +save_dir: checkpoints/first_layer +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_layer.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/silver-butterfly-62/final_model/pytorch_model.bin b/silver-butterfly-62/final_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7b090988229bdea3b90594978f33e452f087e902 --- /dev/null +++ b/silver-butterfly-62/final_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8485594c631a3ee014df0c45899a8d3c23d104cef2f7384a92e22ad63f6c868 +size 617566142 diff --git a/silver-butterfly-62/model_1000/pytorch_model.bin b/silver-butterfly-62/model_1000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..495bc202ef93640618a2a9e2e11a6b640154ea51 --- /dev/null +++ b/silver-butterfly-62/model_1000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976f82c7b37546cd33e722337601e8d26d305fca029c1be52e0e37f564a67cf4 +size 617566142 diff --git a/silver-butterfly-62/model_10000/pytorch_model.bin b/silver-butterfly-62/model_10000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..55b4e12507f70b964d41f826fcb0ba789636c03d --- /dev/null +++ b/silver-butterfly-62/model_10000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac27be594fed9a25d80579a206847254310898b702d740449f90ac88883ce5dd +size 617566142 diff --git a/silver-butterfly-62/model_11000/pytorch_model.bin b/silver-butterfly-62/model_11000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..bb0b4967ba4599e104ae0aedd765b40299389b7f --- /dev/null +++ b/silver-butterfly-62/model_11000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6baebc89ef2e5adae52b93350bb3d6606c4a2c528a86dd998816d37746d65aff +size 617566142 diff --git a/silver-butterfly-62/model_12000/pytorch_model.bin b/silver-butterfly-62/model_12000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f6f7f42535146b74bd9eb3c30469c85218837f5 --- /dev/null +++ b/silver-butterfly-62/model_12000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c83bf5fa903b607b27896bb0d9119d9451edc3326f74d2f512af708e9894f1 +size 617566142 diff --git a/silver-butterfly-62/model_13000/pytorch_model.bin b/silver-butterfly-62/model_13000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..afaf06ad302ca9a48e15e033d83486b99e4e9456 --- /dev/null +++ b/silver-butterfly-62/model_13000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3cb5c14fea6f14f593acb5e9194265d33430363f4c7d9bc4b32d0830ba89a52 +size 617566142 diff --git a/silver-butterfly-62/model_14000/pytorch_model.bin b/silver-butterfly-62/model_14000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4652f7efcd9b3a9afaee67d8ed8a5c132e1971e9 --- /dev/null +++ b/silver-butterfly-62/model_14000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b66b9275a4e25604ae4295e710ac1e176c9add9fc151c3076b0d9b7ede7438a9 +size 617566142 diff --git a/silver-butterfly-62/model_15000/pytorch_model.bin b/silver-butterfly-62/model_15000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7b090988229bdea3b90594978f33e452f087e902 --- /dev/null +++ b/silver-butterfly-62/model_15000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8485594c631a3ee014df0c45899a8d3c23d104cef2f7384a92e22ad63f6c868 +size 617566142 diff --git a/silver-butterfly-62/model_2000/pytorch_model.bin b/silver-butterfly-62/model_2000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed5b3de69c7cbbe7f1ec4f8353d9a6f360407f6a --- /dev/null +++ b/silver-butterfly-62/model_2000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce4f49bc6c201abe738e7e3513b550900ad735c147e2e6a5df0aebf14a229696 +size 617566142 diff --git a/silver-butterfly-62/model_3000/pytorch_model.bin b/silver-butterfly-62/model_3000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ace4cc0cfe52f9249b5840a25e2a7c730a8d6c22 --- /dev/null +++ b/silver-butterfly-62/model_3000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:960bb96bf458bf9d656b8dfdebfa19e84902e798dca76355a565b68f88ceb7aa +size 617566142 diff --git a/silver-butterfly-62/model_4000/pytorch_model.bin b/silver-butterfly-62/model_4000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d7140617aaaf8fa607f46be127edac3b8a5e06de --- /dev/null +++ b/silver-butterfly-62/model_4000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed93b5c6b2769efde7409a187bbee4997e37357f2cc84aa79792f37d4e89b80 +size 617566142 diff --git a/silver-butterfly-62/model_5000/pytorch_model.bin b/silver-butterfly-62/model_5000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..666e2437f106a31767251fd23370edcb70ebc85a --- /dev/null +++ b/silver-butterfly-62/model_5000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1258388330c8682885b37017cf26ddee6a2d6054d0f5705d008af599b378b404 +size 617566142 diff --git a/silver-butterfly-62/model_6000/pytorch_model.bin b/silver-butterfly-62/model_6000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f91a28251a57e3ec3ab165863c3e82c552729c71 --- /dev/null +++ b/silver-butterfly-62/model_6000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8b2bcae1219ce88896d85e0c4184ae2d3ab18be7f202dc73ccf2a77f601fcd +size 617566142 diff --git a/silver-butterfly-62/model_7000/pytorch_model.bin b/silver-butterfly-62/model_7000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b9cc1081db164228e7e14b009a7542175789677f --- /dev/null +++ b/silver-butterfly-62/model_7000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ced1ccc56987ab1a159ca096bc6f745f725969f99236e0672ae0959d025681 +size 617566142 diff --git a/silver-butterfly-62/model_8000/pytorch_model.bin b/silver-butterfly-62/model_8000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d23bb027a45b8e7bd045ef1398f54e30f3cc6e97 --- /dev/null +++ b/silver-butterfly-62/model_8000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1c1ea75427ccb84bbab74b67c385a8e9c88adbfd3618119f44e857e4eaf90fb +size 617566142 diff --git a/silver-butterfly-62/model_9000/pytorch_model.bin b/silver-butterfly-62/model_9000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7520ab3294edf7c7da63a97efa397314e772c6e3 --- /dev/null +++ b/silver-butterfly-62/model_9000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0837267b9b900c5f5bd1da70bec41982b10b288c30c3535cd471cfc6ec4477 +size 617566142