diff --git a/baseline/final_model/config.json b/baseline/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_1000/model_config.json b/baseline/model_1000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_1000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_1000/training_state.json b/baseline/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..138106d55acf538e96415bd6632ba7e213c5fb4b --- /dev/null +++ b/baseline/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 3.298034429550171, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_10000/model_config.json b/baseline/model_10000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_10000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_10000/training_state.json b/baseline/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b2f8250701ba744aca09cac85a1d30cf6a42701a --- /dev/null +++ b/baseline/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 3.303941011428833, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_11000/model_config.json b/baseline/model_11000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_11000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + 
"hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_11000/training_state.json b/baseline/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e61adc385e729e253c4ec979cd245aeb48cfca23 --- /dev/null +++ b/baseline/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 3.299642324447632, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_12000/model_config.json b/baseline/model_12000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_12000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_12000/training_state.json b/baseline/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8dba30313d3ccc460b934622a480cdd6d77a30a0 --- /dev/null +++ b/baseline/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 3.298994541168213, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_13000/model_config.json b/baseline/model_13000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_13000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_13000/training_state.json b/baseline/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6cbf973c744914f8e6abef5675f320afe25be225 --- /dev/null +++ b/baseline/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 3.3014893531799316, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_14000/model_config.json b/baseline/model_14000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_14000/model_config.json @@ -0,0 +1,19 @@ +{ + 
"architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_14000/training_state.json b/baseline/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7be156d5b4ed73da022e90ce9ce8cd40573ea983 --- /dev/null +++ b/baseline/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 3.3026955127716064, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_15000/model_config.json b/baseline/model_15000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_15000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_15000/training_state.json b/baseline/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c92df6a2e59b4ff0951a29f43df71d7e234d6e8e --- /dev/null +++ b/baseline/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 3.3024754524230957, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_2000/model_config.json b/baseline/model_2000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_2000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_2000/training_state.json b/baseline/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dfce188d3dc39e9a1ab6a2290ea1e2f95c2d8133 --- /dev/null +++ b/baseline/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 3.2954349517822266, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_3000/model_config.json b/baseline/model_3000/model_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_3000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_3000/training_state.json b/baseline/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26921ee4fa5670cf6ab5512244f95289e66de8df --- /dev/null +++ b/baseline/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 3.3039631843566895, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_4000/model_config.json b/baseline/model_4000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_4000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_4000/training_state.json b/baseline/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c88d4af47e991cba4e2230b4c27aea9c30fa079e --- /dev/null +++ b/baseline/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 3.2994651794433594, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_5000/model_config.json b/baseline/model_5000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_5000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_5000/training_state.json b/baseline/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9156432030e946479eb266aaf0d09742a84e23a5 --- /dev/null +++ b/baseline/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 3.3040542602539062, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git 
a/baseline/model_6000/model_config.json b/baseline/model_6000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_6000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_6000/training_state.json b/baseline/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3515147a0eeda37b4e33731033928c788be96037 --- /dev/null +++ b/baseline/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 3.2993881702423096, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_7000/model_config.json b/baseline/model_7000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_7000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_7000/training_state.json b/baseline/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..175fc34d80f0bcac705fd2b3a86b5c33aeaa8f34 --- /dev/null +++ b/baseline/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 3.297553777694702, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_8000/model_config.json b/baseline/model_8000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_8000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_8000/training_state.json b/baseline/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1e943f5907287dd338d70744a70beff1618d958 --- /dev/null +++ b/baseline/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + 
"update_time": 3.300821542739868, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/model_9000/model_config.json b/baseline/model_9000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/baseline/model_9000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/baseline/model_9000/training_state.json b/baseline/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..643b94d4f162b12e80d61a9471ba9a12b2ecfede --- /dev/null +++ b/baseline/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 3.299265146255493, + "wandb_id": "tytetz1i" +} \ No newline at end of file diff --git a/baseline/training_config.yaml b/baseline/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f37da42f1ee1a67cb00d5e4809bbbe0c2f187a7 --- /dev/null +++ b/baseline/training_config.yaml @@ -0,0 +1,42 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +gradient_accumulation: 13 +keep_checkpoints: null +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: dark-moon-14 +save_dir: checkpoints/dark-moon-14 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/478m.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_attention_2_attention_unfreeze/model_13000/config.json b/first_attention_2_attention_unfreeze/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_resume_loss_convergence/model_3000/config.json b/first_attention_resume_loss_convergence/model_3000/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_resume_loss_convergence/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_resume_loss_convergence/model_3000/training_state.json b/first_attention_resume_loss_convergence/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cc38513332779e8f8b98cb5319689caaf80caed6 --- /dev/null +++ b/first_attention_resume_loss_convergence/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.884432792663574, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_resume_loss_convergence/training_config.yaml b/first_attention_resume_loss_convergence/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..944bdaee0bee527c67e31b47c4d94458c20072d3 --- /dev/null +++ b/first_attention_resume_loss_convergence/training_config.yaml @@ -0,0 +1,47 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: false +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: /lee_embedding/checkpoints/first_attention_resume_loss_convergence/model_3000 +run_name: first_attention_resume_loss_convergence +save_dir: checkpoints/first_attention_resume_loss_convergence +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_attention_resume_loss_convergence.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_layer_2/final_model/config.json b/first_layer_2/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file 
diff --git a/first_layer_2/model_1000/config.json b/first_layer_2/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_11000/config.json b/first_layer_2/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_11000/training_state.json b/first_layer_2/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f28b4e34e01970d976a1a6917e7050d00649928b --- /dev/null +++ b/first_layer_2/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 2.8764145374298096, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_12000/config.json b/first_layer_2/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_13000/config.json b/first_layer_2/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_13000/training_state.json b/first_layer_2/model_13000/training_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..4358581775ec44c9f20a155fa90219843a7260a3 --- /dev/null +++ b/first_layer_2/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.8782799243927, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_14000/config.json b/first_layer_2/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_14000/training_state.json b/first_layer_2/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74efa2244756074633416529a593c4573be6db80 --- /dev/null +++ b/first_layer_2/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 2.8797836303710938, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_15000/training_state.json b/first_layer_2/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..16cc46e46e8f99a0dc9de3a567f89ab04437278b --- /dev/null +++ b/first_layer_2/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 2.877997398376465, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_2000/config.json b/first_layer_2/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_2000/training_state.json b/first_layer_2/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..880a739301aa373b27552249a60f55046e1af81a --- /dev/null +++ b/first_layer_2/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.8755557537078857, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_3000/config.json b/first_layer_2/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null 
+++ b/first_layer_2/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_3000/training_state.json b/first_layer_2/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..60cb2b0f618ee10c5dff945d549aed23137bb4cf --- /dev/null +++ b/first_layer_2/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.87614107131958, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_4000/config.json b/first_layer_2/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_4000/training_state.json b/first_layer_2/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..87bcbbceed285f053710d03705e26f99d411b7b3 --- /dev/null +++ b/first_layer_2/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.8790698051452637, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_5000/config.json b/first_layer_2/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_5000/training_state.json b/first_layer_2/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd330ac79c0dc6005d01c78c4fe74c502136d37f --- /dev/null +++ b/first_layer_2/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 2.8782315254211426, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_7000/config.json 
b/first_layer_2/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_7000/training_state.json b/first_layer_2/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a6c1367beb4f95ebd7e58acb07ecc5b682ec3cd --- /dev/null +++ b/first_layer_2/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 2.877455711364746, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_8000/config.json b/first_layer_2/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_8000/training_state.json b/first_layer_2/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ac6b254b440dc75446593d1be2d3bc7af52a696a --- /dev/null +++ b/first_layer_2/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 2.87611722946167, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_9000/training_state.json b/first_layer_2/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97b2b9f46374b2d2ea0597612becf8aebf777d52 --- /dev/null +++ b/first_layer_2/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 2.876600980758667, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/training_config.yaml b/first_layer_2/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09e9aa6b2a3bc6561252b2d4ea5f80affc93b605 --- /dev/null +++ b/first_layer_2/training_config.yaml @@ -0,0 +1,49 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: false +first_layer: false +first_layer_2: true +gradient_accumulation: 13 
+keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: first_layer_2 +save_dir: checkpoints/first_layer_2 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_layer_2.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/silver-butterfly-62/model_1000/training_state.json b/silver-butterfly-62/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..888b6048517728f85ec1825439e59e47dc5c6ef4 --- /dev/null +++ b/silver-butterfly-62/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 0.769603967666626, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_10000/config.json b/silver-butterfly-62/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_12000/config.json b/silver-butterfly-62/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_13000/config.json b/silver-butterfly-62/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git 
a/silver-butterfly-62/model_14000/config.json b/silver-butterfly-62/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_15000/config.json b/silver-butterfly-62/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_2000/training_state.json b/silver-butterfly-62/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a1077f9f7120956449b4da6ffdf22b39c9eafba8 --- /dev/null +++ b/silver-butterfly-62/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 0.7704322338104248, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_3000/training_state.json b/silver-butterfly-62/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad91ec1a3849f4f192e9e22732131f18c6c0a27a --- /dev/null +++ b/silver-butterfly-62/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 0.7697722911834717, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_4000/config.json b/silver-butterfly-62/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_6000/config.json b/silver-butterfly-62/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_6000/config.json @@ -0,0 +1,19 
@@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_7000/config.json b/silver-butterfly-62/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_7000/training_state.json b/silver-butterfly-62/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..34c5ccc3ded17bb52101ba76371e206e206974b7 --- /dev/null +++ b/silver-butterfly-62/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 0.7704808712005615, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_8000/config.json b/silver-butterfly-62/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/training_config.yaml b/silver-butterfly-62/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f898db487f8a99da8b811593e9c9e08c9640427 --- /dev/null +++ b/silver-butterfly-62/training_config.yaml @@ -0,0 +1,45 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: null +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: silver-butterfly-62 +save_dir: checkpoints/silver-butterfly-62 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: 
+- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/478m.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8
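
All of the config.json / model_config.json files added above are byte-identical (they share the blob hash 035e41b6...), describing a 14-layer, 24-head, hidden-size-1536 LLaMA-style model with a 50,257-token (GPT-2-sized) vocabulary; the checkpoints differ only in their training_state.json counters and in the run-level training_config.yaml. A minimal sanity-check sketch in Python follows (the helper name and relative paths are illustrative and assume the repository layout shown in this diff): it verifies that tokens_seen advances by total_batch_size × max_length = 624 × 1024 = 638,976 tokens per update step, which matches the per-step difference recorded in every training_state.json here.

```python
# Sanity-check sketch for the checkpoint counters added in this diff.
# Assumes the repository layout shown above; only the standard library is used.
import json

TOTAL_BATCH_SIZE = 624   # total_batch_size in training_config.yaml
MAX_LENGTH = 1024        # max_length in training_config.yaml
TOKENS_PER_UPDATE = TOTAL_BATCH_SIZE * MAX_LENGTH  # 638,976 tokens per optimizer update

def check_checkpoint(path: str) -> None:
    """Compare a training_state.json against the expected token accounting."""
    with open(path) as f:
        state = json.load(f)
    step = state["update_step"]
    # Each saved state is exactly one update ahead of tokens_seen_before.
    assert state["tokens_seen"] - state["tokens_seen_before"] == TOKENS_PER_UPDATE
    # Cumulative count is step * tokens-per-update plus a small constant offset.
    offset = state["tokens_seen"] - step * TOKENS_PER_UPDATE
    print(f"step {step:>6}: tokens_seen={state['tokens_seen']:,} (offset {offset:,})")

if __name__ == "__main__":
    for step in range(1000, 15001, 1000):
        check_checkpoint(f"baseline/model_{step}/training_state.json")
```

For example, at update_step 15000 the recorded tokens_seen of 9,584,689,152 equals 15000 × 638,976 + 49,152; the same 49,152-token offset appears in every baseline checkpoint above.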