diff --git a/baseline/final_model/pytorch_model.bin b/baseline/final_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..30e9e514722f3081c6e6f733260a5897aac57f5b --- /dev/null +++ b/baseline/final_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a68abbc43af60bd4c7f2d95c72f10740acbbb95df43e3fd9ba6ae48d8c02ccc +size 2533545094 diff --git a/bigram_2_full/final_model/config.json b/bigram_2_full/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_1000/training_state.json b/bigram_2_full/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a63a76e652283c529032274eeb90b1dfc801cff0 --- /dev/null +++ b/bigram_2_full/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 3.281445264816284, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_11000/model_config.json b/bigram_2_full/model_11000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_11000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_11000/training_state.json b/bigram_2_full/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4347ecc6da3f7dcaac6a2ca91f3c46af6e0eb120 --- /dev/null +++ b/bigram_2_full/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 3.2798073291778564, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_13000/model_config.json b/bigram_2_full/model_13000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_13000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + 
"transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_13000/training_state.json b/bigram_2_full/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a2cfe92d5e43ac87d568bc1b3bb40290e3051c0e --- /dev/null +++ b/bigram_2_full/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 3.2799124717712402, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_14000/training_state.json b/bigram_2_full/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00522f22657146e53c1216d61f6b78066a289bcc --- /dev/null +++ b/bigram_2_full/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 3.2842938899993896, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_2000/model_config.json b/bigram_2_full/model_2000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_2000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_2000/training_state.json b/bigram_2_full/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26b3e58fe6b85d725cb7da315aac61fdb41b9a93 --- /dev/null +++ b/bigram_2_full/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 3.285855293273926, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_3000/model_config.json b/bigram_2_full/model_3000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_3000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_3000/training_state.json b/bigram_2_full/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..664ba5299e7aaf0d3f383d90c5ba424847d154c9 --- /dev/null +++ b/bigram_2_full/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 
3.2819085121154785, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_4000/model_config.json b/bigram_2_full/model_4000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_4000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_4000/training_state.json b/bigram_2_full/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..05c99aac2b1fce84f47177137ebe9481fc64cc1e --- /dev/null +++ b/bigram_2_full/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 3.283513307571411, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_5000/model_config.json b/bigram_2_full/model_5000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_5000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_5000/training_state.json b/bigram_2_full/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..95d62d7f901bde00e39d691261219e5d97214397 --- /dev/null +++ b/bigram_2_full/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 3.27878475189209, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/model_9000/model_config.json b/bigram_2_full/model_9000/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/bigram_2_full/model_9000/model_config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/bigram_2_full/model_9000/training_state.json b/bigram_2_full/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fafcbe9190a5ca4113929ac1b5f3deb3c2b8f878 --- /dev/null +++ 
b/bigram_2_full/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 3.2838311195373535, + "wandb_id": "7nopmkvs" +} \ No newline at end of file diff --git a/bigram_2_full/training_config.yaml b/bigram_2_full/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d562dcde7cc7b826303af3ea329d1a11ef3ad00 --- /dev/null +++ b/bigram_2_full/training_config.yaml @@ -0,0 +1,44 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: earnest-sky-22 +save_dir: checkpoints/earnest-sky-22 +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_resume_unfreeze.yaml +wandb_watch: true +warmed_up_model: /lee_embedding/checkpoints/tough-snowflake-18/final_model/ +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_attention_2_attention_unfreeze/final_model/config.json b/first_attention_2_attention_unfreeze/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_1000/config.json b/first_attention_2_attention_unfreeze/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_1000/training_state.json b/first_attention_2_attention_unfreeze/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1c5bb53f98cb1d74770f02744e434ba910f21e5 --- /dev/null +++ 
b/first_attention_2_attention_unfreeze/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 2.9259252548217773, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_10000/config.json b/first_attention_2_attention_unfreeze/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_10000/training_state.json b/first_attention_2_attention_unfreeze/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..379803eb2ecf0ffa73eb6974b937570ac52e4d4d --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 2.925158977508545, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_11000/config.json b/first_attention_2_attention_unfreeze/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_11000/training_state.json b/first_attention_2_attention_unfreeze/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..924832b1f4f4ac690db6c14b405b8bcc1ec0bab2 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 2.925471067428589, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_12000/config.json b/first_attention_2_attention_unfreeze/model_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_12000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 
5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_12000/training_state.json b/first_attention_2_attention_unfreeze/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8b6beb40f4a8f4f504a7f2a0d09146ca22379fab --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 2.9245717525482178, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_13000/training_state.json b/first_attention_2_attention_unfreeze/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d542ed3b3e4256440b55ec1dc3cc34f80110d867 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.9278290271759033, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_14000/config.json b/first_attention_2_attention_unfreeze/model_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_14000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_14000/training_state.json b/first_attention_2_attention_unfreeze/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aecf7ece2424555a569f8f780bef5f66c0dc9077 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 2.924705743789673, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_15000/config.json b/first_attention_2_attention_unfreeze/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + 
"vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_15000/training_state.json b/first_attention_2_attention_unfreeze/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5ba896c58a04b68dd7cc30a7dab6c256e56ca855 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 2.924987554550171, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_2000/config.json b/first_attention_2_attention_unfreeze/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_2000/training_state.json b/first_attention_2_attention_unfreeze/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7bba5c2b38176089c28f85ae5813217e8b3644e2 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.923201322555542, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_3000/config.json b/first_attention_2_attention_unfreeze/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_3000/training_state.json b/first_attention_2_attention_unfreeze/model_3000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b3d36da0db922e09678af5b196449415ed85d45f --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.9246528148651123, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_4000/config.json b/first_attention_2_attention_unfreeze/model_4000/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_4000/training_state.json b/first_attention_2_attention_unfreeze/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d779f12d5024f1e305141bd4177e664206d18e0 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.9246957302093506, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_5000/config.json b/first_attention_2_attention_unfreeze/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_5000/training_state.json b/first_attention_2_attention_unfreeze/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c84388bbf82f6e9b5e1e835284eb859a0bbc32d --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 2.9258697032928467, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_6000/config.json b/first_attention_2_attention_unfreeze/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_6000/training_state.json b/first_attention_2_attention_unfreeze/model_6000/training_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..13f355fd82c4ec683ae12960de7eda1d608de39a --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 2.92516827583313, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_7000/config.json b/first_attention_2_attention_unfreeze/model_7000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_7000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_7000/training_state.json b/first_attention_2_attention_unfreeze/model_7000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..216bbf81ccfeed98d09061f2891300d21a5e5a1e --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_7000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 7000, + "update_step": 7000, + "tokens_seen": 4472881152, + "tokens_seen_before": 4472242176, + "update_time": 2.925605058670044, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_8000/config.json b/first_attention_2_attention_unfreeze/model_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_8000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_8000/training_state.json b/first_attention_2_attention_unfreeze/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd614cfe480ae2f3dfce4f5a7a789903266d1bbb --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 2.924616575241089, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_9000/config.json b/first_attention_2_attention_unfreeze/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + 
"eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/model_9000/training_state.json b/first_attention_2_attention_unfreeze/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..96b217c52574fee5f318afec9348c20eea3522b1 --- /dev/null +++ b/first_attention_2_attention_unfreeze/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 2.9271013736724854, + "wandb_id": "0ab5p6ah" +} \ No newline at end of file diff --git a/first_attention_2_attention_unfreeze/training_config.yaml b/first_attention_2_attention_unfreeze/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe098423ebab4b90b7b8e23fb0da48c365c8378b --- /dev/null +++ b/first_attention_2_attention_unfreeze/training_config.yaml @@ -0,0 +1,47 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: true +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: first_attention_resume_unfreeze +save_dir: checkpoints/first_attention_resume_unfreeze +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_attention_resume_unfreeze.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_layer_2/model_1000/training_state.json b/first_layer_2/model_1000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1f5d2b3a7ef6d7cab491ae015aecd3fb595bf4 --- /dev/null +++ b/first_layer_2/model_1000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 1000, + "update_step": 1000, + "tokens_seen": 639025152, + "tokens_seen_before": 638386176, + "update_time": 2.883007764816284, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_10000/config.json b/first_layer_2/model_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_10000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + 
"rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_10000/training_state.json b/first_layer_2/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..613fce36c84b10e6c807cd3bb1b195e799e3ccff --- /dev/null +++ b/first_layer_2/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 2.878925085067749, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_12000/training_state.json b/first_layer_2/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4e6e0777e6ad2c7f208096b26aa6ecd7e2b9a584 --- /dev/null +++ b/first_layer_2/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 2.8791251182556152, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_15000/config.json b/first_layer_2/model_15000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_15000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_6000/config.json b/first_layer_2/model_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_6000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_layer_2/model_6000/training_state.json b/first_layer_2/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..29f61e7467334859b4ace11d5ea1ebb0cdf32505 --- /dev/null +++ b/first_layer_2/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 2.875896453857422, + "wandb_id": "f3ljzhyw" +} \ No newline at end of file diff --git a/first_layer_2/model_9000/config.json b/first_layer_2/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_layer_2/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + 
"initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/final_model/config.json b/silver-butterfly-62/final_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/final_model/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_1000/config.json b/silver-butterfly-62/model_1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_1000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_10000/training_state.json b/silver-butterfly-62/model_10000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..62868a01bb9226acf12d87b5e3312bf7ae45240f --- /dev/null +++ b/silver-butterfly-62/model_10000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 10000, + "update_step": 10000, + "tokens_seen": 6389809152, + "tokens_seen_before": 6389170176, + "update_time": 0.7699310779571533, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_11000/config.json b/silver-butterfly-62/model_11000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_11000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_11000/training_state.json b/silver-butterfly-62/model_11000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d91623b956831438f3a0e7ed540b44a979984c7c --- /dev/null +++ b/silver-butterfly-62/model_11000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 11000, + "update_step": 11000, + "tokens_seen": 7028785152, + "tokens_seen_before": 7028146176, + "update_time": 0.7700908184051514, + "wandb_id": "e0h7cx93" +} \ No newline at end 
of file diff --git a/silver-butterfly-62/model_12000/training_state.json b/silver-butterfly-62/model_12000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f57cb6217e3637e0a0046d5c2a7bd942c4bc64a --- /dev/null +++ b/silver-butterfly-62/model_12000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 12000, + "update_step": 12000, + "tokens_seen": 7667761152, + "tokens_seen_before": 7667122176, + "update_time": 0.7706863880157471, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_13000/training_state.json b/silver-butterfly-62/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bad4e57cf91a65696ce74307be5d27a6d76d3317 --- /dev/null +++ b/silver-butterfly-62/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 0.7703430652618408, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_14000/training_state.json b/silver-butterfly-62/model_14000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97dabe821f7760ae6ff29486b0c69d4f4733ddcd --- /dev/null +++ b/silver-butterfly-62/model_14000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 14000, + "update_step": 14000, + "tokens_seen": 8945713152, + "tokens_seen_before": 8945074176, + "update_time": 0.7704954147338867, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_15000/training_state.json b/silver-butterfly-62/model_15000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aea0ba869bc468bdeef1ab99ca65f853990f20e1 --- /dev/null +++ b/silver-butterfly-62/model_15000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 15000, + "update_step": 15000, + "tokens_seen": 9584689152, + "tokens_seen_before": 9584050176, + "update_time": 0.7696325778961182, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_2000/config.json b/silver-butterfly-62/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_3000/config.json b/silver-butterfly-62/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + 
"vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_4000/training_state.json b/silver-butterfly-62/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9d1063c2b8a21f2b7cf07493b2fc8cb16ab7b466 --- /dev/null +++ b/silver-butterfly-62/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 0.7703680992126465, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_5000/config.json b/silver-butterfly-62/model_5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_5000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_5000/training_state.json b/silver-butterfly-62/model_5000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..80d28e6c221ee63435a0aa1fb62056003895a3c2 --- /dev/null +++ b/silver-butterfly-62/model_5000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 5000, + "update_step": 5000, + "tokens_seen": 3194929152, + "tokens_seen_before": 3194290176, + "update_time": 0.7708892822265625, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_6000/training_state.json b/silver-butterfly-62/model_6000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7a1d29e8228981ff16fe287fa04d69b4cb87c550 --- /dev/null +++ b/silver-butterfly-62/model_6000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 6000, + "update_step": 6000, + "tokens_seen": 3833905152, + "tokens_seen_before": 3833266176, + "update_time": 0.7694945335388184, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_8000/training_state.json b/silver-butterfly-62/model_8000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd0eadd4fb1c52dfc638aced882938f466ab6f15 --- /dev/null +++ b/silver-butterfly-62/model_8000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 8000, + "update_step": 8000, + "tokens_seen": 5111857152, + "tokens_seen_before": 5111218176, + "update_time": 0.7700819969177246, + "wandb_id": "e0h7cx93" +} \ No newline at end of file diff --git a/silver-butterfly-62/model_9000/config.json b/silver-butterfly-62/model_9000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/silver-butterfly-62/model_9000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + 
"use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/silver-butterfly-62/model_9000/training_state.json b/silver-butterfly-62/model_9000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2b3ec6b90f1513eab68064cbe19d2bc8dca081 --- /dev/null +++ b/silver-butterfly-62/model_9000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 9000, + "update_step": 9000, + "tokens_seen": 5750833152, + "tokens_seen_before": 5750194176, + "update_time": 0.7697172164916992, + "wandb_id": "e0h7cx93" +} \ No newline at end of file