diff --git a/attention_2_only_emb/final_model/pytorch_model.bin b/attention_2_only_emb/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..95671e5b221e307dfd8b43fd774ce1a8e341cf9b
--- /dev/null
+++ b/attention_2_only_emb/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b
+size 2533545094
diff --git a/attention_2_only_emb/model_1000/pytorch_model.bin b/attention_2_only_emb/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..378e8f4dfc77739b9b7a3041fada61ea4a3811f2
--- /dev/null
+++ b/attention_2_only_emb/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d48e11d99e74e3ac615d5f6bd78b0f524e0ad1951fca5b223c9aec52ebb23596
+size 2533545094
diff --git a/attention_2_only_emb/model_10000/pytorch_model.bin b/attention_2_only_emb/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03697e58d7b3d79c041b33f8cf1a36cc2bdcc3a1
--- /dev/null
+++ b/attention_2_only_emb/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e190ea29dd8119871d88bc6c120b49a4c3b6e9e979286dac7b464f9c42fb5454
+size 2533545094
diff --git a/attention_2_only_emb/model_11000/pytorch_model.bin b/attention_2_only_emb/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f6a3d29ecb67e69c8b18ad5eba25236b3eb48800
--- /dev/null
+++ b/attention_2_only_emb/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e8a06de57e06be1e59112b19c52985b6e1e1905645464df7d675c61f8a62a67
+size 2533545094
diff --git a/attention_2_only_emb/model_13000/pytorch_model.bin b/attention_2_only_emb/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..13ad192ebe0e8705f80b5cc28f1daf5d0c51ade5
--- /dev/null
+++ b/attention_2_only_emb/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33dfd88af702c854bab079669a8dde4157c14ab4dbbea412e8382b1157fed607
+size 2533545094
diff --git a/attention_2_only_emb/model_14000/pytorch_model.bin b/attention_2_only_emb/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..51e3b9125da682e52a3dabf3e86f4d910e2d8b57
--- /dev/null
+++ b/attention_2_only_emb/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:288665e9230d153d7a863e7a8e66cc3c48d5133ba629a788f8c8302f62bcf508
+size 2533545094
diff --git a/attention_2_only_emb/model_15000/pytorch_model.bin b/attention_2_only_emb/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..95671e5b221e307dfd8b43fd774ce1a8e341cf9b
--- /dev/null
+++ b/attention_2_only_emb/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b
+size 2533545094
diff --git a/attention_2_only_emb/model_4000/pytorch_model.bin b/attention_2_only_emb/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b1416b0bab8868541ea4571a456a5e63423a241
--- /dev/null
+++ b/attention_2_only_emb/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f831b13a1e989d024050ac81156cca927d96d40afd492847292ea167b8f97d8e
+size 2533545094
diff --git a/attention_2_only_emb/model_5000/pytorch_model.bin b/attention_2_only_emb/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17d01f228b890cc161410230468fad2793b9b850
--- /dev/null
+++ b/attention_2_only_emb/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a72b2ed53bbf3dbc3fc4711abcd9d661d0064760380d4c3f2ce7c6b59db76dbf
+size 2533545094
diff --git a/attention_2_only_emb/model_9000/pytorch_model.bin b/attention_2_only_emb/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..854b13c5fe697f8de1eba0c8474e6b371e273a18
--- /dev/null
+++ b/attention_2_only_emb/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4851e88488581a963da7b13d25b32cf82e0512d7a393bfbf957b3671ff379c7a
+size 2533545094
diff --git a/baseline/model_1000/pytorch_model.bin b/baseline/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cd0cac02a4f88d9f645ff007ef5f596a434d5edc
--- /dev/null
+++ b/baseline/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00edd9bd916b1d6059044c1e496b6c21775ba6e8cfc08fb54ea38a112cefeb4e
+size 2533545094
diff --git a/baseline/model_11000/pytorch_model.bin b/baseline/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03dfcbce61dd13f29230fde15f02811e200e0dba
--- /dev/null
+++ b/baseline/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef2caffc4e634a9b5cc17988d11913c72e3a6eeede9a45f38257db62bff5f661
+size 2533545094
diff --git a/baseline/model_12000/pytorch_model.bin b/baseline/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f7c0e4d4150892b1a3ea7a22dd5d0c92557ca872
--- /dev/null
+++ b/baseline/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e719eae49624f71ba9389bd9764c1cb5d07c0acf33a1220eb4f8dda73240b9b
+size 2533545094
diff --git a/baseline/model_13000/pytorch_model.bin b/baseline/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e828cb6f6046020e30081a5b903a54f1e087f3df
--- /dev/null
+++ b/baseline/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:672bb2ac05b1618a7d748e23fa392f5197065cef854f22b6dce109b472470517
+size 2533545094
diff --git a/baseline/model_14000/pytorch_model.bin b/baseline/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..57ed073b2f9d4c0a39a4b0ce8f91c5016e4bb101
--- /dev/null
+++ b/baseline/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4739b4a78103197ee542a1010ba48e38824f60f5ec00ce70a9ae33ca5f2e4343
+size 2533545094
diff --git a/baseline/model_2000/pytorch_model.bin b/baseline/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c33f206a54f0dac0d693f3a0ba40c054203c8bab
--- /dev/null
+++ b/baseline/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc43549f54dd1e2edc06ec8ea811fc48c5b51a83830ec22027266878f3657b91
+size 2533545094
diff --git a/baseline/model_3000/pytorch_model.bin b/baseline/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..41c403d50726d881633aab9dd1ff78e7459096ad
--- /dev/null
+++ b/baseline/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83671facca5fb99530b5c52bbd111dda3383639f4ef50f90bd7379aa1b37fae2
+size 2533545094
diff --git a/baseline/model_4000/pytorch_model.bin b/baseline/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..013f0b0f3896454305be7ecd36202cf677723804
--- /dev/null
+++ b/baseline/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebb2010e8f368ae6fe61f89a9d71b8ee6a821a59727301688097911f2337d544
+size 2533545094
diff --git a/baseline/model_5000/pytorch_model.bin b/baseline/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e0e57f5d7b997b27d32aded45ef22a03993eddce
--- /dev/null
+++ b/baseline/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d88bc7652172e2291ad1e16d2326344a1f9651dec698d2f14ff2b8a909f82dc
+size 2533545094
diff --git a/baseline/model_6000/pytorch_model.bin b/baseline/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1e6820276ba5468f0adab2d82c0fa2faf7e6b5b9
--- /dev/null
+++ b/baseline/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e8417cded07b68977443fb14688e0a5d1fdfe3727a5a58da05dd5df10423795
+size 2533545094
diff --git a/baseline/model_7000/pytorch_model.bin b/baseline/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ffd06ffe06db37eaf8209c0afc616d1a91ab2c5
--- /dev/null
+++ b/baseline/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bedb26968104d1fe2d880f5b4e0ebb006fbea3dde5ca3603f7606c07e0a4bed8
+size 2533545094
diff --git a/baseline/model_8000/pytorch_model.bin b/baseline/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4e065d99cda8d1e005df97f758c6cfe717c15156
--- /dev/null
+++ b/baseline/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7d8381c12bd8d5c961d08860dfeac2a8cfbea2af8400b6d62bd2623c90258f5
+size 2533545094
diff --git a/baseline/model_9000/pytorch_model.bin b/baseline/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b71c5d2f55e8e54d275d20ac8d787344f940aab
--- /dev/null
+++ b/baseline/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d62dca90201b94b66ead46c71e3b2036bbe678cd38ab831997b6ce49b858773f
+size 2533545094
diff --git a/bigram_2/final_model/config.json b/bigram_2/final_model/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/final_model/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
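Note: every pytorch_model.bin above is committed as a three-line Git LFS pointer stub, not the ~2.5 GB weight file itself. A minimal sketch of reading such a stub (the helper name read_lfs_pointer is illustrative, not part of this repo):

    def read_lfs_pointer(path):
        # Each pointer stub holds three "key value" lines:
        # version, oid (sha256 of the real blob), and size in bytes.
        fields = {}
        with open(path) as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields

    # read_lfs_pointer("baseline/model_1000/pytorch_model.bin")
    # -> {"version": "https://git-lfs.github.com/spec/v1",
    #     "oid": "sha256:00edd9bd...", "size": "2533545094"}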
diff --git a/bigram_2/model_1000/model_config.json b/bigram_2/model_1000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_1000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_1000/training_state.json b/bigram_2/model_1000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..81cd0d80404a5e37372412f6a02ed258d564ca89
--- /dev/null
+++ b/bigram_2/model_1000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 2.892822504043579,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_10000/model_config.json b/bigram_2/model_10000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_10000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_10000/training_state.json b/bigram_2/model_10000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fcc39d0302aca08bc8dd8de528073aefb462059
--- /dev/null
+++ b/bigram_2/model_10000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 2.8910531997680664,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_11000/model_config.json b/bigram_2/model_11000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_11000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_11000/training_state.json b/bigram_2/model_11000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3424a3084540e7fcc2928dc45d77e68af62afa0e
--- /dev/null
+++ b/bigram_2/model_11000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 2.88981556892395,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_12000/model_config.json b/bigram_2/model_12000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_12000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_12000/training_state.json b/bigram_2/model_12000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b9a5e45ee6f4d159ad10d6465b8d5d3d7f04282
--- /dev/null
+++ b/bigram_2/model_12000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 2.8912298679351807,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_14000/model_config.json b/bigram_2/model_14000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_14000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_14000/training_state.json b/bigram_2/model_14000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..022a95a57b3f716cdedd9cf9d11fe05e285e47a7
--- /dev/null
+++ b/bigram_2/model_14000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 2.8920021057128906,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_15000/model_config.json b/bigram_2/model_15000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_15000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_15000/training_state.json b/bigram_2/model_15000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c898806ef419c4f1ededd28aaf71a03240351f02
--- /dev/null
+++ b/bigram_2/model_15000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 2.89182710647583,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_4000/model_config.json b/bigram_2/model_4000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_4000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_5000/model_config.json b/bigram_2/model_5000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_5000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_5000/training_state.json b/bigram_2/model_5000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bdc22f88f77c3f3f5e88f9efee1591baa9acb72
--- /dev/null
+++ b/bigram_2/model_5000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 2.8917791843414307,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_6000/model_config.json b/bigram_2/model_6000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_6000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_6000/training_state.json b/bigram_2/model_6000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..388967265b0e371f66582c6662e629e7cd031941
--- /dev/null
+++ b/bigram_2/model_6000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 2.892122983932495,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_7000/model_config.json b/bigram_2/model_7000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_7000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_7000/training_state.json b/bigram_2/model_7000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7f71315234af27e3e7ac92e9dc7a68e1e61972ca
--- /dev/null
+++ b/bigram_2/model_7000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 2.8900256156921387,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_8000/model_config.json b/bigram_2/model_8000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_8000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_8000/training_state.json b/bigram_2/model_8000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b142ba0aeeea8a30e710428701b65c91cc00b48
--- /dev/null
+++ b/bigram_2/model_8000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 2.8911120891571045,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
diff --git a/bigram_2/model_9000/model_config.json b/bigram_2/model_9000/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/bigram_2/model_9000/model_config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/bigram_2/model_9000/training_state.json b/bigram_2/model_9000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5f5b9a718929823c4233ee0407ecd8eff0d632a
--- /dev/null
+++ b/bigram_2/model_9000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 2.890213966369629,
+  "wandb_id": "walilhuz"
+}
\ No newline at end of file
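Note: in every training_state.json above, tokens_seen - tokens_seen_before = 638976, which is exactly total_batch_size (624) x max_length (1024) from the training configs later in this diff. A minimal sketch of that check (the helper name tokens_per_update is illustrative):

    import json

    def tokens_per_update(path):
        # Tokens consumed by the optimizer update that produced this checkpoint.
        with open(path) as f:
            state = json.load(f)
        return state["tokens_seen"] - state["tokens_seen_before"]

    # e.g. tokens_per_update("bigram_2/model_1000/training_state.json")
    # -> 639025152 - 638386176 = 638976 == 624 * 1024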
diff --git a/bigram_2_full/final_model/pytorch_model.bin b/bigram_2_full/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d2c7868988799fa9135d1ee3c14ea7f1357b0d5
--- /dev/null
+++ b/bigram_2_full/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:477b0b02d1fa370dd03a88359bf7784e17681740c07a6952b75b5bab9f5e333f
+size 2533545094
diff --git a/bigram_2_full/model_1000/pytorch_model.bin b/bigram_2_full/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d9946f12cb9210a4dc458bfc123703dba656497
--- /dev/null
+++ b/bigram_2_full/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f696208f3842be6f4c26371224bc32cb8cb8f5a268c879be0dd18afef088b4f
+size 2533545094
diff --git a/bigram_2_full/model_10000/pytorch_model.bin b/bigram_2_full/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..941e96234d5b7e40485a2e8337148fc4cd6c7832
--- /dev/null
+++ b/bigram_2_full/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:808a7200d616e0ad374917803a0a3e5e53c4751e33bdf1761e5d421e603581cd
+size 2533545094
diff --git a/bigram_2_full/model_11000/pytorch_model.bin b/bigram_2_full/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..02d773600791e65e1ee32d30261135522400c82c
--- /dev/null
+++ b/bigram_2_full/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b2c8b1b152a92fcb299306899342ad9e07e2fc3b3c270141d5153b2da982088
+size 2533545094
diff --git a/bigram_2_full/model_12000/pytorch_model.bin b/bigram_2_full/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6ebae8cc7dec6c1ae08cc9dfc0414830136a3f8b
--- /dev/null
+++ b/bigram_2_full/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:062de95bd0fbe5d415395f0ef568fc9c5a047f5ece6ae39233004ac42ad5b2ec
+size 2533545094
diff --git a/bigram_2_full/model_13000/pytorch_model.bin b/bigram_2_full/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..31734c6a114e1f290b1e0c97d6bd037811bd0bff
--- /dev/null
+++ b/bigram_2_full/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:322b22a2bda58c621d02d89a01ec0f2532dc3ce28ad99ba73ff5b899173e8a38
+size 2533545094
diff --git a/bigram_2_full/model_14000/pytorch_model.bin b/bigram_2_full/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..755e1fe55c4f7996208cce7010cf511d50d3e146
--- /dev/null
+++ b/bigram_2_full/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1451295b36110a92129f88c001b929004b0ca76833c9c254578777aff9c0d7ba
+size 2533545094
diff --git a/bigram_2_full/model_15000/pytorch_model.bin b/bigram_2_full/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d2c7868988799fa9135d1ee3c14ea7f1357b0d5
--- /dev/null
+++ b/bigram_2_full/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:477b0b02d1fa370dd03a88359bf7784e17681740c07a6952b75b5bab9f5e333f
+size 2533545094
diff --git a/bigram_2_full/model_2000/pytorch_model.bin b/bigram_2_full/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c84717271e76a8b4a35c394502eefdbf3e1c9a78
--- /dev/null
+++ b/bigram_2_full/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fbef22f7891155e70e7c265b62b090caa50f0364de5a11b4adb0f6d4fbc6417
+size 2533545094
diff --git a/bigram_2_full/model_3000/pytorch_model.bin b/bigram_2_full/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..293e4118218f0dc98de29eee6820d0fb061ecf8a
--- /dev/null
+++ b/bigram_2_full/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed6b522ff8826748d3c9a799bd85be1d1825c64b80c388c15b00bf3838b24efc
+size 2533545094
diff --git a/bigram_2_full/model_4000/pytorch_model.bin b/bigram_2_full/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f00cdf477a754683d78c9bddbda101fa06097843
--- /dev/null
+++ b/bigram_2_full/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbb49985fad62c8da6673d6768bb3843c2678436167c6e503a90be5f5ad0e7cc
+size 2533545094
diff --git a/bigram_2_full/model_5000/pytorch_model.bin b/bigram_2_full/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fcc2841a14b685e0f195cc972ce208468c276a4a
--- /dev/null
+++ b/bigram_2_full/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e4d8c60a18da227bcf549ccb742861073d736f37887e7a60fd344342c17af79
+size 2533545094
diff --git a/bigram_2_full/model_6000/pytorch_model.bin b/bigram_2_full/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..37d8d1ec61a3c0504537927c5d1f0709fad2f52e
--- /dev/null
+++ b/bigram_2_full/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c65e3e9109eedadfcd0e30f45c3ce0f1b1674999b4bf92b1c3a5182ed7cbf27c
+size 2533545094
diff --git a/bigram_2_full/model_7000/pytorch_model.bin b/bigram_2_full/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..595a6f8e75df1442fe3bdd39ccb9d2082af58e7d
--- /dev/null
+++ b/bigram_2_full/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8031e8f5551f47b130bfd7cc58b8927d62516ecc78ac3b09927b025c72935c5d
+size 2533545094
diff --git a/bigram_2_full/model_8000/pytorch_model.bin b/bigram_2_full/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..132a06eb4c3c2a1d8f07f3d2d47b2a9b7e378c18
--- /dev/null
+++ b/bigram_2_full/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9edbd2567cbe48ae392443870d7b60fd3ac6970556059d45142a05161b713bc0
+size 2533545094
diff --git a/bigram_2_full/model_9000/pytorch_model.bin b/bigram_2_full/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c4ee7b61e9b7704fc7094e6d3c99508bf12d40e
--- /dev/null
+++ b/bigram_2_full/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:812b7b7e03b649ec9c67c3694b220076bbcf95816c4681fca07cde6fecc7c402
+size 2533545094
diff --git a/first_attention_1/final_model/config.json b/first_attention_1/final_model/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/final_model/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_1000/config.json b/first_attention_1/model_1000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_1000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_1000/training_state.json b/first_attention_1/model_1000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7b25800c7ee1a6849329d986937842af1564ffbc
--- /dev/null
+++ b/first_attention_1/model_1000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 0.842038631439209,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_10000/config.json b/first_attention_1/model_10000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_10000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_10000/training_state.json b/first_attention_1/model_10000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..750dddae0916ac9388afdb5ed60f4c449b676932
--- /dev/null
+++ b/first_attention_1/model_10000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 0.8426082134246826,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_11000/config.json b/first_attention_1/model_11000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_11000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_11000/training_state.json b/first_attention_1/model_11000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d1181ecb49ed7a3067a6b2dc83f5ead852f5ed33
--- /dev/null
+++ b/first_attention_1/model_11000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 0.8413209915161133,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_12000/config.json b/first_attention_1/model_12000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_12000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_12000/training_state.json b/first_attention_1/model_12000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf5ada7befaa42a325142d48bb6f07cf982ccca5
--- /dev/null
+++ b/first_attention_1/model_12000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 0.8421485424041748,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_13000/config.json b/first_attention_1/model_13000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_13000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_13000/training_state.json b/first_attention_1/model_13000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0dcc3de2b7c8e2309734711edd9610d4e88d5a7
--- /dev/null
+++ b/first_attention_1/model_13000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 13000,
+  "update_step": 13000,
+  "tokens_seen": 8306737152,
+  "tokens_seen_before": 8306098176,
+  "update_time": 0.8420231342315674,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_14000/config.json b/first_attention_1/model_14000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_14000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_14000/training_state.json b/first_attention_1/model_14000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ece6a192ef3ed2d193d138d89a84be049c747b0c
--- /dev/null
+++ b/first_attention_1/model_14000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 0.842217206954956,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_15000/config.json b/first_attention_1/model_15000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_15000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_15000/training_state.json b/first_attention_1/model_15000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c659ce53030acf8f35ee7a02753a3e887dde2a8
--- /dev/null
+++ b/first_attention_1/model_15000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 0.8423118591308594,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_2000/config.json b/first_attention_1/model_2000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_2000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_2000/training_state.json b/first_attention_1/model_2000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e949d3c14077a2c7d8808443bc624ed92c532cb9
--- /dev/null
+++ b/first_attention_1/model_2000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 2000,
+  "update_step": 2000,
+  "tokens_seen": 1278001152,
+  "tokens_seen_before": 1277362176,
+  "update_time": 0.8418259620666504,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_3000/config.json b/first_attention_1/model_3000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_3000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_3000/training_state.json b/first_attention_1/model_3000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..22d9fd0f8d06ae5ea907b471390d579f74b31f2f
--- /dev/null
+++ b/first_attention_1/model_3000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 3000,
+  "update_step": 3000,
+  "tokens_seen": 1916977152,
+  "tokens_seen_before": 1916338176,
+  "update_time": 0.8426218032836914,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_4000/config.json b/first_attention_1/model_4000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_4000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_4000/training_state.json b/first_attention_1/model_4000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f7531629fd837bcadf944e3c4152cc592e87fb0
--- /dev/null
+++ b/first_attention_1/model_4000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 4000,
+  "update_step": 4000,
+  "tokens_seen": 2555953152,
+  "tokens_seen_before": 2555314176,
+  "update_time": 0.8424825668334961,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_5000/config.json b/first_attention_1/model_5000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_5000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_5000/training_state.json b/first_attention_1/model_5000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8b9691ae3278020f385c2379d2b39dee72e7a2e
--- /dev/null
+++ b/first_attention_1/model_5000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 0.8429272174835205,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_6000/config.json b/first_attention_1/model_6000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_6000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_6000/training_state.json b/first_attention_1/model_6000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..482bcc8208621ccc95e946706660a81a7e02dd73
--- /dev/null
+++ b/first_attention_1/model_6000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 0.8419299125671387,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_7000/config.json b/first_attention_1/model_7000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_7000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_7000/training_state.json b/first_attention_1/model_7000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8035a097abbef7f8267a0e0e9a578d5c6d30df44
--- /dev/null
+++ b/first_attention_1/model_7000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 0.8422131538391113,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_8000/config.json b/first_attention_1/model_8000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_8000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_8000/training_state.json b/first_attention_1/model_8000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6802144700637f9a9cddf43cece43551b75e0366
--- /dev/null
+++ b/first_attention_1/model_8000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 0.8415825366973877,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/model_9000/config.json b/first_attention_1/model_9000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7
--- /dev/null
+++ b/first_attention_1/model_9000/config.json
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/first_attention_1/model_9000/training_state.json b/first_attention_1/model_9000/training_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9728504f89a6ea4d09a789781431c5f63f9b75c6
--- /dev/null
+++ b/first_attention_1/model_9000/training_state.json
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 0.8423662185668945,
+  "wandb_id": "0e3seng4"
+}
\ No newline at end of file
diff --git a/first_attention_1/training_config.yaml b/first_attention_1/training_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c5a4ed41ddddeeb4b05a67405f0f69663b6feeaa
--- /dev/null
+++ b/first_attention_1/training_config.yaml
@@ -0,0 +1,45 @@
+adam_beta1: 0.9
+adam_beta2: 0.95
+adjust_step: 0
+autoresume: false
+batch_size: 6
+clip_grad_norm: 1.0
+comment: null
+cycle_length: null
+dtype: bfloat16
+emb_freeze: null
+eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+eval_every: 1000
+first_attention: true
+gradient_accumulation: 13
+keep_checkpoints: null
+layer_freeze: null
+load_optimizer_state_on_resume: true
+lr: 0.0004
+max_length: 1024
+max_train_tokens: null
+min_lr_ratio: 0.1
+model_config: model_config/478m.json
+model_name_or_path: null
+model_revision: null
+num_training_steps: 15000
+optimizer: Adam
+restart_warmup_steps: null
+resume_from: null
+run_name: robust-frost-63
+save_dir: checkpoints/robust-frost-63
+save_every: 1000
+scheduler: cosine
+seed: 0
+shuffle: true
+skip_batches: !!set {}
+tags:
+- 396m-for-680m
+total_batch_size: 624
+train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+training_config: training_config/two_stage/478m_first_attention.yaml
+wandb_watch: true
+warmed_up_model: null
+warmup_steps: 1500
+weight_decay: 0.0
+workers: 8
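Note: the training_config.yaml above specifies scheduler: cosine with lr 0.0004, warmup_steps 1500, num_training_steps 15000, and min_lr_ratio 0.1. A minimal sketch of the schedule those settings describe, assuming the conventional linear-warmup-then-cosine-decay form (the repo's own scheduler code may differ in detail):

    import math

    def lr_at(step, lr=4e-4, warmup_steps=1500, total_steps=15000, min_lr_ratio=0.1):
        # Linear warmup to the peak rate, then cosine decay to min_lr_ratio * lr.
        if step < warmup_steps:
            return lr * step / warmup_steps
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        min_lr = lr * min_lr_ratio
        return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

    # lr_at(1500) == 4e-4 (peak); lr_at(15000) == 4e-5 (floor).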
"num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_13000/config.json b/first_attention_2/model_13000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_13000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_13000/training_state.json b/first_attention_2/model_13000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba0ec05bea3d14cc42fe004080600a228a706756 --- /dev/null +++ b/first_attention_2/model_13000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 13000, + "update_step": 13000, + "tokens_seen": 8306737152, + "tokens_seen_before": 8306098176, + "update_time": 2.8824567794799805, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/model_2000/config.json b/first_attention_2/model_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_2000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_2000/training_state.json b/first_attention_2/model_2000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0439d03c83dc85f425962da1b0e6d790b524c57 --- /dev/null +++ b/first_attention_2/model_2000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 2000, + "update_step": 2000, + "tokens_seen": 1278001152, + "tokens_seen_before": 1277362176, + "update_time": 2.882199764251709, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/model_3000/config.json b/first_attention_2/model_3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_3000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_3000/training_state.json b/first_attention_2/model_3000/training_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..cc38513332779e8f8b98cb5319689caaf80caed6 --- /dev/null +++ b/first_attention_2/model_3000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 3000, + "update_step": 3000, + "tokens_seen": 1916977152, + "tokens_seen_before": 1916338176, + "update_time": 2.884432792663574, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/model_4000/config.json b/first_attention_2/model_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..035e41b6a20fdaefb7a2b7ca435dba6fff270ef7 --- /dev/null +++ b/first_attention_2/model_4000/config.json @@ -0,0 +1,19 @@ +{ + "architectures": [ + "LLaMAForCausalLM" + ], + "bos_token_id": 50256, + "eos_token_id": 50256, + "hidden_act": "silu", + "hidden_size": 1536, + "intermediate_size": 5376, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 14, + "rms_norm_eps": 1e-05, + "transformers_version": "4.28.1", + "use_cache": true, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/first_attention_2/model_4000/training_state.json b/first_attention_2/model_4000/training_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d88c1cb27931678dffba8cfffae8d879dd94f406 --- /dev/null +++ b/first_attention_2/model_4000/training_state.json @@ -0,0 +1,8 @@ +{ + "global_step": 4000, + "update_step": 4000, + "tokens_seen": 2555953152, + "tokens_seen_before": 2555314176, + "update_time": 2.8821418285369873, + "wandb_id": "hkuum9kt" +} \ No newline at end of file diff --git a/first_attention_2/training_config.yaml b/first_attention_2/training_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c3aea981fe2aaff4b61175191bd57ddeadfb22f --- /dev/null +++ b/first_attention_2/training_config.yaml @@ -0,0 +1,47 @@ +adam_beta1: 0.9 +adam_beta2: 0.95 +adjust_step: 0 +autoresume: false +batch_size: 6 +clip_grad_norm: 1.0 +comment: null +cycle_length: null +dtype: bfloat16 +emb_freeze: null +eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/ +eval_every: 1000 +first_attention: false +first_attention_resume: true +gradient_accumulation: 13 +keep_checkpoints: null +layer_freeze: null +layer_freeze_2: false +load_optimizer_state_on_resume: true +lr: 0.0004 +max_length: 1024 +max_train_tokens: null +min_lr_ratio: 0.1 +model_config: model_config/478m.json +model_name_or_path: null +model_revision: null +num_training_steps: 15000 +optimizer: Adam +restart_warmup_steps: null +resume_from: null +run_name: first_attention_resume +save_dir: checkpoints/first_attention_resume +save_every: 1000 +scheduler: cosine +seed: 0 +shuffle: true +skip_batches: !!set {} +tags: +- 396m-for-680m +total_batch_size: 624 +train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/ +training_config: training_config/two_stage/478m_first_attention_resume.yaml +wandb_watch: true +warmed_up_model: null +warmup_steps: 1500 +weight_decay: 0.0 +workers: 8 diff --git a/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..59d910b52deabbf6ed45d083a90fbcdec09b5df7 --- /dev/null +++ b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb83e315f2a4475df30c3eebbca80bf0db8e6a03378d80cb7173a584b2a573b6 +size 2533545094 diff 
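Note: the two training_config.yaml files in this diff are identical except for a few keys (first_attention flips to false, first_attention_resume and layer_freeze_2 appear, and run_name/save_dir/training_config change). A minimal sketch of comparing them with PyYAML (the helper name config_delta is illustrative):

    import yaml

    def config_delta(path_a, path_b):
        # Report keys whose values differ between two training_config.yaml files.
        with open(path_a) as f:
            a = yaml.safe_load(f)
        with open(path_b) as f:
            b = yaml.safe_load(f)
        keys = sorted(set(a) | set(b))
        return {k: (a.get(k), b.get(k)) for k in keys if a.get(k) != b.get(k)}

    # config_delta("first_attention_1/training_config.yaml",
    #              "first_attention_2/training_config.yaml")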
diff --git a/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59d910b52deabbf6ed45d083a90fbcdec09b5df7
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb83e315f2a4475df30c3eebbca80bf0db8e6a03378d80cb7173a584b2a573b6
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_1000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..88809f9e85a5fc1baf67f2491e144ed44bf69efe
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c45a8f3c4f4d021f27a4c0ff889f20655be5677f83819547be076ecd0e0230b
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_10000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5de6260137427fca68bf64751dfffc00be842477
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40dfcb9849e0275eeb11bad6fc60b4e556f528000f2e1540145df68c4775597e
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_11000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ca51914c6e828cabef24620d54840c1355c152f5
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fea72ecb9a3f26e092ae472e8fe7302e710ac3ce88459751f9b3356b77e07294
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_12000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d058f1b76259baa7d5c589b58425cc90f8cd4d6
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11bccb16f4e719b688af2354ef97f4ff742f05e8633b32537af3b4327b60fd63
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_13000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a7baaaa9513b6a19fd02f080f14a4db1a3d8b0f5
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8e9714cf45a316f1c6ce6c9175d953549c83672ac50097a08cff6c105b91efe
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_14000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fe1f237838ebc96c198f02d0283d63f6d389debd
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d68b08a96f0d9030740cb911b38932b1e06a4310c63ae20d3f45a1805eee3eeb
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_15000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59d910b52deabbf6ed45d083a90fbcdec09b5df7
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb83e315f2a4475df30c3eebbca80bf0db8e6a03378d80cb7173a584b2a573b6
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_2000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..291aaeb6e7a85264242c50de1be6a78933c2ba1c
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d6c4a4c83e346de97d4ff6cc8e8f4a74ac7b93f847439ad70ccef0ae2caf30c
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_3000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ac4292c9ace7ea5f84f99860e5bf8d21619a0fa9
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3709fc1c4ee2f0133442a25326d061a70010894151b7b1e4655e09389ebe750
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_4000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b22fd524b784ad384491033eb1ffa1d94cb2e54
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7757dd04377135669a447a5c81714fe7428081c5f0b5871cb5c5e4122a97f2ec
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_5000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b63f2c67ba18d39d5c1a4f3edd66a8eaaf678ffe
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99fdd0c33c6507a4e3de3908dbec1175e0be3eb1105c4bcdb6ff8015723caedf
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_6000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb75558e9892ce2e0694c67df0849be7b67d92ae
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a32c2a577168221f1c632ca81952072bf37f66585781189db476738be7ca868d
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_7000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45c476e264d46bd016760f9ddf1f07bced6bad51
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ebeb4fd972a34bd54e544f3a0d258e005c4b40d4e89d45c76a2276cb58d57d8
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_8000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..23960a03f0769236803acebdf6b0870e2b4e916a
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61289bace0732321741b3315a811dfda363ddc3591efcd441e37063797f28a2e
+size 2533545094
diff --git a/first_attention_2_attention_unfreeze/model_9000/pytorch_model.bin b/first_attention_2_attention_unfreeze/model_9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..05bb65f7467be0e0b85f97dcf21223595dd561ea
--- /dev/null
+++ b/first_attention_2_attention_unfreeze/model_9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ec321d147eba726dae0a00e3d99d130cca93b78ca48e81b217a33d2df5cfa88
+size 2533545094
diff --git a/first_attention_resume_loss_convergence/model_3000/pytorch_model.bin b/first_attention_resume_loss_convergence/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..81816343f415ead9389705edf3a4e24bb34451a9
--- /dev/null
+++ b/first_attention_resume_loss_convergence/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41187b3bb73fe6be85771de00c84e4368d0566bc7e613d3630edb3d9149c890c
+size 2533545094
diff --git a/first_layer_1/final_model/pytorch_model.bin b/first_layer_1/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..76fee640f34bfb414450e656d172570fae64fdb4
--- /dev/null
+++ b/first_layer_1/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5cc8ca606b5c08b402979c6a187af04dad556c28212329f880e7c21ee9327e6
+size 754421397
diff --git a/first_layer_1/model_13000/pytorch_model.bin b/first_layer_1/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..21e3c0d51e3ba571fee6d3789c7b93fd34f42edc
--- /dev/null
+++ b/first_layer_1/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:475c20d0f1341b67290dfd842db09917f5317a0c6ff60f0f248ec68449ecd7b3
+size 754421397
diff --git a/first_layer_1/model_2000/pytorch_model.bin b/first_layer_1/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc9d5aa4e11016fb94793564a7e805d93e020b12
--- /dev/null
+++ b/first_layer_1/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63c4a6b76b99daddee34fe8ec21e07bcb4b59d808c5b376220ce401925245381
+size 754421397
diff --git a/first_layer_1/model_3000/pytorch_model.bin b/first_layer_1/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..812fc4e2b97431b873cdc6f3ceefba7af8bd6e92
--- /dev/null
+++ b/first_layer_1/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:882d77bc2a3e9850f0dbb3d2fef9c496c0295408294c0bff91c83fac149c2f8b
+size 754421397
diff --git a/first_layer_2/final_model/pytorch_model.bin b/first_layer_2/final_model/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fda57ad8105bc6e0a08f7d08fb0b03d08588253a
--- /dev/null
+++ b/first_layer_2/final_model/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1055488d44e419beec7f4dc8586960f627d130b2d9b3534af2dc0ae37b778b40
+size 2533545094
diff --git a/first_layer_2/model_1000/pytorch_model.bin b/first_layer_2/model_1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..94011d8c2c15c19e146fed9eb36eccc5de7472d0
--- /dev/null
+++ b/first_layer_2/model_1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef4918e831f53b9fdf98db08391b70d82efc7d4e0c7888cd688d46b141051790
+size 2533545094
diff --git a/first_layer_2/model_10000/pytorch_model.bin b/first_layer_2/model_10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3814aa6ffdf30ec933169b2ded9aadc0c77417d0
--- /dev/null
+++ b/first_layer_2/model_10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6205f6986915a368dca66aef15ea722192e30f3037ddd4034af6ee4377f2344c
+size 2533545094
diff --git a/first_layer_2/model_11000/pytorch_model.bin b/first_layer_2/model_11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c83290c0f62817c588dae02e1a52b6bc74d3ef8
--- /dev/null
+++ b/first_layer_2/model_11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:caa3e0c36245f8997ebb120617e85d06a65d9c434e6f47e41533feb9daa35abb
+size 2533545094
diff --git a/first_layer_2/model_12000/pytorch_model.bin b/first_layer_2/model_12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dd5a3afa547c0f39ec31712185848f3244017189
--- /dev/null
+++ b/first_layer_2/model_12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d651ca79857ff137a36fffe59a3d6d1562cd6fea95cdd11b6ecf0da0de6353
+size 2533545094
diff --git a/first_layer_2/model_13000/pytorch_model.bin b/first_layer_2/model_13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1654452eebea62746f65c0cc447a5f1a08d8ee40
--- /dev/null
+++ b/first_layer_2/model_13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff95868727cab2b481b43497784642da1a323a82c5b184a3d935624ffca7c17e
+size 2533545094
diff --git a/first_layer_2/model_14000/pytorch_model.bin b/first_layer_2/model_14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aedd3a15ffc28463d8b7f66e40fd90ac20605d6a
--- /dev/null
+++ b/first_layer_2/model_14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bb9ba7f46f65527b44e164a8344b3b7a57c9d63ab1c6434a168ac53884c81ce
+size 2533545094
diff --git a/first_layer_2/model_15000/pytorch_model.bin b/first_layer_2/model_15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fda57ad8105bc6e0a08f7d08fb0b03d08588253a
--- /dev/null
+++ b/first_layer_2/model_15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1055488d44e419beec7f4dc8586960f627d130b2d9b3534af2dc0ae37b778b40
+size 2533545094
diff --git a/first_layer_2/model_2000/pytorch_model.bin b/first_layer_2/model_2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4643a50ddc07725a68d5f43c0a830b0b94d8f141
--- /dev/null
+++ b/first_layer_2/model_2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9226f224657e6bef6c1526a23710e30324f350e9f78614cf76dbe3efbeabba88
+size 2533545094
diff --git a/first_layer_2/model_3000/pytorch_model.bin b/first_layer_2/model_3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..16e2b22085ca618e91c1d16c41627080121b4abf
--- /dev/null
+++ b/first_layer_2/model_3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44264c7bf0d5f083455ee4d2635d3103026acf0640bb4061f057a17c00890149
+size 2533545094
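Two things stand out in the pointer runs above. The first_layer_1 checkpoints are 754,421,397 bytes (about 189M fp32 parameters) rather than the 2,533,545,094 bytes seen elsewhere, so that run trains a smaller model configuration. And final_model shares its oid with model_15000 (1055488d44e4... for first_layer_2, and likewise in the other runs), meaning final_model is a byte-for-byte copy of the last step-15000 checkpoint, as expected with num_training_steps: 15000 and save_every: 1000. A sketch that surfaces such duplicates from the pointer files themselves (it assumes the checkout still holds the three-line LFS pointers rather than the smudged binaries, and the run/checkpoint/pytorch_model.bin layout shown in this diff):

    # Group checkpoint files by LFS oid to find byte-identical copies,
    # e.g. final_model vs model_15000 pairs.
    import collections, pathlib

    by_oid = collections.defaultdict(list)
    for p in sorted(pathlib.Path(".").glob("*/*/pytorch_model.bin")):
        # Each pointer line is "key value"; parse into a dict.
        fields = dict(line.split(" ", 1) for line in p.read_text().splitlines())
        by_oid[fields["oid"], fields["size"]].append(str(p))

    for (oid, size), paths in by_oid.items():
        if len(paths) > 1:
            print(oid.split(":", 1)[1][:12], size, paths)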
diff --git a/first_layer_2/model_4000/pytorch_model.bin b/first_layer_2/model_4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..47ad51fa9b742432ce389e11a0a630e9ae2aae5a
--- /dev/null
+++ b/first_layer_2/model_4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77b49ed3050093f152650ea9c90d69a38bc14a9147f54f3a5ff90f5d2f4bf646
+size 2533545094
diff --git a/first_layer_2/model_5000/pytorch_model.bin b/first_layer_2/model_5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d05868dfbfb725aa756f28b8fda9bad9d19e95a3
--- /dev/null
+++ b/first_layer_2/model_5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08a6559722b6b81c85cdbed675563f997e7c9d3b3c79a8fdfcec9a0130635c23
+size 2533545094
diff --git a/first_layer_2/model_6000/pytorch_model.bin b/first_layer_2/model_6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..11e53e755c16482bae099ea803916eef5f42e34b
--- /dev/null
+++ b/first_layer_2/model_6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19174bee86c290274e86a3ad18746710f81714649969205d271a7d4f636bdcfd
+size 2533545094
diff --git a/first_layer_2/model_7000/pytorch_model.bin b/first_layer_2/model_7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..259c25f8201e6a8a2e7908588ffee2a719ee88b7
--- /dev/null
+++ b/first_layer_2/model_7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc19c9ffaa36a141c14208eabfb13a9ba2a4d88a6a14ba02075f934c8c1acc9a
+size 2533545094
diff --git a/first_layer_2/model_8000/pytorch_model.bin b/first_layer_2/model_8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..11a29cd207731670d72a2a7e3c6eb7090fbaec9b
--- /dev/null
+++ b/first_layer_2/model_8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aa26db628424d0c9512b9406e4c864071a76c4f6175947eef0a2827583d2151
+size 2533545094
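Each pytorch_model.bin entry in this diff is a Git LFS pointer (version, oid, size), not the tensor data itself, so a fetched checkpoint can be verified against the diff without any LFS tooling. A minimal sketch; the path is illustrative and the oid/size are taken from the model_8000 entry above:

    # Verify a downloaded checkpoint against its LFS pointer.
    import hashlib, os

    path = "first_layer_2/model_8000/pytorch_model.bin"
    expected_oid = "3aa26db628424d0c9512b9406e4c864071a76c4f6175947eef0a2827583d2151"
    expected_size = 2533545094

    assert os.path.getsize(path) == expected_size
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash 1 MiB at a time
            h.update(chunk)
    assert h.hexdigest() == expected_oid
    print("checkpoint matches its LFS pointer")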