Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- attention_2_only_emb/model_1000/optimizer.pt +3 -0
- attention_2_only_emb/model_11000/optimizer.pt +3 -0
- attention_2_only_emb/model_13000/optimizer.pt +3 -0
- attention_2_only_emb/model_14000/optimizer.pt +3 -0
- attention_2_only_emb/model_2000/optimizer.pt +3 -0
- attention_2_only_emb/model_3000/optimizer.pt +3 -0
- attention_2_only_emb/model_4000/optimizer.pt +3 -0
- attention_2_only_emb/model_5000/optimizer.pt +3 -0
- attention_2_only_emb/model_7000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_9000/optimizer.pt +3 -0
- bigram_1/model_13000/optimizer.pt +3 -0
- bigram_1/model_2000/optimizer.pt +3 -0
- first_attention_2/model_1000/config.json +19 -0
- first_attention_2/model_1000/training_state.json +8 -0
- first_attention_2/model_10000/config.json +19 -0
- first_attention_2/model_10000/training_state.json +8 -0
- first_attention_2/model_11000/config.json +19 -0
- first_attention_2/model_11000/training_state.json +8 -0
- first_attention_2/model_12000/config.json +19 -0
- first_attention_2/model_12000/training_state.json +8 -0
- first_attention_2/model_14000/config.json +19 -0
- first_attention_2/model_14000/training_state.json +8 -0
- first_attention_2/model_15000/config.json +19 -0
- first_attention_2/model_15000/training_state.json +8 -0
- first_attention_2/model_5000/config.json +19 -0
- first_attention_2/model_5000/training_state.json +8 -0
- first_attention_2/model_6000/config.json +19 -0
- first_attention_2/model_6000/training_state.json +8 -0
- first_attention_2/model_7000/config.json +19 -0
- first_attention_2/model_7000/training_state.json +8 -0
- first_attention_2/model_8000/config.json +19 -0
- first_attention_2/model_8000/training_state.json +8 -0
- first_attention_2/model_9000/config.json +19 -0
- first_attention_2/model_9000/training_state.json +8 -0
- first_attention_2_attention_unfreeze/model_1000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_10000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_12000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_13000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_15000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_2000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_3000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_4000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_5000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_6000/optimizer.pt +3 -0
- first_attention_2_attention_unfreeze/model_8000/optimizer.pt +3 -0
- first_attention_resume_loss_convergence/model_3000/optimizer.pt +3 -0
- first_layer_1/model_1000/optimizer.pt +3 -0
- first_layer_1/model_1000/pytorch_model.bin +3 -0
- first_layer_1/model_10000/optimizer.pt +3 -0
- first_layer_1/model_10000/pytorch_model.bin +3 -0
attention_2_only_emb/model_1000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90e36709a2be1f44091ca2b79e3da0d807a8e1b63e9dd484ca40aa563d6806f9
+size 3831971578
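Note: every large file in this commit is stored as a Git LFS pointer like the one above, a three-line stub recording the pointer-spec version, the SHA-256 oid of the real payload, and its size in bytes. As a minimal sketch (not part of the repo; the path is one of the files listed above, read before LFS smudging), the stub parses with a few lines of Python:

# Hypothetical helper: parse a Git LFS pointer stub into its "key value" fields.
def parse_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("attention_2_only_emb/model_1000/optimizer.pt")
print(ptr["oid"], ptr["size"])  # sha256:90e36709... 3831971578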
attention_2_only_emb/model_11000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e1034cac02becda9747198b1d50eef735ee2f09feda0072544b26e891950744
+size 3831971578
attention_2_only_emb/model_13000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8bd5fcd410c58a01c810503b05f4b10d79dae9c30ce05db36d1a1ab9acf891a
+size 3831971578
attention_2_only_emb/model_14000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56aee6ce75242ef897404ce0d71199504b76bd1d94d5e1944eaf60383bbd1c9a
+size 3831971578
attention_2_only_emb/model_2000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fff35e5764903efe1bc0e3c6183bcede724b86a46808ee76b9ca1a603638b9e0
+size 3831971578
attention_2_only_emb/model_3000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f62e15803c9dec9b94261f7dd38148f24cca9d0fb2904f5d98fba4499d99f47c
+size 3831971578
attention_2_only_emb/model_4000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfb492a7c81974c21467d61d402f328d07fc4fbec60714e63a347b68edc23835
+size 3831971578
attention_2_only_emb/model_5000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f487d5050fc7515d3b5ae387945268e79be8bcb60fe97ac101217fbf0d7ec2b
+size 3831971578
attention_2_only_emb/model_7000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78fc551240970d2e19e77c2c6d786e1133a98032538efbc249a7900c5772506b
+size 2533545094
attention_2_only_emb/model_9000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:999c518c597d729ba6de7041a915ef77e80d142438acff911ab850422a16d60a
+size 3831971578
bigram_1/model_13000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d22bb0bb9a1d249471583ae4ee828f82e867df6e0ddba520594ef2019efa7a56
+size 1235135766
bigram_1/model_2000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af1e99bcab5a86059566dc445b7ba5350f5a19fcd7b6edb0df72a96bd3ffbcd6
+size 1235135766
first_attention_2/model_1000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
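Note: this config (repeated for every first_attention_2 checkpoint below) describes a 14-layer LLaMA-style model with hidden size 1536, 24 heads, and a 50257-token GPT-2-style vocabulary, roughly 633M parameters, consistent with the ~2.53 GB fp32 pytorch_model.bin entries above. A minimal loading sketch, assuming transformers >= 4.28 and that the checkpoint directory (including its pytorch_model.bin, not shown in this truncated view) has been fetched locally:

import torch
from transformers import LlamaConfig, LlamaForCausalLM

config = LlamaConfig.from_pretrained("first_attention_2/model_1000")
model = LlamaForCausalLM(config)
state = torch.load("first_attention_2/model_1000/pytorch_model.bin", map_location="cpu")
# Assumption: key names may need remapping if the weights were saved with a
# pre-4.28 community LLaMA port rather than the merged transformers classes.
model.load_state_dict(state)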
first_attention_2/model_1000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 2.8817572593688965,
+  "wandb_id": "hkuum9kt"
+}
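Note: the training-state files pin down the effective batch: tokens_seen minus tokens_seen_before is 638,976 tokens per update at every checkpoint in this commit, i.e. 624 sequences of the 1024-token max_sequence_length from config.json (an inference, assuming full-length packing). Checking the arithmetic:

tokens_per_update = 639025152 - 638386176   # tokens_seen - tokens_seen_before
assert tokens_per_update == 638976
print(tokens_per_update // 1024)            # 624 sequences per update (assumed packing)
print(round(tokens_per_update / 2.8817572593688965))  # ~221731 tokens/s per update_time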
first_attention_2/model_10000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_10000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 2.883666515350342,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_11000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_11000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 2.884199380874634,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_12000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_12000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 2.8828084468841553,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_14000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_14000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 2.8815410137176514,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_15000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_15000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 2.8837459087371826,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_5000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_5000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 2.882314920425415,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_6000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_6000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 2.8819949626922607,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_7000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_7000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 2.882638931274414,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_8000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_8000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 2.883155584335327,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2/model_9000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_2/model_9000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 2.8817710876464844,
+  "wandb_id": "hkuum9kt"
+}
first_attention_2_attention_unfreeze/model_1000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:335ca2fea887d6351734877081fb0485012166dacdd9dffddaf45a97542c5540
+size 3831971514
first_attention_2_attention_unfreeze/model_10000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:399f3c992ccedd62a1a627476e1fbf2da337f382748b28500155dd76f9c9c1c1
+size 3831971514
first_attention_2_attention_unfreeze/model_12000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f433318142a54b2952f46d861bffae52d16c319c33deb9ed0885fece81e5988b
+size 3831971514
first_attention_2_attention_unfreeze/model_13000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8231c6fc85acb8a3f9dffebbd2ef7bf2ededb60ff0e9156c834e55171b4890a0
+size 3831971514
first_attention_2_attention_unfreeze/model_15000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b28f9756b186c271da190744e6caade888e2f28220db16562a52eca35adbecbc
+size 3831971514
first_attention_2_attention_unfreeze/model_2000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4bf7e28193962e52d92b64d8b1191455056d8f329b5d3030920296e6d1352505
+size 3831971514
first_attention_2_attention_unfreeze/model_3000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df15bb40ab7fcf7f243ec79873a49d97d4d9cd0b250cb6a0564eb2c6f70afe41
+size 3831971514
first_attention_2_attention_unfreeze/model_4000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2770afb7ed9c9c02311193c7d3448277c0abde3999b0605297c98cf592f96b2b
+size 3831971514
first_attention_2_attention_unfreeze/model_5000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97a4593759067a02ce02201600bc3849ffe4cbe3cb38d139843a891a86766e2d
+size 3831971514
first_attention_2_attention_unfreeze/model_6000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:978162edef6a89660973a72eb5060715911fae80e422fc57ecb900135db49035
+size 3831971514
first_attention_2_attention_unfreeze/model_8000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bf28dd241e068f5023e250724d1e5627d72274f5df6ec5f2bd63031d09a40e8
+size 3831971514
first_attention_resume_loss_convergence/model_3000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7b78cdb11858e591dc6fac98c8ab5c1bbe9e75fb0e34ba23f7fb3e40a817dfc
+size 3756457274
first_layer_1/model_1000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f9e122b12f54c1b0db7d1247c17b1944413ea784d421a3cc1e30d083b987997
+size 1508846842
first_layer_1/model_1000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa53593417514a2af1663dabfa810205ce8806929b866234a00019623c1a402a
+size 754421397
first_layer_1/model_10000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c0741af4742da582f045d3ca5062ab1de591ac3cc6859f263f6df901b4f8fa0
+size 1508846842
first_layer_1/model_10000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f432387b760233d555fd966f70c99c54cbe73c9c7983c45d08dbfad81f0d262e
+size 754421397
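Note: because each pointer records the payload's SHA-256 and byte size, any fetched checkpoint can be verified against its stub. A minimal, self-contained sketch (hypothetical helper; the oid and size are the first_layer_1/model_10000/pytorch_model.bin values from this diff):

import hashlib
import os

def matches_pointer(path, expected_oid, expected_size):
    # Compare on-disk size first, then stream the file through SHA-256.
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(1 << 20):
            h.update(chunk)
    return h.hexdigest() == expected_oid

print(matches_pointer(
    "first_layer_1/model_10000/pytorch_model.bin",
    "f432387b760233d555fd966f70c99c54cbe73c9c7983c45d08dbfad81f0d262e",
    754421397,
))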