Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full file list.
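The commit was produced with `huggingface_hub`'s large-folder uploader, which walks a local checkpoint tree, uploads the multi-GB `.bin` files through Git LFS, and can resume if interrupted — which is why the per-file diffs below show three-line LFS pointers rather than raw weights. A minimal sketch of the call (the repo id and local path are placeholders, not taken from this repo):

```python
from huggingface_hub import HfApi

api = HfApi()
# Placeholder repo id and folder. upload_large_folder is resumable, so an
# interrupted upload of many 2.5 GB checkpoints can be restarted safely.
api.upload_large_folder(
    repo_id="user/model-checkpoints",
    folder_path="./checkpoints",
    repo_type="model",
)
```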
- attention_2_only_emb/final_model/pytorch_model.bin +3 -0
- attention_2_only_emb/model_1000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_10000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_11000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_13000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_14000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_15000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_4000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_5000/pytorch_model.bin +3 -0
- attention_2_only_emb/model_9000/pytorch_model.bin +3 -0
- baseline/model_1000/pytorch_model.bin +3 -0
- baseline/model_11000/pytorch_model.bin +3 -0
- baseline/model_12000/pytorch_model.bin +3 -0
- baseline/model_13000/pytorch_model.bin +3 -0
- baseline/model_14000/pytorch_model.bin +3 -0
- baseline/model_2000/pytorch_model.bin +3 -0
- baseline/model_3000/pytorch_model.bin +3 -0
- baseline/model_4000/pytorch_model.bin +3 -0
- baseline/model_5000/pytorch_model.bin +3 -0
- baseline/model_6000/pytorch_model.bin +3 -0
- baseline/model_7000/pytorch_model.bin +3 -0
- baseline/model_8000/pytorch_model.bin +3 -0
- baseline/model_9000/pytorch_model.bin +3 -0
- bigram_2/final_model/config.json +19 -0
- bigram_2/model_1000/model_config.json +19 -0
- bigram_2/model_1000/training_state.json +8 -0
- bigram_2/model_10000/model_config.json +19 -0
- bigram_2/model_10000/training_state.json +8 -0
- bigram_2/model_11000/model_config.json +19 -0
- bigram_2/model_11000/training_state.json +8 -0
- bigram_2/model_12000/model_config.json +19 -0
- bigram_2/model_12000/training_state.json +8 -0
- bigram_2/model_14000/model_config.json +19 -0
- bigram_2/model_14000/training_state.json +8 -0
- bigram_2/model_15000/model_config.json +19 -0
- bigram_2/model_15000/training_state.json +8 -0
- bigram_2/model_4000/model_config.json +19 -0
- bigram_2/model_5000/model_config.json +19 -0
- bigram_2/model_5000/training_state.json +8 -0
- bigram_2/model_6000/model_config.json +19 -0
- bigram_2/model_6000/training_state.json +8 -0
- bigram_2/model_7000/model_config.json +19 -0
- bigram_2/model_7000/training_state.json +8 -0
- bigram_2/model_8000/model_config.json +19 -0
- bigram_2/model_8000/training_state.json +8 -0
- bigram_2/model_9000/model_config.json +19 -0
- bigram_2/model_9000/training_state.json +8 -0
- bigram_2_full/final_model/pytorch_model.bin +3 -0
- bigram_2_full/model_1000/pytorch_model.bin +3 -0
- bigram_2_full/model_10000/pytorch_model.bin +3 -0
attention_2_only_emb/final_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b
+size 2533545094
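Each `pytorch_model.bin` entry in this commit is a Git LFS pointer, not the weights themselves: the pointer records the LFS spec version, the SHA-256 of the actual file, and its size in bytes (2,533,545,094 ≈ 2.53 GB per checkpoint). Note that the same oid reappears for `attention_2_only_emb/model_15000` below, so `final_model` is byte-identical to the step-15000 checkpoint. A small standard-library sketch of verifying a downloaded checkpoint against its pointer (path and expected values taken from the pointer above):

```python
import hashlib
import os

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against the oid/size from its LFS pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    sha = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks so the 2.5 GB file never sits in memory at once.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return sha.hexdigest() == expected_oid

ok = verify_lfs_object(
    "attention_2_only_emb/final_model/pytorch_model.bin",
    "65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b",
    2533545094,
)
```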
attention_2_only_emb/model_1000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d48e11d99e74e3ac615d5f6bd78b0f524e0ad1951fca5b223c9aec52ebb23596
+size 2533545094

attention_2_only_emb/model_10000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e190ea29dd8119871d88bc6c120b49a4c3b6e9e979286dac7b464f9c42fb5454
+size 2533545094

attention_2_only_emb/model_11000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e8a06de57e06be1e59112b19c52985b6e1e1905645464df7d675c61f8a62a67
+size 2533545094

attention_2_only_emb/model_13000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33dfd88af702c854bab079669a8dde4157c14ab4dbbea412e8382b1157fed607
+size 2533545094

attention_2_only_emb/model_14000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:288665e9230d153d7a863e7a8e66cc3c48d5133ba629a788f8c8302f62bcf508
+size 2533545094

attention_2_only_emb/model_15000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65238b445c26f43da67f2f54cecb176acdc8f4316c36230e1baccf453169754b
+size 2533545094

attention_2_only_emb/model_4000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f831b13a1e989d024050ac81156cca927d96d40afd492847292ea167b8f97d8e
+size 2533545094

attention_2_only_emb/model_5000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a72b2ed53bbf3dbc3fc4711abcd9d661d0064760380d4c3f2ce7c6b59db76dbf
+size 2533545094

attention_2_only_emb/model_9000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4851e88488581a963da7b13d25b32cf82e0512d7a393bfbf957b3671ff379c7a
+size 2533545094

baseline/model_1000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00edd9bd916b1d6059044c1e496b6c21775ba6e8cfc08fb54ea38a112cefeb4e
+size 2533545094

baseline/model_11000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef2caffc4e634a9b5cc17988d11913c72e3a6eeede9a45f38257db62bff5f661
+size 2533545094

baseline/model_12000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e719eae49624f71ba9389bd9764c1cb5d07c0acf33a1220eb4f8dda73240b9b
+size 2533545094

baseline/model_13000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:672bb2ac05b1618a7d748e23fa392f5197065cef854f22b6dce109b472470517
+size 2533545094

baseline/model_14000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4739b4a78103197ee542a1010ba48e38824f60f5ec00ce70a9ae33ca5f2e4343
+size 2533545094

baseline/model_2000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc43549f54dd1e2edc06ec8ea811fc48c5b51a83830ec22027266878f3657b91
+size 2533545094

baseline/model_3000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83671facca5fb99530b5c52bbd111dda3383639f4ef50f90bd7379aa1b37fae2
+size 2533545094

baseline/model_4000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebb2010e8f368ae6fe61f89a9d71b8ee6a821a59727301688097911f2337d544
+size 2533545094

baseline/model_5000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d88bc7652172e2291ad1e16d2326344a1f9651dec698d2f14ff2b8a909f82dc
+size 2533545094

baseline/model_6000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e8417cded07b68977443fb14688e0a5d1fdfe3727a5a58da05dd5df10423795
+size 2533545094

baseline/model_7000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bedb26968104d1fe2d880f5b4e0ebb006fbea3dde5ca3603f7606c07e0a4bed8
+size 2533545094

baseline/model_8000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7d8381c12bd8d5c961d08860dfeac2a8cfbea2af8400b6d62bd2623c90258f5
+size 2533545094

baseline/model_9000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d62dca90201b94b66ead46c71e3b2036bbe678cd38ab831997b6ce49b858773f
+size 2533545094
bigram_2/final_model/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
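This config describes a LLaMA-style decoder-only model: 14 layers, hidden size 1536, 24 attention heads, SiLU-gated MLPs of width 5376, a 1,024-token context, and GPT-2's 50,257-token vocabulary (BOS and EOS both 50256). With untied embeddings that works out to roughly 633M parameters, consistent with the 2,533,545,094-byte checkpoints above (2533545094 / 4 bytes ≈ 633.4M fp32 weights). The `final_model` directory ships a standard `config.json`, so it loads directly with transformers; the intermediate checkpoints store the same fields under the non-standard name `model_config.json`, which transformers will not pick up automatically. A minimal loading sketch, assuming a local clone and that the state-dict keys follow transformers' Llama naming:

```python
from transformers import AutoModelForCausalLM

# Any directory holding config.json + pytorch_model.bin loads directly.
model = AutoModelForCausalLM.from_pretrained("bigram_2/final_model")

n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.0f}M parameters")  # ~633M, i.e. 2533545094 bytes / 4 in fp32
```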
bigram_2/model_1000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_1000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 2.892822504043579,
+  "wandb_id": "walilhuz"
+}
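The training-state snapshots pin down the batch geometry: each update consumes `tokens_seen - tokens_seen_before` = 639025152 - 638386176 = 638,976 tokens, i.e. 624 sequences at the 1,024-token length from `model_config.json`, in about 2.89 s (~221k tokens/s). The same per-step delta recurs in every snapshot below. A short sketch of recovering these numbers (standard-library `json`; the 1,024-token sequence length is taken from the config above):

```python
import json

with open("bigram_2/model_1000/training_state.json") as f:
    state = json.load(f)

tokens_per_update = state["tokens_seen"] - state["tokens_seen_before"]  # 638976
sequences_per_update = tokens_per_update // 1024                        # 624
throughput = tokens_per_update / state["update_time"]                   # ~221k tokens/s
print(tokens_per_update, sequences_per_update, f"{throughput:,.0f} tok/s")
```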
bigram_2/model_10000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_10000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 2.8910531997680664,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_11000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_11000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 2.88981556892395,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_12000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_12000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 2.8912298679351807,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_14000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_14000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 2.8920021057128906,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_15000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_15000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 2.89182710647583,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_4000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_5000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_5000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 2.8917791843414307,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_6000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_6000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 2.892122983932495,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_7000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_7000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 2.8900256156921387,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_8000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_8000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 2.8911120891571045,
+  "wandb_id": "walilhuz"
+}

bigram_2/model_9000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

bigram_2/model_9000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 2.890213966369629,
+  "wandb_id": "walilhuz"
+}
bigram_2_full/final_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:477b0b02d1fa370dd03a88359bf7784e17681740c07a6952b75b5bab9f5e333f
+size 2533545094

bigram_2_full/model_1000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f696208f3842be6f4c26371224bc32cb8cb8f5a268c879be0dd18afef088b4f
+size 2533545094

bigram_2_full/model_10000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:808a7200d616e0ad374917803a0a3e5e53c4751e33bdf1761e5d421e603581cd
+size 2533545094