Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- baseline/final_model/config.json +19 -0
- baseline/model_1000/model_config.json +19 -0
- baseline/model_1000/training_state.json +8 -0
- baseline/model_10000/model_config.json +19 -0
- baseline/model_10000/training_state.json +8 -0
- baseline/model_11000/model_config.json +19 -0
- baseline/model_11000/training_state.json +8 -0
- baseline/model_12000/model_config.json +19 -0
- baseline/model_12000/training_state.json +8 -0
- baseline/model_13000/model_config.json +19 -0
- baseline/model_13000/training_state.json +8 -0
- baseline/model_14000/model_config.json +19 -0
- baseline/model_14000/training_state.json +8 -0
- baseline/model_15000/model_config.json +19 -0
- baseline/model_15000/training_state.json +8 -0
- baseline/model_2000/model_config.json +19 -0
- baseline/model_2000/training_state.json +8 -0
- baseline/model_3000/model_config.json +19 -0
- baseline/model_3000/training_state.json +8 -0
- baseline/model_4000/model_config.json +19 -0
- baseline/model_4000/training_state.json +8 -0
- baseline/model_5000/model_config.json +19 -0
- baseline/model_5000/training_state.json +8 -0
- baseline/model_6000/model_config.json +19 -0
- baseline/model_6000/training_state.json +8 -0
- baseline/model_7000/model_config.json +19 -0
- baseline/model_7000/training_state.json +8 -0
- baseline/model_8000/model_config.json +19 -0
- baseline/model_8000/training_state.json +8 -0
- baseline/model_9000/model_config.json +19 -0
- baseline/model_9000/training_state.json +8 -0
- baseline/training_config.yaml +42 -0
- first_attention_2_attention_unfreeze/model_13000/config.json +19 -0
- first_attention_resume_loss_convergence/model_3000/config.json +19 -0
- first_attention_resume_loss_convergence/model_3000/training_state.json +8 -0
- first_attention_resume_loss_convergence/training_config.yaml +47 -0
- first_layer_2/final_model/config.json +19 -0
- first_layer_2/model_1000/config.json +19 -0
- first_layer_2/model_11000/config.json +19 -0
- first_layer_2/model_11000/training_state.json +8 -0
- first_layer_2/model_12000/config.json +19 -0
- first_layer_2/model_13000/config.json +19 -0
- first_layer_2/model_13000/training_state.json +8 -0
- first_layer_2/model_14000/config.json +19 -0
- first_layer_2/model_14000/training_state.json +8 -0
- first_layer_2/model_15000/training_state.json +8 -0
- first_layer_2/model_2000/config.json +19 -0
- first_layer_2/model_2000/training_state.json +8 -0
- first_layer_2/model_3000/config.json +19 -0
- first_layer_2/model_3000/training_state.json +8 -0
baseline/final_model/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_1000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_1000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 1000,
+  "update_step": 1000,
+  "tokens_seen": 639025152,
+  "tokens_seen_before": 638386176,
+  "update_time": 3.298034429550171,
+  "wandb_id": "tytetz1i"
+}
baseline/model_10000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_10000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 10000,
+  "update_step": 10000,
+  "tokens_seen": 6389809152,
+  "tokens_seen_before": 6389170176,
+  "update_time": 3.303941011428833,
+  "wandb_id": "tytetz1i"
+}
baseline/model_11000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_11000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 3.299642324447632,
+  "wandb_id": "tytetz1i"
+}
baseline/model_12000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_12000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 12000,
+  "update_step": 12000,
+  "tokens_seen": 7667761152,
+  "tokens_seen_before": 7667122176,
+  "update_time": 3.298994541168213,
+  "wandb_id": "tytetz1i"
+}
baseline/model_13000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_13000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 13000,
+  "update_step": 13000,
+  "tokens_seen": 8306737152,
+  "tokens_seen_before": 8306098176,
+  "update_time": 3.3014893531799316,
+  "wandb_id": "tytetz1i"
+}
baseline/model_14000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_14000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 3.3026955127716064,
+  "wandb_id": "tytetz1i"
+}
baseline/model_15000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_15000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 3.3024754524230957,
+  "wandb_id": "tytetz1i"
+}
baseline/model_2000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_2000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 2000,
+  "update_step": 2000,
+  "tokens_seen": 1278001152,
+  "tokens_seen_before": 1277362176,
+  "update_time": 3.2954349517822266,
+  "wandb_id": "tytetz1i"
+}
baseline/model_3000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_3000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 3000,
+  "update_step": 3000,
+  "tokens_seen": 1916977152,
+  "tokens_seen_before": 1916338176,
+  "update_time": 3.3039631843566895,
+  "wandb_id": "tytetz1i"
+}
baseline/model_4000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_4000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 4000,
+  "update_step": 4000,
+  "tokens_seen": 2555953152,
+  "tokens_seen_before": 2555314176,
+  "update_time": 3.2994651794433594,
+  "wandb_id": "tytetz1i"
+}
baseline/model_5000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_5000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 5000,
+  "update_step": 5000,
+  "tokens_seen": 3194929152,
+  "tokens_seen_before": 3194290176,
+  "update_time": 3.3040542602539062,
+  "wandb_id": "tytetz1i"
+}
baseline/model_6000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_6000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 6000,
+  "update_step": 6000,
+  "tokens_seen": 3833905152,
+  "tokens_seen_before": 3833266176,
+  "update_time": 3.2993881702423096,
+  "wandb_id": "tytetz1i"
+}
baseline/model_7000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_7000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 7000,
+  "update_step": 7000,
+  "tokens_seen": 4472881152,
+  "tokens_seen_before": 4472242176,
+  "update_time": 3.297553777694702,
+  "wandb_id": "tytetz1i"
+}
baseline/model_8000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_8000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 8000,
+  "update_step": 8000,
+  "tokens_seen": 5111857152,
+  "tokens_seen_before": 5111218176,
+  "update_time": 3.300821542739868,
+  "wandb_id": "tytetz1i"
+}
baseline/model_9000/model_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
baseline/model_9000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 9000,
+  "update_step": 9000,
+  "tokens_seen": 5750833152,
+  "tokens_seen_before": 5750194176,
+  "update_time": 3.299265146255493,
+  "wandb_id": "tytetz1i"
+}
baseline/training_config.yaml
ADDED
@@ -0,0 +1,42 @@
+adam_beta1: 0.9
+adam_beta2: 0.95
+adjust_step: 0
+autoresume: false
+batch_size: 6
+clip_grad_norm: 1.0
+comment: null
+cycle_length: null
+dtype: bfloat16
+eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+eval_every: 1000
+gradient_accumulation: 13
+keep_checkpoints: null
+load_optimizer_state_on_resume: true
+lr: 0.0004
+max_length: 1024
+max_train_tokens: null
+min_lr_ratio: 0.1
+model_config: model_config/478m.json
+model_name_or_path: null
+model_revision: null
+num_training_steps: 15000
+optimizer: Adam
+restart_warmup_steps: null
+resume_from: null
+run_name: dark-moon-14
+save_dir: checkpoints/dark-moon-14
+save_every: 1000
+scheduler: cosine
+seed: 0
+shuffle: true
+skip_batches: !!set {}
+tags:
+- 396m-for-680m
+total_batch_size: 624
+train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+training_config: training_config/478m.yaml
+wandb_watch: true
+warmed_up_model: null
+warmup_steps: 1500
+weight_decay: 0.0
+workers: 8
first_attention_2_attention_unfreeze/model_13000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_resume_loss_convergence/model_3000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_attention_resume_loss_convergence/model_3000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 3000,
+  "update_step": 3000,
+  "tokens_seen": 1916977152,
+  "tokens_seen_before": 1916338176,
+  "update_time": 2.884432792663574,
+  "wandb_id": "hkuum9kt"
+}
first_attention_resume_loss_convergence/training_config.yaml
ADDED
@@ -0,0 +1,47 @@
+adam_beta1: 0.9
+adam_beta2: 0.95
+adjust_step: 0
+autoresume: false
+batch_size: 6
+clip_grad_norm: 1.0
+comment: null
+cycle_length: null
+dtype: bfloat16
+emb_freeze: null
+eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+eval_every: 1000
+first_attention: false
+first_attention_resume: false
+gradient_accumulation: 13
+keep_checkpoints: null
+layer_freeze: null
+layer_freeze_2: false
+load_optimizer_state_on_resume: true
+lr: 0.0004
+max_length: 1024
+max_train_tokens: null
+min_lr_ratio: 0.1
+model_config: model_config/478m.json
+model_name_or_path: null
+model_revision: null
+num_training_steps: 15000
+optimizer: Adam
+restart_warmup_steps: null
+resume_from: /lee_embedding/checkpoints/first_attention_resume_loss_convergence/model_3000
+run_name: first_attention_resume_loss_convergence
+save_dir: checkpoints/first_attention_resume_loss_convergence
+save_every: 1000
+scheduler: cosine
+seed: 0
+shuffle: true
+skip_batches: !!set {}
+tags:
+- 396m-for-680m
+total_batch_size: 624
+train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+training_config: training_config/two_stage/478m_first_attention_resume_loss_convergence.yaml
+wandb_watch: true
+warmed_up_model: null
+warmup_steps: 1500
+weight_decay: 0.0
+workers: 8
first_layer_2/final_model/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_1000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_11000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_11000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 11000,
+  "update_step": 11000,
+  "tokens_seen": 7028785152,
+  "tokens_seen_before": 7028146176,
+  "update_time": 2.8764145374298096,
+  "wandb_id": "f3ljzhyw"
+}
first_layer_2/model_12000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_13000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_13000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 13000,
+  "update_step": 13000,
+  "tokens_seen": 8306737152,
+  "tokens_seen_before": 8306098176,
+  "update_time": 2.8782799243927,
+  "wandb_id": "f3ljzhyw"
+}
first_layer_2/model_14000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_14000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 14000,
+  "update_step": 14000,
+  "tokens_seen": 8945713152,
+  "tokens_seen_before": 8945074176,
+  "update_time": 2.8797836303710938,
+  "wandb_id": "f3ljzhyw"
+}
first_layer_2/model_15000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 15000,
+  "update_step": 15000,
+  "tokens_seen": 9584689152,
+  "tokens_seen_before": 9584050176,
+  "update_time": 2.877997398376465,
+  "wandb_id": "f3ljzhyw"
+}
first_layer_2/model_2000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_2000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 2000,
+  "update_step": 2000,
+  "tokens_seen": 1278001152,
+  "tokens_seen_before": 1277362176,
+  "update_time": 2.8755557537078857,
+  "wandb_id": "f3ljzhyw"
+}
first_layer_2/model_3000/config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "architectures": [
+    "LLaMAForCausalLM"
+  ],
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "intermediate_size": 5376,
+  "initializer_range": 0.02,
+  "max_sequence_length": 1024,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 14,
+  "rms_norm_eps": 1e-05,
+  "transformers_version": "4.28.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}
first_layer_2/model_3000/training_state.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "global_step": 3000,
+  "update_step": 3000,
+  "tokens_seen": 1916977152,
+  "tokens_seen_before": 1916338176,
+  "update_time": 2.87614107131958,
+  "wandb_id": "f3ljzhyw"
+}