gerou161 committed
Commit: 8f20568
1 Parent(s): dbcecc7

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full list.
Files changed (50):
  1. baseline/final_model/pytorch_model.bin +3 -0
  2. bigram_2_full/final_model/config.json +19 -0
  3. bigram_2_full/model_1000/training_state.json +8 -0
  4. bigram_2_full/model_11000/model_config.json +19 -0
  5. bigram_2_full/model_11000/training_state.json +8 -0
  6. bigram_2_full/model_13000/model_config.json +19 -0
  7. bigram_2_full/model_13000/training_state.json +8 -0
  8. bigram_2_full/model_14000/training_state.json +8 -0
  9. bigram_2_full/model_2000/model_config.json +19 -0
  10. bigram_2_full/model_2000/training_state.json +8 -0
  11. bigram_2_full/model_3000/model_config.json +19 -0
  12. bigram_2_full/model_3000/training_state.json +8 -0
  13. bigram_2_full/model_4000/model_config.json +19 -0
  14. bigram_2_full/model_4000/training_state.json +8 -0
  15. bigram_2_full/model_5000/model_config.json +19 -0
  16. bigram_2_full/model_5000/training_state.json +8 -0
  17. bigram_2_full/model_9000/model_config.json +19 -0
  18. bigram_2_full/model_9000/training_state.json +8 -0
  19. bigram_2_full/training_config.yaml +44 -0
  20. first_attention_2_attention_unfreeze/final_model/config.json +19 -0
  21. first_attention_2_attention_unfreeze/model_1000/config.json +19 -0
  22. first_attention_2_attention_unfreeze/model_1000/training_state.json +8 -0
  23. first_attention_2_attention_unfreeze/model_10000/config.json +19 -0
  24. first_attention_2_attention_unfreeze/model_10000/training_state.json +8 -0
  25. first_attention_2_attention_unfreeze/model_11000/config.json +19 -0
  26. first_attention_2_attention_unfreeze/model_11000/training_state.json +8 -0
  27. first_attention_2_attention_unfreeze/model_12000/config.json +19 -0
  28. first_attention_2_attention_unfreeze/model_12000/training_state.json +8 -0
  29. first_attention_2_attention_unfreeze/model_13000/training_state.json +8 -0
  30. first_attention_2_attention_unfreeze/model_14000/config.json +19 -0
  31. first_attention_2_attention_unfreeze/model_14000/training_state.json +8 -0
  32. first_attention_2_attention_unfreeze/model_15000/config.json +19 -0
  33. first_attention_2_attention_unfreeze/model_15000/training_state.json +8 -0
  34. first_attention_2_attention_unfreeze/model_2000/config.json +19 -0
  35. first_attention_2_attention_unfreeze/model_2000/training_state.json +8 -0
  36. first_attention_2_attention_unfreeze/model_3000/config.json +19 -0
  37. first_attention_2_attention_unfreeze/model_3000/training_state.json +8 -0
  38. first_attention_2_attention_unfreeze/model_4000/config.json +19 -0
  39. first_attention_2_attention_unfreeze/model_4000/training_state.json +8 -0
  40. first_attention_2_attention_unfreeze/model_5000/config.json +19 -0
  41. first_attention_2_attention_unfreeze/model_5000/training_state.json +8 -0
  42. first_attention_2_attention_unfreeze/model_6000/config.json +19 -0
  43. first_attention_2_attention_unfreeze/model_6000/training_state.json +8 -0
  44. first_attention_2_attention_unfreeze/model_7000/config.json +19 -0
  45. first_attention_2_attention_unfreeze/model_7000/training_state.json +8 -0
  46. first_attention_2_attention_unfreeze/model_8000/config.json +19 -0
  47. first_attention_2_attention_unfreeze/model_8000/training_state.json +8 -0
  48. first_attention_2_attention_unfreeze/model_9000/config.json +19 -0
  49. first_attention_2_attention_unfreeze/model_9000/training_state.json +8 -0
  50. first_attention_2_attention_unfreeze/training_config.yaml +47 -0
baseline/final_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a68abbc43af60bd4c7f2d95c72f10740acbbb95df43e3fd9ba6ae48d8c02ccc
+ size 2533545094
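The pytorch_model.bin entry above is only a Git LFS pointer (spec version, SHA-256 oid, byte size); the actual ~2.5 GB weight file lives in LFS storage. As a minimal sketch, assuming the repository is served through the Hugging Face Hub (the repo id below is a placeholder, not taken from this commit), the weights could be fetched and inspected like this:

# Sketch only: resolve the LFS pointer into the real checkpoint file.
# The repo_id is hypothetical; substitute the actual repository name.
from huggingface_hub import hf_hub_download
import torch

path = hf_hub_download(
    repo_id="<user>/<repo>",  # placeholder repo id
    filename="baseline/final_model/pytorch_model.bin",
)
state_dict = torch.load(path, map_location="cpu")  # ~2.5 GB of tensors
print(len(state_dict), "tensors loaded")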
bigram_2_full/final_model/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
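Every config.json / model_config.json in this commit is identical: a 14-layer LLaMA-style decoder with hidden size 1536, 24 attention heads, SwiGLU intermediate size 5376, and the GPT-2 vocabulary (50257 tokens, BOS/EOS 50256). As a sketch of what that implies, assuming the keys map onto the standard transformers LlamaConfig (the "LLaMAForCausalLM" spelling comes from the transformers 4.28.1 era noted in the file), one can instantiate an equivalent config and count parameters:

# Rough sketch, assuming the config maps onto transformers' LlamaConfig.
from transformers import LlamaConfig, LlamaForCausalLM

cfg = LlamaConfig(
    vocab_size=50257,
    hidden_size=1536,
    intermediate_size=5376,
    num_hidden_layers=14,
    num_attention_heads=24,
    max_position_embeddings=1024,
    rms_norm_eps=1e-05,
    bos_token_id=50256,
    eos_token_id=50256,
)
model = LlamaForCausalLM(cfg)
total = sum(p.numel() for p in model.parameters())
non_emb = total - model.model.embed_tokens.weight.numel() - model.lm_head.weight.numel()
print(f"total: {total/1e6:.0f}M, non-embedding: {non_emb/1e6:.0f}M")  # roughly 633M / 479M

The roughly 479M non-embedding parameters line up with the model_config/478m.json referenced in the training configs below, and ~633M total parameters at fp32 is consistent with the 2,533,545,094-byte pytorch_model.bin above.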
bigram_2_full/model_1000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 1000,
+ "update_step": 1000,
+ "tokens_seen": 639025152,
+ "tokens_seen_before": 638386176,
+ "update_time": 3.281445264816284,
+ "wandb_id": "7nopmkvs"
+ }
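The training_state.json files record the step counters, token counts, per-update wall-clock time, and the wandb run id (7nopmkvs for the bigram_2_full run). One internal consistency check, as a minimal sketch: the per-update token increment equals total_batch_size × max_length from the training_config.yaml further below, and the same delta appears at every checkpoint.

# Quick check that the checkpoint counters are self-consistent:
# tokens_seen - tokens_seen_before == total_batch_size * max_length.
state = {"tokens_seen": 639025152, "tokens_seen_before": 638386176}
per_update = state["tokens_seen"] - state["tokens_seen_before"]
assert per_update == 624 * 1024 == 638976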
bigram_2_full/model_11000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_11000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 11000,
+ "update_step": 11000,
+ "tokens_seen": 7028785152,
+ "tokens_seen_before": 7028146176,
+ "update_time": 3.2798073291778564,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_13000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_13000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 13000,
+ "update_step": 13000,
+ "tokens_seen": 8306737152,
+ "tokens_seen_before": 8306098176,
+ "update_time": 3.2799124717712402,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_14000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 14000,
+ "update_step": 14000,
+ "tokens_seen": 8945713152,
+ "tokens_seen_before": 8945074176,
+ "update_time": 3.2842938899993896,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_2000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_2000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 2000,
+ "update_step": 2000,
+ "tokens_seen": 1278001152,
+ "tokens_seen_before": 1277362176,
+ "update_time": 3.285855293273926,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_3000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_3000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 3000,
+ "update_step": 3000,
+ "tokens_seen": 1916977152,
+ "tokens_seen_before": 1916338176,
+ "update_time": 3.2819085121154785,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_4000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_4000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 4000,
+ "update_step": 4000,
+ "tokens_seen": 2555953152,
+ "tokens_seen_before": 2555314176,
+ "update_time": 3.283513307571411,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_5000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_5000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 5000,
+ "update_step": 5000,
+ "tokens_seen": 3194929152,
+ "tokens_seen_before": 3194290176,
+ "update_time": 3.27878475189209,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/model_9000/model_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
bigram_2_full/model_9000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 9000,
+ "update_step": 9000,
+ "tokens_seen": 5750833152,
+ "tokens_seen_before": 5750194176,
+ "update_time": 3.2838311195373535,
+ "wandb_id": "7nopmkvs"
+ }
bigram_2_full/training_config.yaml ADDED
@@ -0,0 +1,44 @@
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adjust_step: 0
+ autoresume: false
+ batch_size: 6
+ clip_grad_norm: 1.0
+ comment: null
+ cycle_length: null
+ dtype: bfloat16
+ emb_freeze: null
+ eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+ eval_every: 1000
+ gradient_accumulation: 13
+ keep_checkpoints: null
+ layer_freeze: null
+ load_optimizer_state_on_resume: true
+ lr: 0.0004
+ max_length: 1024
+ max_train_tokens: null
+ min_lr_ratio: 0.1
+ model_config: model_config/478m.json
+ model_name_or_path: null
+ model_revision: null
+ num_training_steps: 15000
+ optimizer: Adam
+ restart_warmup_steps: null
+ resume_from: null
+ run_name: earnest-sky-22
+ save_dir: checkpoints/earnest-sky-22
+ save_every: 1000
+ scheduler: cosine
+ seed: 0
+ shuffle: true
+ skip_batches: !!set {}
+ tags:
+ - 396m-for-680m
+ total_batch_size: 624
+ train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+ training_config: training_config/two_stage/478m_resume_unfreeze.yaml
+ wandb_watch: true
+ warmed_up_model: /lee_embedding/checkpoints/tough-snowflake-18/final_model/
+ warmup_steps: 1500
+ weight_decay: 0.0
+ workers: 8
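This is the full training configuration for the bigram_2_full run (wandb run earnest-sky-22): the 478M model config, Adam with a cosine schedule and 1,500 warmup steps, bf16, 15,000 updates, resumed from the warmed-up tough-snowflake-18 checkpoint. A few derived quantities, as a sketch (the data-parallel world size is not stored in the file and is inferred here from the batch-size arithmetic):

# Derived quantities from the YAML above; world_size is an inference, not a config key.
batch_size, grad_accum, total_batch_size = 6, 13, 624
max_length, num_steps = 1024, 15000

world_size = total_batch_size // (batch_size * grad_accum)  # -> 8 (inferred)
tokens_per_update = total_batch_size * max_length           # 638,976
total_tokens = tokens_per_update * num_steps                 # ~9.58B, close to tokens_seen at step 15000
print(world_size, tokens_per_update, f"{total_tokens/1e9:.2f}B")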
first_attention_2_attention_unfreeze/final_model/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_1000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_1000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 1000,
+ "update_step": 1000,
+ "tokens_seen": 639025152,
+ "tokens_seen_before": 638386176,
+ "update_time": 2.9259252548217773,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_10000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_10000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 10000,
+ "update_step": 10000,
+ "tokens_seen": 6389809152,
+ "tokens_seen_before": 6389170176,
+ "update_time": 2.925158977508545,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_11000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_11000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 11000,
+ "update_step": 11000,
+ "tokens_seen": 7028785152,
+ "tokens_seen_before": 7028146176,
+ "update_time": 2.925471067428589,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_12000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_12000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 12000,
+ "update_step": 12000,
+ "tokens_seen": 7667761152,
+ "tokens_seen_before": 7667122176,
+ "update_time": 2.9245717525482178,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_13000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 13000,
+ "update_step": 13000,
+ "tokens_seen": 8306737152,
+ "tokens_seen_before": 8306098176,
+ "update_time": 2.9278290271759033,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_14000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_14000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 14000,
+ "update_step": 14000,
+ "tokens_seen": 8945713152,
+ "tokens_seen_before": 8945074176,
+ "update_time": 2.924705743789673,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_15000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_15000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 15000,
+ "update_step": 15000,
+ "tokens_seen": 9584689152,
+ "tokens_seen_before": 9584050176,
+ "update_time": 2.924987554550171,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_2000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_2000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 2000,
+ "update_step": 2000,
+ "tokens_seen": 1278001152,
+ "tokens_seen_before": 1277362176,
+ "update_time": 2.923201322555542,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_3000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_3000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 3000,
+ "update_step": 3000,
+ "tokens_seen": 1916977152,
+ "tokens_seen_before": 1916338176,
+ "update_time": 2.9246528148651123,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_4000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_4000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 4000,
+ "update_step": 4000,
+ "tokens_seen": 2555953152,
+ "tokens_seen_before": 2555314176,
+ "update_time": 2.9246957302093506,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_5000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_5000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 5000,
+ "update_step": 5000,
+ "tokens_seen": 3194929152,
+ "tokens_seen_before": 3194290176,
+ "update_time": 2.9258697032928467,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_6000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_6000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 6000,
+ "update_step": 6000,
+ "tokens_seen": 3833905152,
+ "tokens_seen_before": 3833266176,
+ "update_time": 2.92516827583313,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_7000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_7000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 7000,
+ "update_step": 7000,
+ "tokens_seen": 4472881152,
+ "tokens_seen_before": 4472242176,
+ "update_time": 2.925605058670044,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_8000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_8000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 8000,
+ "update_step": 8000,
+ "tokens_seen": 5111857152,
+ "tokens_seen_before": 5111218176,
+ "update_time": 2.924616575241089,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/model_9000/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "architectures": [
+ "LLaMAForCausalLM"
+ ],
+ "bos_token_id": 50256,
+ "eos_token_id": 50256,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "intermediate_size": 5376,
+ "initializer_range": 0.02,
+ "max_sequence_length": 1024,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 14,
+ "rms_norm_eps": 1e-05,
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50257
+ }
first_attention_2_attention_unfreeze/model_9000/training_state.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "global_step": 9000,
+ "update_step": 9000,
+ "tokens_seen": 5750833152,
+ "tokens_seen_before": 5750194176,
+ "update_time": 2.9271013736724854,
+ "wandb_id": "0ab5p6ah"
+ }
first_attention_2_attention_unfreeze/training_config.yaml ADDED
@@ -0,0 +1,47 @@
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adjust_step: 0
+ autoresume: false
+ batch_size: 6
+ clip_grad_norm: 1.0
+ comment: null
+ cycle_length: null
+ dtype: bfloat16
+ emb_freeze: null
+ eval_dataset_path: /work01/yanokazuki/fineweb/valid_data_gpt2/
+ eval_every: 1000
+ first_attention: false
+ first_attention_resume: true
+ gradient_accumulation: 13
+ keep_checkpoints: null
+ layer_freeze: null
+ layer_freeze_2: false
+ load_optimizer_state_on_resume: true
+ lr: 0.0004
+ max_length: 1024
+ max_train_tokens: null
+ min_lr_ratio: 0.1
+ model_config: model_config/478m.json
+ model_name_or_path: null
+ model_revision: null
+ num_training_steps: 15000
+ optimizer: Adam
+ restart_warmup_steps: null
+ resume_from: null
+ run_name: first_attention_resume_unfreeze
+ save_dir: checkpoints/first_attention_resume_unfreeze
+ save_every: 1000
+ scheduler: cosine
+ seed: 0
+ shuffle: true
+ skip_batches: !!set {}
+ tags:
+ - 396m-for-680m
+ total_batch_size: 624
+ train_dataset_path: /work01/yanokazuki/fineweb/train_data_gpt2/
+ training_config: training_config/two_stage/478m_first_attention_resume_unfreeze.yaml
+ wandb_watch: true
+ warmed_up_model: null
+ warmup_steps: 1500
+ weight_decay: 0.0
+ workers: 8
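For comparison, this second config (wandb run 0ab5p6ah) is nearly identical to the bigram_2_full one: same model config, optimizer, schedule, data paths, and batch arithmetic. The visible differences are the added first_attention, first_attention_resume, and layer_freeze_2 flags, the run name and save dir, the referenced training_config yaml, and warmed_up_model being null instead of the tough-snowflake-18 checkpoint. A small sketch to confirm that key by key, assuming both files are available locally at the paths used in this repo:

# Sketch: compare the two training configs to list what actually differs.
import yaml

with open("bigram_2_full/training_config.yaml") as f:
    a = yaml.safe_load(f)
with open("first_attention_2_attention_unfreeze/training_config.yaml") as f:
    b = yaml.safe_load(f)

for key in sorted(set(a) | set(b)):
    if a.get(key) != b.get(key):
        print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")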