thomwolf (HF staff) committed
Commit 0c6f487
1 parent: 5d8e8eb

add pretrained model

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. .gitignore +1 -0
  2. config_mistral.py +70 -0
  3. config_mistral_7b.py +88 -0
  4. config_mistral_7b.yaml +53 -0
  5. config_tiny_mistral.py +7 -42
  6. config_tiny_mistral.yaml +92 -0
  7. convert_trfrs_to_brrr.py +262 -0
  8. modeling_mistral.py +50 -27
  9. pretrained/Mistral-7B-v0.1/checkpoint_metadata.json +9 -0
  10. pretrained/Mistral-7B-v0.1/config.yaml +53 -0
  11. pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  12. pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  13. pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors +3 -0
  14. pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  15. pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  16. pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  17. pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  18. pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  19. pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors +3 -0
  20. pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  21. pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  22. pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  23. pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  24. pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  25. pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/input_layernorm/model_weight.safetensors +3 -0
  26. pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  27. pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  28. pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  29. pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  30. pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  31. pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/input_layernorm/model_weight.safetensors +3 -0
  32. pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  33. pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  34. pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  35. pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  36. pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  37. pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/input_layernorm/model_weight.safetensors +3 -0
  38. pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  39. pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  40. pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  41. pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  42. pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  43. pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/input_layernorm/model_weight.safetensors +3 -0
  44. pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  45. pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  46. pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  47. pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  48. pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
  49. pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/input_layernorm/model_weight.safetensors +3 -0
  50. pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
config_mistral.py ADDED
@@ -0,0 +1,70 @@
+ """ Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.
+
+ Usage:
+ ```
+ python config_mistral.py
+ ```
+ """
+ import os
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from nanotron.config import (
+     CheckpointsArgs,
+     Config,
+     DataArgs,
+     GeneralArgs,
+     LoggingArgs,
+     LRSchedulerArgs,
+     ModelArgs,
+     OptimizerArgs,
+     ParallelismArgs,
+     PretrainDatasetsArgs,
+     RandomInit,
+     TokenizerArgs,
+     TokensArgs,
+ )
+ from nanotron.logging import human_format
+
+
+ @dataclass
+ class MistralConfig:
+     """Configuration for a MISTRAL model
+
+     Be careful to keep the typing coherent, as we use it to reconstruct the model from YAML
+     """
+
+     attn_pdrop: float = 0.0
+     bos_token_id: int = 1
+     eos_token_id: int = 2
+     hidden_act: str = "silu"
+     hidden_size: int = 4096
+     initializer_range: float = 0.02
+     intermediate_size: int = 14336
+     is_mistral_config: bool = True  # We use this to help differentiate models in yaml/python conversion
+     max_position_embeddings: int = 32768
+     num_attention_heads: int = 32
+     num_hidden_layers: int = 32
+     num_key_value_heads: Optional[int] = 8
+     pad_token_id: Optional[int] = None
+     pretraining_tp: int = 1
+     rms_norm_eps: float = 1e-05
+     rope_theta: float = 10000.0
+     sliding_window_size: int = 4096
+     tie_word_embeddings: bool = False
+     use_cache: bool = True
+     vocab_size: int = 32000
+
+     def __post_init__(self):
+         # for backward compatibility
+         if self.num_key_value_heads is None:
+             self.num_key_value_heads = self.num_attention_heads
+
+ def get_num_params(model_config: MistralConfig) -> int:
+     num_params = model_config.vocab_size * model_config.hidden_size * 2 + \
+         model_config.num_hidden_layers * (
+             3 * model_config.hidden_size * model_config.intermediate_size
+             + 2 * model_config.hidden_size * model_config.hidden_size
+             + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads))
+         )
+     return num_params
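
As a quick, illustrative cross-check of `get_num_params` (an added note, not part of the commit): plugging the default Mistral-7B values into the same formula gives roughly 7.24B parameters, which matches the size of `mistralai/Mistral-7B-v0.1`.

```python
# Rough parameter count for the default MistralConfig above (illustration only).
vocab_size, hidden_size, intermediate_size = 32000, 4096, 14336
num_hidden_layers, num_attention_heads, num_key_value_heads = 32, 32, 8

embeddings_and_lm_head = vocab_size * hidden_size * 2  # untied input embeddings + lm_head
per_layer = (
    3 * hidden_size * intermediate_size  # gate_proj, up_proj, down_proj
    + 2 * hidden_size * hidden_size  # q_proj and o_proj
    + 2 * hidden_size * (hidden_size // (num_attention_heads // num_key_value_heads))  # k_proj and v_proj (GQA)
)
print(embeddings_and_lm_head + num_hidden_layers * per_layer)  # 7241465856, i.e. ~7.24B
```

(The formula ignores the RMSNorm weights, which contribute only a few hundred thousand parameters.)
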
config_mistral_7b.py ADDED
@@ -0,0 +1,88 @@
+ """ Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.
+
+ Usage:
+ ```
+ python config_mistral_7b.py
+ ```
+ """
+ import os
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from nanotron.config import (
+     CheckpointsArgs,
+     Config,
+     DataArgs,
+     GeneralArgs,
+     LoggingArgs,
+     LRSchedulerArgs,
+     ModelArgs,
+     OptimizerArgs,
+     ParallelismArgs,
+     PretrainDatasetsArgs,
+     RandomInit,
+     TokenizerArgs,
+     TokensArgs,
+ )
+ from nanotron.logging import human_format
+
+ from config_mistral import MistralConfig, get_num_params
+
+
+ MODEL_CONFIG = MistralConfig(
+     # Config for Mistral 7B
+     attn_pdrop=0.0,
+     bos_token_id=1,
+     eos_token_id=2,
+     hidden_act="silu",
+     hidden_size=4096,
+     initializer_range=0.02,
+     intermediate_size=14336,
+     max_position_embeddings=32768,
+     num_attention_heads=32,
+     num_hidden_layers=32,
+     num_key_value_heads=8,
+     pretraining_tp=1,
+     rms_norm_eps=1e-05,
+     rope_theta=10000.0,
+     sliding_window_size=4096,
+     tie_word_embeddings=False,
+     use_cache=True,
+     vocab_size=32000,
+ )
+
+ num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
+
+ print(f"Model has {num_params} parameters")
+
+ PARALLELISM = ParallelismArgs(
+     dp=2,
+     pp=2,
+     tp=2,
+     pp_engine="1f1b",
+     tp_mode="REDUCE_SCATTER",
+     tp_linear_async_communication=True,
+     recompute_granularity="selective",
+ )
+
+ CONFIG = Config(
+     general=GeneralArgs(project="mistralai", run="Mistral-7B-v0.1", seed=42),
+     checkpoints=None,
+     parallelism=PARALLELISM,
+     model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
+     tokenizer=TokenizerArgs("mistralai/Mistral-7B-v0.1"),
+     optimizer=None,
+     logging=None,
+     tokens=None,
+     data=None,
+     profiler=None,
+ )
+
+ if __name__ == "__main__":
+     file_path = os.path.abspath(__file__)
+
+     file_path = file_path.replace(".py", ".yaml")
+     # Save config as YAML file
+     CONFIG.save_as_yaml(file_path)
+
+     # You can now train a model with this config using `/run_train.py`
config_mistral_7b.yaml ADDED
@@ -0,0 +1,53 @@
+ checkpoints: null
+ data: null
+ general:
+   benchmark_csv_path: null
+   consumed_train_samples: null
+   ignore_sanity_checks: false
+   project: mistralai
+   run: Mistral-7B-v0.1
+   seed: 42
+   step: null
+ logging: null
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     attn_pdrop: 0.0
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 4096
+     initializer_range: 0.02
+     intermediate_size: 14336
+     is_mistral_config: true
+     max_position_embeddings: 32768
+     num_attention_heads: 32
+     num_hidden_layers: 32
+     num_key_value_heads: 8
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_theta: 10000.0
+     sliding_window_size: 4096
+     tie_word_embeddings: false
+     use_cache: true
+     vocab_size: 32000
+ optimizer: null
+ parallelism:
+   dp: 2
+   pp: 2
+   pp_engine: 1f1b
+   recompute_granularity: SELECTIVE
+   tp: 2
+   tp_linear_async_communication: true
+   tp_mode: REDUCE_SCATTER
+ profiler: null
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
+   tokenizer_revision: null
+ tokens: null
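
Side note (not part of the commit): the generated file is plain YAML, so it can be sanity-checked without importing nanotron, for example with PyYAML. The path below assumes the snippet is run from the directory containing `config_mistral_7b.yaml`.

```python
# Minimal sketch: load the generated config and check a few Mistral-specific fields.
import yaml  # pip install pyyaml

with open("config_mistral_7b.yaml") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]["model_config"]
assert model_cfg["num_key_value_heads"] == 8      # grouped-query attention
assert model_cfg["sliding_window_size"] == 4096   # sliding-window attention
print(cfg["parallelism"])  # {'dp': 2, 'pp': 2, 'tp': 2, 'pp_engine': '1f1b', ...}
```
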
config_tiny_mistral.py CHANGED
@@ -26,41 +26,12 @@ from nanotron.config import (
  )
  from nanotron.logging import human_format

-
- @dataclass
- class MistralConfig:
-     """Configuration for a MISTRAL model
-
-     Be careful on having a coherent typing as we use it to reconstruct the model from yaml
-     """
-
-     bos_token_id: int = 1
-     eos_token_id: int = 2
-     hidden_act: str = "silu"
-     hidden_size: int = 4096
-     initializer_range: float = 0.02
-     intermediate_size: int = 11008
-     is_mistral_config: bool = True  # We use this help differentiate models in yaml/python conversion
-     max_position_embeddings: int = 2048
-     num_attention_heads: int = 32
-     num_hidden_layers: int = 32
-     num_key_value_heads: Optional[int] = None
-     pad_token_id: Optional[int] = None
-     pretraining_tp: int = 1
-     rms_norm_eps: float = 1e-6
-     rope_scaling: Optional[dict] = None
-     tie_word_embeddings: bool = False
-     use_cache: bool = True
-     vocab_size: int = 32000
-
-     def __post_init__(self):
-         # for backward compatibility
-         if self.num_key_value_heads is None:
-             self.num_key_value_heads = self.num_attention_heads
+ from config_mistral import MistralConfig, get_num_params


  model_config = MistralConfig(
      # Config for a tiny model model with 1.62M parameters
+     attn_pdrop=0.0,
      bos_token_id=1,
      eos_token_id=2,
      hidden_act="silu",
@@ -73,20 +44,13 @@ model_config = MistralConfig(
      num_key_value_heads=4,
      pretraining_tp=1,
      rms_norm_eps=1e-05,
-     rope_scaling=None,
+     rope_theta=10000.0,
      tie_word_embeddings=True,
      use_cache=True,
      vocab_size=256,
  )

- num_params = human_format(
-     model_config.vocab_size * model_config.hidden_size * 2
-     + model_config.num_hidden_layers
-     * (
-         3 * model_config.hidden_size * model_config.intermediate_size
-         + 4 * model_config.hidden_size * model_config.hidden_size
-     )
- ).replace(".", "p")
+ num_params = human_format(get_num_params(model_config)).replace(".", "p")

  print(f"Model has {num_params} parameters")

@@ -141,9 +105,10 @@ config = Config(
  )

  if __name__ == "__main__":
-     dir = os.path.dirname(__file__)
+     file_path = os.path.abspath(__file__)

+     file_path = file_path.replace(".py", ".yaml")
      # Save config as YAML file
-     config.save_as_yaml(f"{dir}/config_tiny_mistral.yaml")
+     config.save_as_yaml(file_path)

      # You can now train a model with this config using `/run_train.py`
config_tiny_mistral.yaml ADDED
@@ -0,0 +1,92 @@
+ checkpoints:
+   checkpoint_interval: 10
+   checkpoints_path: /fsx/thomwolf/github/textbooks-proj/brrr/models/checkpoints
+   checkpoints_path_is_shared_file_system: false
+   resume_checkpoint_path: null
+   save_initial_state: false
+ data:
+   dataset:
+     dataset_overwrite_cache: false
+     dataset_processing_num_proc_per_process: 1
+     hf_dataset_config_name: null
+     hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+     hf_dataset_splits: train
+     text_column_name: completion
+   num_loading_workers: 1
+   seed: 42
+ general:
+   benchmark_csv_path: null
+   consumed_train_samples: null
+   ignore_sanity_checks: false
+   project: debug
+   run: tiny_mistral
+   seed: 42
+   step: null
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     attn_pdrop: 0.0
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 16
+     initializer_range: 0.02
+     intermediate_size: 64
+     is_mistral_config: true
+     max_position_embeddings: 256
+     num_attention_heads: 4
+     num_hidden_layers: 2
+     num_key_value_heads: 4
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_theta: 10000.0
+     sliding_window_size: 4096
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 256
+ optimizer:
+   accumulate_grad_in_fp32: true
+   adam_beta1: 0.9
+   adam_beta2: 0.95
+   adam_eps: 1.0e-08
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0003
+     lr_decay_steps: 8
+     lr_decay_style: cosine
+     lr_warmup_steps: 2
+     lr_warmup_style: linear
+     min_decay_lr: 1.0e-05
+   torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 0
+ parallelism:
+   dp: 2
+   pp: 2
+   pp_engine: 1f1b
+   recompute_granularity: SELECTIVE
+   tp: 2
+   tp_linear_async_communication: true
+   tp_mode: REDUCE_SCATTER
+ profiler: null
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: gpt2
+   tokenizer_revision: null
+ tokens:
+   batch_accumulation_per_replica: 1
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 2
+   sequence_length: 32
+   train_steps: 10
+   val_check_interval: -1
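
For orientation (an added note, not part of the commit): assuming the usual definition of global batch size as `micro_batch_size * batch_accumulation_per_replica * dp`, this debug config touches only a trivial amount of data, which is the point of a smoke-test run.

```python
# Back-of-the-envelope size of the tiny_mistral smoke test (illustration only).
micro_batch_size = 2
batch_accumulation_per_replica = 1
dp = 2
sequence_length = 32
train_steps = 10

global_batch_size = micro_batch_size * batch_accumulation_per_replica * dp  # 4 sequences per step
tokens_per_step = global_batch_size * sequence_length                       # 128 tokens per step
print(tokens_per_step * train_steps)                                        # 1280 tokens in total
```
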
convert_trfrs_to_brrr.py ADDED
@@ -0,0 +1,262 @@
+ # ruff: noqa: E402
+ """
+ This module converts a transformers LlamaForCausalLM to a brrr model
+
+ Command:
+     torchrun --nproc_per_node=1 convert_trfrs_to_brrr.py \
+         --model_name mistralai/Mistral-7B-v0.1 \
+         --save_path ./pretrained/Mistral-7B-v0.1
+ """
+ import argparse
+ import sys
+ from dataclasses import asdict
+ from pathlib import Path
+ from typing import Dict, List
+
+ import torch
+
+ from brrr.trainer import DistributedTrainer
+
+ sys.path.append(Path(__file__).parent.parent.as_posix())
+ import os
+
+ from nanotron.parallel.parameters import NanotronParameter, sanity_check
+ from nanotron.parallel.pipeline_parallel.engine import (
+     AllForwardAllBackwardPipelineEngine,
+ )
+ from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode
+ from transformers import MistralConfig as MistralConfig_trfs, MistralForCausalLM
+
+ import nanotron.distributed as dist
+ from nanotron.config import ParallelismArgs, RecomputeGranularity
+ from nanotron.parallel.context import ParallelContext
+ from nanotron.models import build_model
+ from nanotron.trainer import mark_tied_parameters
+ from nanotron.serialize import save_meta, save_weights, save
+
+ from modeling_mistral import MistralForTraining
+ from config_mistral_7b import PARALLELISM as PARALLELISM_BRRR, CONFIG as CONFIG_BRRR
+
+
+ def get_args():
+     parser = argparse.ArgumentParser(description="Convert transformers weights to brrr weights")
+     parser.add_argument("--model_name", type=str, default="mistralai/Mistral-7B-v0.1")
+     parser.add_argument("--save_path", type=str, default="pretrained/Mistral-7B-v0.1")
+     parser.add_argument("--dp", type=int, default=1)
+     parser.add_argument("--pp", type=int, default=1)
+     parser.add_argument("--tp", type=int, default=1)
+     return parser.parse_args()
+
+
+ def permute_for_rotary(tensor, num_heads, per_head_hidden_size, hidden_size):
+     return (
+         tensor.view(num_heads, 2, per_head_hidden_size // 2, hidden_size)
+         .transpose(1, 2)
+         .contiguous()
+         .view(num_heads * per_head_hidden_size, hidden_size)
+     )
+
+
+ def get_transformers_weight(
+     name: str, ref_module_state_dict: Dict[str, torch.Tensor], ref_module: MistralForCausalLM, get_grad: bool = False
+ ) -> torch.Tensor:
+     """From our brrr implementation, we get the equivalent tensor in transformers implementation"""
+     config = ref_module.config
+     brrr_prefix = "model."
+     assert name.startswith(brrr_prefix)
+     name = name[len(brrr_prefix) :]
+
+     path = name.split(".")
+     path.remove("pp_block")
+     name = ".".join(path)
+
+     if get_grad is False:
+
+         def get_tensor(path: str):
+             return ref_module_state_dict[path]
+
+         def get_tensors(path: List[str]):
+             return [get_tensor(p) for p in path]
+
+     else:
+
+         def get_tensor(path: str):
+             weight = ref_module.get_parameter(path)
+             return weight.grad
+
+         def get_tensors(path: List[str]):
+             return [get_tensor(p) for p in path]
+
+     if name == "token_position_embeddings.token_embedding.weight":
+         return get_tensor("model.embed_tokens.weight")
+
+     elif name == "lm_head.weight":
+         # This is only used when weights are not shared
+         return get_tensor("lm_head.weight")
+
+     elif name == "final_layer_norm.weight":
+         return get_tensor("model.norm.weight")
+
+     if path[0] == "decoder":
+         transformer_path = ["model"] + ["layers"] + [path[1]]
+
+         if path[2] == "attn":
+             path[2] = "self_attn"
+
+         if path[2] == "ff":
+             path[2] = "mlp"
+
+         if path[3] == "qkv_proj":
+             proj_names = ["q_proj", "k_proj", "v_proj"]
+             tensor_list = get_tensors(
+                 [".".join(transformer_path + path[2:3] + [proj_name] + path[4:]) for proj_name in proj_names]
+             )
+             # Permute q/k
+             per_head_hidden_size = config.hidden_size // config.num_attention_heads
+             # Permute q
+             print(f"Permuting q {tensor_list[0].shape}")
+             tensor_list[0] = permute_for_rotary(
+                 tensor=tensor_list[0],
+                 num_heads=config.num_attention_heads,
+                 per_head_hidden_size=per_head_hidden_size,
+                 hidden_size=config.hidden_size,
+             )
+             # Permute k
+             print(f"Permuting k {tensor_list[1].shape}")
+             tensor_list[1] = permute_for_rotary(
+                 tensor=tensor_list[1],
+                 num_heads=config.num_key_value_heads,
+                 per_head_hidden_size=per_head_hidden_size,
+                 hidden_size=config.hidden_size,
+             )
+             return torch.cat(tensor_list, dim=0)
+
+         if path[3] == "gate_up_proj":
+             tensor_list = get_tensors(
+                 [
+                     ".".join(transformer_path + path[2:3] + [proj_name] + path[4:])
+                     for proj_name in ["gate_proj", "up_proj"]
+                 ]
+             )
+             return torch.cat(tensor_list, dim=0)
+
+         return get_tensor(".".join(transformer_path + path[2:]))
+
+     else:
+         raise ValueError(f"Couldn't find transformer equivalent of {name}")
+
+
+ def convert_trfrs_to_brrr(dp, pp, tp, model_name="huggyllama/llama-7b", save_path="pretrained/llama-7b"):
+     # check that save_path doesn't exist or is empty
+     save_path = Path(save_path)
+     # assert not save_path.exists() or len(list(save_path.iterdir())) == 0, f"save_path {save_path} is not empty"
+
+     parallel_config = PARALLELISM_BRRR
+
+     parallel_config.dp = dp
+     parallel_config.pp = pp
+     parallel_config.tp = tp
+
+     # Initialise all process groups
+     parallel_context = ParallelContext(
+         data_parallel_size=parallel_config.dp,
+         pipeline_parallel_size=parallel_config.pp,
+         tensor_parallel_size=parallel_config.tp,
+     )
+     # params
+     dtype = torch.bfloat16  # Flash attention doesn't support fp32
+
+     # Initialise brrr model
+     model_config_brrr = CONFIG_BRRR.model.model_config
+
+     model = build_model(
+         model_builder=lambda: MistralForTraining(
+             config=model_config_brrr,
+             parallel_context=parallel_context,
+             parallel_config=parallel_config,
+             random_states=None,
+         ),
+         dtype=dtype,
+         parallel_context=parallel_context,
+         device=torch.device("cpu"),
+     )
+
+     # Initialise transformers model
+     device_map = {}
+     current_pp_rank = dist.get_rank(group=parallel_context.pp_pg)
+     device_map["model.embed_tokens"] = (
+         model.model.token_position_embeddings.rank
+         if current_pp_rank == model.model.token_position_embeddings.rank
+         else "meta"
+     )
+     for i in range(model_config_brrr.num_hidden_layers):
+         device_map[f"model.layers.{i}"] = (
+             model.model.decoder[i].rank if current_pp_rank == model.model.decoder[i].rank else "meta"
+         )
+     device_map["model.norm"] = (
+         model.model.final_layer_norm.rank if current_pp_rank == model.model.final_layer_norm.rank else "meta"
+     )
+     device_map["lm_head"] = model.model.lm_head.rank if current_pp_rank == model.model.lm_head.rank else "meta"
+     model_ref = MistralForCausalLM.from_pretrained(model_name, torch_dtype=dtype, device_map=device_map)
+
+     # Copy weights from trfrs to brrr
+     ref_state_dict = model_ref.state_dict()
+     for name, param in model.named_parameters():
+         print(f"Syncing {name}")
+         ref_param = get_transformers_weight(name=name, ref_module_state_dict=ref_state_dict, ref_module=model_ref)
+
+         param_is_tp_sharded = (
+             isinstance(param, NanotronParameter)
+             and param.is_sharded
+             and parallel_context.world_ranks_to_pg[param.get_sharded_info().global_ranks] == parallel_context.tp_pg
+         )
+
+         if param_is_tp_sharded:
+             sharded_info = param.get_sharded_info()
+             # copy param data (not just the reference)
+             with torch.no_grad():
+                 for local_global_slices_pair in sharded_info.local_global_slices_pairs:
+                     local_slices = local_global_slices_pair.local_slices
+                     global_slices = local_global_slices_pair.global_slices
+                     param[local_slices].copy_(ref_param[global_slices])
+         else:
+             assert (
+                 ref_param.shape == param.shape
+             ), f"Parameter shapes don't match for {name}\n{ref_param.shape} != {param.shape}"
+             # copy param data (not just the reference)
+             with torch.no_grad():
+                 param.copy_(ref_param)
+         ref_param = None
+         # torch.cuda.empty_cache()
+
+     # TODO @nouamanetazi: assert weights are the same
+     # Marks parameters as NanotronParameters
+     mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config)
+
+     sanity_check(root_module=model)
+
+     checkpoint_metadata = {
+         "last_train_step": 0,
+         "consumed_train_samples": 0,
+     }
+     save(config=CONFIG_BRRR, model=model, optimizer=None, lr_scheduler=None, parallel_context=parallel_context, root_folder=save_path,
+          should_save_optimizer=False, should_save_lr_scheduler=False, checkpoint_metadata=checkpoint_metadata,
+          sanity_checks=False)
+     # save_weights(model=model, parallel_context=parallel_context, root_folder=save_path)
+     # save_meta(root_folder=save_path, parallel_context=parallel_context, checkpoint_metadata=checkpoint_metadata)
+
+     if dist.get_rank(parallel_context.world_pg) == 0:
+         print(save_path)
+         import json
+
+         with open(save_path / "model_config.json", mode="w") as fo:
+             fo.write(json.dumps(asdict(CONFIG_BRRR.model.model_config), indent=4))
+
+
+ def main():
+     args = get_args()
+     convert_trfrs_to_brrr(**vars(args))
+
+
+ if __name__ == "__main__":
+     main()
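
To make the weight-layout handling above easier to follow, here is a self-contained shape check (an illustration with toy sizes, not part of the commit): `permute_for_rotary` reshuffles the rows of each head so the rotary layout used by transformers lines up with the interleaved rotary embedding used in `modeling_mistral.py`, and the q/k/v matrices are then concatenated into the fused `qkv_proj` weight stored in the brrr checkpoint.

```python
import torch


def permute_for_rotary(tensor, num_heads, per_head_hidden_size, hidden_size):
    # Same reshuffle as in convert_trfrs_to_brrr.py above.
    return (
        tensor.view(num_heads, 2, per_head_hidden_size // 2, hidden_size)
        .transpose(1, 2)
        .contiguous()
        .view(num_heads * per_head_hidden_size, hidden_size)
    )


hidden_size, num_heads, num_kv_heads = 64, 8, 2  # toy sizes, not Mistral-7B
head_dim = hidden_size // num_heads

q = torch.randn(num_heads * head_dim, hidden_size)     # q_proj.weight
k = torch.randn(num_kv_heads * head_dim, hidden_size)  # k_proj.weight
v = torch.randn(num_kv_heads * head_dim, hidden_size)  # v_proj.weight

q = permute_for_rotary(q, num_heads, head_dim, hidden_size)
k = permute_for_rotary(k, num_kv_heads, head_dim, hidden_size)

qkv = torch.cat([q, k, v], dim=0)  # fused qkv_proj weight
print(qkv.shape)  # torch.Size([96, 64]) == ((num_heads + 2 * num_kv_heads) * head_dim, hidden_size)
```
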
modeling_mistral.py CHANGED
@@ -15,6 +15,7 @@
  """ PyTorch Mistral model.
  """
  from typing import Dict, Optional, Union
+ import inspect

  import torch
  from flash_attn import bert_padding
@@ -46,12 +47,15 @@ from nanotron.parallel.tensor_parallel.nn import (
  )
  from nanotron.random import RandomStates
  from nanotron.utils import checkpoint_method
+ from nanotron.nn.activations import ACT2FN
  from torch import nn
- from transformers import MistralConfig
- from transformers.activations import ACT2FN
+
+ from config_mistral_7b import MistralConfig

  logger = logging.get_logger(__name__)

+ _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_varlen_func).parameters)
+

  class RotaryEmbedding(nn.Module):
      def __init__(self, dim: int, end: int, theta: float = 10000.0):
@@ -189,15 +193,22 @@ class CoreAttention(nn.Module):
          ), f"Hidden size {config.hidden_size} must be divisible by number of attention heads {config.num_attention_heads}."
          self.d_qk = config.hidden_size // config.num_attention_heads
          self.d_v = config.hidden_size // config.num_attention_heads
+         self.dropout = config.attn_pdrop

          self.checkpoint_attention = False  # Because flash_attn already does checkpointing

+         if config.sliding_window_size is not None:
+             assert (
+                 _flash_supports_window_size
+             ), "Current version of flash-attn doesn't support sliding window: `pip install flash-attn>=2.3`"
+         self.sliding_window_size = config.sliding_window_size  # if layer_idx not in config.global_attn_layers else None
+
      @checkpoint_method(attr_name="checkpoint_attention")
      def forward(
          self,
-         query_states: torch.Tensor,  # [batch_size * q_length, n_local_q_heads, inner_dim]
-         key_states: torch.Tensor,  # [batch_size * kv_length, n_local_kv_heads, inner_dim]
-         value_states: torch.Tensor,  # [batch_size * kv_length, n_local_kv_heads, inner_dim]
+         query_states: torch.Tensor,  # [batch_size * q_length, num_heads, inner_dim]
+         key_states: torch.Tensor,  # [batch_size * kv_length, 1, inner_dim]
+         value_states: torch.Tensor,  # [batch_size * kv_length, 1, inner_dim]
          q_sequence_mask: torch.Tensor,  # torch.BoolTensor [batch_size, q_length] (can be broadcasted to that size)
          kv_sequence_mask: torch.Tensor,  # torch.BoolTensor [batch_size, kv_length] (can be broadcasted to that size)
      ):
@@ -218,9 +229,10 @@
              cu_seqlens_k=cu_seqlens_k,
              max_seqlen_q=q_sequence_mask.shape[1],
              max_seqlen_k=kv_sequence_mask.shape[1],
-             dropout_p=0.0,
-             softmax_scale=None,  # This already defaults to the scale I'm interested in
+             dropout_p=self.dropout if self.training else 0.0,
+             softmax_scale=None,  # defaults to 1/sqrt(d_qk)
              causal=causal,
+             window_size=(self.sliding_window_size - 1, 0) if self.sliding_window_size is not None else (-1, -1),
              return_attn_probs=False,
          )

@@ -318,10 +330,11 @@ class CausalSelfAttention(nn.Module, AttachableStore):
          self.rotary_embedding = RotaryEmbedding(
              dim=self.d_qk,
              end=config.max_position_embeddings,
+             theta=config.rope_theta
          )

          # NOTE: Only supported for training (TODO(fmom): position_ids not supported yet)
-         self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, interleaved=True)
+         self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, base=config.rope_theta, interleaved=True)

          self.o_proj = TensorParallelRowLinear(
              config.num_attention_heads * self.d_qk,
@@ -852,7 +865,6 @@ class MistralForTraining(NanotronModel):
          super().__init__()
          import warnings

-         warnings.warn("This is just a Llama Model, not a Mistral one for demo purpose. Please fix implementation")
          self.model = MistralModel(config=config, parallel_context=parallel_context, parallel_config=parallel_config)
          self.loss = PipelineBlock(
              p2p=self.model.p2p,
@@ -1044,12 +1056,13 @@ def get_flops(
      num_layers,
      hidden_size,
      num_heads,
-     num_key_value_heads,
      vocab_size,
      seq_len,
-     ffn_hidden_size,
+     kv_channels=None,
+     ffn_hidden_size=None,
      batch_size=1,
      recompute_granularity=None,
+     glu_activation=False,
  ):
      """Counts flops in an decoder-only model
      Args:
@@ -1066,33 +1079,43 @@
          model_flops: flops in the model (should be independent of the hardware and model implementation)
          hardware_flops: flops in the hardware (actual flops performed on the hardware). Check 6.3 in https://arxiv.org/pdf/2205.05198.pdf
      """
-     if num_key_value_heads is None:
-         num_key_value_heads = num_heads
-     hidden_size_per_head = hidden_size // num_heads
+     if kv_channels is None:
+         assert hidden_size % num_heads == 0
+         kv_channels = hidden_size // num_heads
+     if ffn_hidden_size is None:
+         ffn_hidden_size = 4 * hidden_size
+
      # In the following we mark the reduced dimension with parentheses
      # decoder
-     # self attention
-     ## qkv projection
-     decoder_qkv_proj_flops_fwd = (
-         2 * num_layers * batch_size * seq_len * (hidden_size) * num_heads * hidden_size_per_head
-         + 2 * num_layers * batch_size * seq_len * (hidden_size) * 2 * num_key_value_heads * hidden_size_per_head
-     )
+     # self attention (MQA)
+     ## q projection
+     decoder_q_proj_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * num_heads * kv_channels
+     ## kv projection, shared across heads
+     decoder_kv_proj_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * 2 * kv_channels
      ## qk logits
-     decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (hidden_size_per_head) * seq_len
+     decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * seq_len
+     ### SWA (sliding window attention / local attention)
+     # window_size = 4096
+     # decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * window_size
      ## v logits
-     decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (seq_len) * hidden_size_per_head
+     decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (seq_len) * kv_channels
+     # decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (window_size) * kv_channels
      ## attn out
-     decoder_attn_out_flops_fwd = (
-         2 * num_layers * batch_size * num_heads * seq_len * (hidden_size_per_head) * hidden_size
-     )
+     decoder_attn_out_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * hidden_size
      # FF
      ## 1st layer
-     decoder_ffn_1_flops_fwd = 4 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
+     decoder_ffn_1_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
+     if glu_activation:
+         # 3 matmuls instead of 2 in FFN
+         # ref. https://arxiv.org/pdf/2002.05202.pdf
+         # Used for example in T5 v1.1
+         decoder_ffn_1_flops_fwd = 4 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
      ## 2nd layer
      decoder_ffn_2_flops_fwd = 2 * num_layers * batch_size * seq_len * (ffn_hidden_size) * hidden_size

      decoder_flops_fwd = (
-         decoder_qkv_proj_flops_fwd
+         decoder_q_proj_flops_fwd
+         + decoder_kv_proj_flops_fwd
          + decoder_qk_logits_flops_fwd
          + decoder_v_logits_flops_fwd
          + decoder_attn_out_flops_fwd
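
A note on the sliding-window change above (added here for context, not part of the commit): flash-attn's `window_size` argument takes a `(left, right)` pair, with `(-1, -1)` meaning no window, so `(self.sliding_window_size - 1, 0)` combined with `causal=True` lets each query attend to itself plus at most `sliding_window_size - 1` previous tokens. The dense-mask equivalent looks like this:

```python
import torch


def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    # True where attention is allowed: causal, and no further back than `window - 1` positions.
    i = torch.arange(seq_len).unsqueeze(1)  # query positions
    j = torch.arange(seq_len).unsqueeze(0)  # key positions
    return (j <= i) & (j > i - window)


print(sliding_window_causal_mask(seq_len=6, window=3).int())
# tensor([[1, 0, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 0, 0, 0],
#         [0, 1, 1, 1, 0, 0],
#         [0, 0, 1, 1, 1, 0],
#         [0, 0, 0, 1, 1, 1]])
```
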
pretrained/Mistral-7B-v0.1/checkpoint_metadata.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "dp": 1,
+     "metas": {
+         "consumed_train_samples": 0,
+         "last_train_step": 0
+     },
+     "tp": 1,
+     "version": "1.2"
+ }
pretrained/Mistral-7B-v0.1/config.yaml ADDED
@@ -0,0 +1,53 @@
+ checkpoints: null
+ data: null
+ general:
+   benchmark_csv_path: null
+   consumed_train_samples: null
+   ignore_sanity_checks: false
+   project: mistralai
+   run: Mistral-7B-v0.1
+   seed: 42
+   step: null
+ logging: null
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     attn_pdrop: 0.0
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 4096
+     initializer_range: 0.02
+     intermediate_size: 14336
+     is_mistral_config: true
+     max_position_embeddings: 32768
+     num_attention_heads: 32
+     num_hidden_layers: 32
+     num_key_value_heads: 8
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_theta: 10000.0
+     sliding_window_size: 4096
+     tie_word_embeddings: false
+     use_cache: true
+     vocab_size: 32000
+ optimizer: null
+ parallelism:
+   dp: 1
+   pp: 1
+   pp_engine: 1f1b
+   recompute_granularity: SELECTIVE
+   tp: 1
+   tp_linear_async_communication: true
+   tp_mode: REDUCE_SCATTER
+ profiler: null
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
+   tokenizer_revision: null
+ tokens: null
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e6ef1be3d2daa611724f02567159bf507c9a9ea276d9771387a01f4942cafb6
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3d4484e1f9505c97b6ac37472d9526e95470e6aef462fec6ae461b63e4ff77a
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c286c58dfce1f3f030c13b90c41c831d05c4323da3d50e23fe434f38f81535b
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d69d412c683fc926a26a71536dd2530877994cfa6e4e9ae3f3a31f6861596b0
+ size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6148707c761016f313ee2ada71e15b9eb977878148fa382806eea4ef30a145e6
+ size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05588e50e8fafc16c332e2f7a3d3830c9e59d29c35858d439a98ba4e418eba78
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cffeb63cbb1f271fd7ab983b618dfe4a4fc2b6b3763b9332fc324d378207210d
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a1263d75614c4a9710ebc5a57fdec732b9348c1f57ace1887ce296e1805b529
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71cd7738481e6bcbacbc76ce206545fb2fe6d995f7e1a733b408c3fe92f7356c
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31de05f9c50d9e94fe67936a973c86840f82ed2aad1494806baa81df8bbf9bf8
+ size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70761ee840fbdc950501814ff397c72e9d8bbc7be2030329f391c12eb5b73a0f
+ size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1846ddc1c4ca9d8e03184f2fa34911398202f0edc310df5ea408a323a5f23ee8
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59d82cc1e5ec1f364e7401e17e58d53f62a39658799aeb4902060236ebb0cb60
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:117c7832cefd9a767282b27c852f00ed4ce7888a8abb7e2f9257a0b2fed60608
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c93dc35263c0efa22d22795777c009e4f9365cf1ef413b69880d14433d1069e8
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad3fefd66e98df8fee62bd0fe451b18ca1a14545b72e570d499dce0464368b81
+ size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6346f7c23987a4c529ac1b63b5f6f56b4392981ffcaaf2cb84cf2bf5b2bc36a7
+ size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6cd70662e84b3d81b4f4512929d00d9377515c2dfe75d78109edce27c57d834
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c17ee146b384be81a4b9cb06960728dd540d6650a5798abcc95315bb0daf2ca
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:497bbea1882c42d134dc4458194d71cd3d7d609b06e91e715093e0c243962116
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7fbc23c909758daf76a1d647d1beefb4c3cc62a4aa04f98679e22d79cc6813e
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef4f8de0f891e6d79255af98deda246f04c0775694835f696a1a8b0738f492da
+ size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:844adcbf23618ae38fbffaf92f7d38ce7d853be5643485bb25f8f839c0f2819c
+ size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1823cbba03a4ec4683cc6a41eab51e34cec90e92cea7af388d0c675abe451284
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9da7a13da9a196108d2efd875884aa8629533e8143255eef5915417ac592d9c0
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31c8fb0c121f6977e10c7277544259157152d28de9559c8aae8236925398329f
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90fd4f646b1f5ca201781cc77b713093ab9a67d4ee8de11c669a486a2896d773
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:368a3142cb9f085a2da1db74de226b13c509467cbea81da25f27db8842347443
+ size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48e00b1107d1e575c2425fa8368e92eb714b59825153206ae4cccc36eb4e8e45
+ size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c3300f1d0af46ca69fc14397728055e302b2955b8b9adfd9705b68a683377b1
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0d2409179997ff51079156414cb112c82b964976a8023f5088b1dd7ab28f50c
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c51efd448a50b2c75046bfb12f2703ce19e56b07f4f9e94f7a36f673c70517b8
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe220fd602b0f41f30f7bca607c400adacadb7b5e31f81a28d7e103fd5c0b0a8
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa8d2abd973723ddb4ea4cb2188fa767ea55168bc2519e170e728619fde864c4
+ size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeeb6ae12119f5ffd390add932122f819d81d50886389567eb41103451b36d24
+ size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55a66303f00f120db3fba8ae4d13725fc2c22cd6fd3babc1d66dc0fdee7eb45f
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9fe17a044a248163b4c45783386f7d414e6217ae9657c8983d54a84e85aae8e
+ size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b727021f1c6625dae8fb240904bf838985311afe7b5e19f62839c3563072e75a
+ size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5f069023e761eacf5747a540516a9215c49e3efc8614ffaa7fa4ca016c67075
+ size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:993eebcd3451163de4981bd6f7cd82a2bc0bfcc29a67cbbedceb502a5036466c
+ size 117440752