""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.

Usage:
```
python config_minicpm.py
```
"""
import os
from dataclasses import dataclass
from typing import Optional

from nanotron.config import (
    CheckpointsArgs,
    Config,
    DataArgs,
    GeneralArgs,
    LoggingArgs,
    LRSchedulerArgs,
    ModelArgs,
    OptimizerArgs,
    ParallelismArgs,
    PretrainDatasetsArgs,
    RandomInit,
    TokenizerArgs,
    TokensArgs,
)
from nanotron.logging import human_format


@dataclass
class MiniCPMConfig:
    """Configuration for a MiniCPM model.

    Be careful on having a coherent typing as we use it to reconstruct the model from yaml
    """

    attn_pdrop: float = 0.0
    bos_token_id: int = 1
    eos_token_id: int = 2
    pad_token_id: Optional[int] = None
    hidden_act: str = "silu"
    hidden_size: int = 2304
    initializer_range: float = 0.1
    intermediate_size: int = 5760
    max_position_embeddings: int = 2048
    num_attention_heads: int = 36
    num_hidden_layers: int = 40
    num_key_value_heads: int = 36
    pretraining_tp: int = 1
    rms_norm_eps: float = 1e-05
    rope_theta: float = 10000.0
    tie_word_embeddings: bool = True
    use_cache: bool = True
    vocab_size: int = 122753
    scale_emb: float = 12.0
    dim_model_base: int = 256
    scale_depth: float = 1.4

    def __post_init__(self):
        # for backward compatibility
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
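
# The MiniCPM-specific fields above (`scale_emb`, `dim_model_base`, `scale_depth`)
# drive MiniCPM's mup-style scaling, as described in the MiniCPM report:
# embedding outputs are multiplied by `scale_emb`, each residual-branch output
# is scaled by `scale_depth / sqrt(num_hidden_layers)`, and logits are divided
# by `hidden_size / dim_model_base`.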

def get_num_params(model_config: MiniCPMConfig) -> int:
    head_dim = model_config.hidden_size // model_config.num_attention_heads
    num_params = (
        model_config.vocab_size * model_config.hidden_size * 2  # input embeddings + LM head
        + model_config.num_hidden_layers
        * (
            3 * model_config.hidden_size * model_config.intermediate_size  # gated MLP: gate/up/down projections
            + 2 * model_config.hidden_size * model_config.hidden_size  # attention: query/output projections
            + 2 * model_config.hidden_size * head_dim * model_config.num_key_value_heads  # attention: key/value projections
        )
    )
    return num_params

def get_num_params_no_embed(model_config: MiniCPMConfig) -> int:
    head_dim = model_config.hidden_size // model_config.num_attention_heads
    num_params = model_config.num_hidden_layers * (
        3 * model_config.hidden_size * model_config.intermediate_size  # gated MLP: gate/up/down projections
        + 2 * model_config.hidden_size * model_config.hidden_size  # attention: query/output projections
        + 2 * model_config.hidden_size * head_dim * model_config.num_key_value_heads  # attention: key/value projections
    )
    return num_params

MODEL_CONFIG = MiniCPMConfig()

num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
num_params_no_embed = human_format(get_num_params_no_embed(MODEL_CONFIG)).replace(".", "p")

print(f"Model has {num_params} parameters or {num_params_no_embed} without embeddings")

PARALLELISM = ParallelismArgs(
    dp=1,
    pp=1,
    tp=1,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)
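
# dp * pp * tp must match the number of processes you launch; everything is 1
# here, so the generated config targets a single device (scale these together
# with your `torchrun --nproc_per_node` value).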

CONFIG = Config(
    general=GeneralArgs(project="openbmb", run="MiniCPM-2B-dpo-bf16", seed=42, step=0),
    checkpoints=None,
    parallelism=PARALLELISM,
    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
    tokenizer=TokenizerArgs("openbmb/MiniCPM-2B-dpo-bf16"),
    optimizer=None,
    logging=None,
    tokens=None,
    data=None,
    profiler=None,
    lighteval=None,
)
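
# `optimizer`, `tokens` and `data` are left as None here, so the generated YAML
# only pins down the model, tokenizer and parallelism; fill in those sections
# before launching an actual training run.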

if __name__ == "__main__":
    file_path = os.path.abspath(__file__).replace(".py", ".yaml")
    # Save config as YAML file
    CONFIG.save_as_yaml(file_path)

    # You can now train a model with this config using `/run_train.py`
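    # A hedged example of such a run (assuming nanotron's usual torchrun entry
    # point and `--config-file` flag; adjust the GPU count and the YAML path,
    # shown here as the hypothetical `config_minicpm.yaml`, to your setup):
    #   CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 run_train.py --config-file config_minicpm.yaml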