""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information. Usage: ``` python config_tiny_mistral.py ``` """ import os from dataclasses import dataclass from typing import Optional from nanotron.config import ( CheckpointsArgs, Config, DataArgs, GeneralArgs, LoggingArgs, LRSchedulerArgs, ModelArgs, OptimizerArgs, ParallelismArgs, PretrainDatasetsArgs, RandomInit, TokenizerArgs, TokensArgs, ) from nanotron.logging import human_format @dataclass class MiniCPMConfig: """Configuration for a MiniCPM model. Be careful on having a coherent typing as we use it to reconstruct the model from yaml """ attn_pdrop: float = 0.0 bos_token_id: int =1 eos_token_id: int =2 pad_token_id: Optional[int] = None hidden_act: str ="silu" hidden_size: int =2304 initializer_range: float =0.1 intermediate_size: int =5760 max_position_embeddings: int =2048 num_attention_heads: int =36 num_hidden_layers: int =40 num_key_value_heads: int =36 pretraining_tp: int=1 rms_norm_eps: float=1e-05 rope_theta: float = 10000.0 tie_word_embeddings: bool =True use_cache: bool =True vocab_size: int = 122753 scale_emb: float = 12 dim_model_base: int= 256 scale_depth: float = 1.4 def __post_init__(self): # for backward compatibility if self.num_key_value_heads is None: self.num_key_value_heads = self.num_attention_heads def get_num_params(model_config: MiniCPMConfig) -> int: num_params = model_config.vocab_size * model_config.hidden_size * 2 + \ model_config.num_hidden_layers * ( 3 * model_config.hidden_size * model_config.intermediate_size + 2 * model_config.hidden_size * model_config.hidden_size + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads)) ) return num_params def get_num_params_no_embed(model_config: MiniCPMConfig) -> int: num_params = model_config.num_hidden_layers * ( 3 * model_config.hidden_size * model_config.intermediate_size + 2 * model_config.hidden_size * model_config.hidden_size + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads)) ) return num_params MODEL_CONFIG = MiniCPMConfig() num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p") num_params_no_embed = human_format(get_num_params_no_embed(MODEL_CONFIG)).replace(".", "p") print(f"Model has {num_params} parameters or {num_params_no_embed} without embeddings") PARALLELISM = ParallelismArgs( dp=1, pp=1, tp=1, pp_engine="1f1b", tp_mode="REDUCE_SCATTER", tp_linear_async_communication=True, recompute_granularity="selective", ) CONFIG = Config( general=GeneralArgs(project="openbmb", run="MiniCPM-2B-dpo-bf16", seed=42, step=0), checkpoints=None, parallelism=PARALLELISM, model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG), tokenizer=TokenizerArgs("openbmb/MiniCPM-2B-dpo-bf16"), optimizer=None, logging=None, tokens=None, data=None, profiler=None, lighteval=None, ) if __name__ == "__main__": file_path = os.path.abspath(__file__) file_path = file_path.replace(".py", ".yaml") # Save config as YAML file CONFIG.save_as_yaml(file_path) # You can now train a model with this config using `/run_train.py`