# mistral-nanotron / config_mistral_7b.py
""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.
Usage:
```
python config_tiny_mistral.py
```
"""
import os
from nanotron.config import (
    CheckpointsArgs,
    Config,
    DataArgs,
    GeneralArgs,
    LoggingArgs,
    LRSchedulerArgs,
    ModelArgs,
    OptimizerArgs,
    ParallelismArgs,
    PretrainDatasetsArgs,
    RandomInit,
    TokenizerArgs,
    TokensArgs,
)
from nanotron.logging import human_format
from config_mistral import MistralConfig, get_num_params

# Config for Mistral 7B
MODEL_CONFIG = MistralConfig(
    attn_pdrop=0.0,
    bos_token_id=1,
    eos_token_id=2,
    hidden_act="silu",
    hidden_size=4096,
    initializer_range=0.02,
    intermediate_size=14336,
    max_position_embeddings=32768,
    num_attention_heads=32,
    num_hidden_layers=32,
    num_key_value_heads=8,
    pretraining_tp=1,
    rms_norm_eps=1e-05,
    rope_theta=10000.0,
    sliding_window_size=4096,
    tie_word_embeddings=False,
    use_cache=True,
    vocab_size=32000,
)
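
# Note: num_key_value_heads=8 with num_attention_heads=32 gives grouped-query
# attention (each KV head is shared by 4 query heads), and sliding_window_size=4096
# enables Mistral's sliding-window attention over the 32768-token position range.
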
num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
print(f"Model has {num_params} parameters")

PARALLELISM = ParallelismArgs(
    dp=2,
    pp=2,
    tp=2,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)
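# With dp=2, pp=2 and tp=2, this layout expects dp * pp * tp = 8 GPUs in total.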

CONFIG = Config(
    general=GeneralArgs(project="mistralai", run="Mistral-7B-v0.1", seed=42, step=0),
    checkpoints=None,
    parallelism=PARALLELISM,
    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
    tokenizer=TokenizerArgs("mistralai/Mistral-7B-v0.1"),
    optimizer=None,
    logging=None,
    tokens=None,
    data=None,
    profiler=None,
    lighteval=None,
)
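
# The sections left as None above (optimizer, logging, tokens, data, ...) must be
# filled in before the generated YAML can actually drive a training run. The
# commented-out sketch below shows one way to do it; field names follow nanotron's
# `config_tiny_llama.py` example and may differ between nanotron versions, and all
# values are placeholders, so double-check them against your installed nanotron.
#
# CONFIG.optimizer = OptimizerArgs(
#     zero_stage=0,
#     weight_decay=0.1,
#     clip_grad=1.0,
#     accumulate_grad_in_fp32=True,
#     adam_eps=1e-08,
#     adam_beta1=0.9,
#     adam_beta2=0.95,
#     torch_adam_is_fused=True,
#     learning_rate_scheduler=LRSchedulerArgs(
#         learning_rate=3e-4,
#         lr_warmup_steps=2000,
#         lr_warmup_style="linear",
#         lr_decay_style="cosine",
#         min_decay_lr=3e-5,
#     ),
# )
# CONFIG.tokens = TokensArgs(
#     sequence_length=4096,
#     train_steps=1000,
#     micro_batch_size=1,
#     batch_accumulation_per_replica=8,
# )
# CONFIG.data = DataArgs(
#     dataset=PretrainDatasetsArgs(
#         hf_dataset_or_datasets="stas/openwebtext-10k",  # placeholder dataset
#         text_column_name="text",
#     ),
#     seed=42,
# )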

if __name__ == "__main__":
    file_path = os.path.abspath(__file__)
    file_path = file_path.replace(".py", ".yaml")
    # Save config as YAML file
    CONFIG.save_as_yaml(file_path)
    # You can now train a model with this config using `/run_train.py`
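    # For example, on a single 8-GPU node (command pattern from the nanotron README;
    # adjust paths and the number of processes to your setup):
    #   CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file config_mistral_7b.yaml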