""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information. |
|
|
|
Usage: |
|
``` |
|
python config_tiny_mistral.py |
|
``` |
|
""" |
|
import os
from dataclasses import dataclass
from typing import Optional

from nanotron.config import (
    CheckpointsArgs,
    Config,
    DataArgs,
    GeneralArgs,
    LoggingArgs,
    LRSchedulerArgs,
    ModelArgs,
    OptimizerArgs,
    ParallelismArgs,
    PretrainDatasetsArgs,
    RandomInit,
    TokenizerArgs,
    TokensArgs,
)
from nanotron.logging import human_format
|
|
@dataclass
class MistralConfig:
    """Configuration for a Mistral model.

    Be careful to keep the typing coherent, as it is used to reconstruct the model from the YAML config.
    """

    bos_token_id: int = 1
    eos_token_id: int = 2
    hidden_act: str = "silu"
    hidden_size: int = 4096
    initializer_range: float = 0.02
    intermediate_size: int = 11008
    is_mistral_config: bool = True
    max_position_embeddings: int = 2048
    num_attention_heads: int = 32
    num_hidden_layers: int = 32
    num_key_value_heads: Optional[int] = None
    pad_token_id: Optional[int] = None
    pretraining_tp: int = 1
    rms_norm_eps: float = 1e-6
    rope_scaling: Optional[dict] = None
    tie_word_embeddings: bool = False
    use_cache: bool = True
    vocab_size: int = 32000

    def __post_init__(self):
        # Default to standard multi-head attention when no key/value head count is given.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
|
|
model_config = MistralConfig(
    # A tiny model so the example trains quickly; this is meant for debugging, not real pretraining.
    bos_token_id=1,
    eos_token_id=2,
    hidden_act="silu",
    hidden_size=16,
    initializer_range=0.02,
    intermediate_size=64,
    max_position_embeddings=256,
    num_attention_heads=4,
    num_hidden_layers=2,
    num_key_value_heads=4,
    pretraining_tp=1,
    rms_norm_eps=1e-05,
    rope_scaling=None,
    tie_word_embeddings=True,
    use_cache=True,
    vocab_size=256,
)
|
# Rough parameter-count estimate: input and output embeddings (2 * vocab_size * hidden_size), plus,
# per layer, the gated MLP (3 * hidden_size * intermediate_size) and the attention projections
# (4 * hidden_size * hidden_size for Q, K, V and O). Norms and tied embeddings are not accounted for.
num_params = human_format(
    model_config.vocab_size * model_config.hidden_size * 2
    + model_config.num_hidden_layers
    * (
        3 * model_config.hidden_size * model_config.intermediate_size
        + 4 * model_config.hidden_size * model_config.hidden_size
    )
).replace(".", "p")

print(f"Model has {num_params} parameters")
|
seed = 42

learning_rate = LRSchedulerArgs(
    learning_rate=3e-4, lr_warmup_steps=2, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=1e-5
)
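# i.e. 2 steps of linear warmup, then a cosine decay of the learning rate down to min_decay_lr.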
|
|
|
optimizer = OptimizerArgs(
    zero_stage=0,
    weight_decay=0.01,
    clip_grad=1.0,
    accumulate_grad_in_fp32=True,
    adam_eps=1e-08,
    adam_beta1=0.9,
    adam_beta2=0.95,
    torch_adam_is_fused=True,
    learning_rate_scheduler=learning_rate,
)
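# zero_stage=0 keeps optimizer states fully replicated (no ZeRO sharding); gradients are still
# accumulated in fp32 (accumulate_grad_in_fp32=True) for numerical stability.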
|
|
|
parallelism = ParallelismArgs(
    dp=2,
    pp=2,
    tp=2,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)
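# With dp=2, pp=2 and tp=2, the launcher must provide dp * pp * tp = 8 processes (typically one per GPU).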
|
|
|
tokens = TokensArgs(sequence_length=32, train_steps=10, micro_batch_size=2, batch_accumulation_per_replica=1) |
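# Effective batch per optimizer step: micro_batch_size * batch_accumulation_per_replica * dp
# = 2 * 1 * 2 = 4 sequences of 32 tokens, i.e. 128 tokens per step.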
|
|
|
dataset = PretrainDatasetsArgs(
    hf_dataset_or_datasets="HuggingFaceH4/testing_alpaca_small", text_column_name="completion"
)
|
checkpoints_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "checkpoints")
os.makedirs(checkpoints_path, exist_ok=True)
|
config = Config(
    general=GeneralArgs(project="debug", run="tiny_mistral", seed=seed),
    checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
    parallelism=parallelism,
    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
    tokenizer=TokenizerArgs("gpt2"),
    optimizer=optimizer,
    logging=LoggingArgs(),
    tokens=tokens,
    data=DataArgs(dataset=dataset, seed=seed),
    profiler=None,
)
|
if __name__ == "__main__":
    # Save the configuration as a YAML file next to this script.
    script_dir = os.path.dirname(__file__)
    config.save_as_yaml(f"{script_dir}/config_tiny_mistral.yaml")
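    # To sanity-check the output, the YAML can be loaded back into a Config object with nanotron's
    # config loader (helper name and signature may vary between nanotron versions; sketch only):
    #
    #   from nanotron.config import get_config_from_file
    #   loaded = get_config_from_file(f"{script_dir}/config_tiny_mistral.yaml", config_class=Config)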