max_seq_len: 8192
global_seed: 17

# Run Name
run_name: mpt-30b-orca-1ep_flan3m # If left blank, will be read from env var $RUN_NAME

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mosaicml/mpt-30b
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false

# Tokenizer
tokenizer:
  name: mosaicml/mpt-30b
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_3m_gpt3.5
    preprocessing_fn:
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # to profile this run's optimal packing_ratio as it depends on GPU count,
    # batch size, sequence length
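    # Illustrative example of the profiling command above (a sketch only; the yaml path
    # and any flag other than --yaml-path are assumptions, so check the script's --help
    # in your llm-foundry version):
    #   python llmfoundry/data/packing.py --yaml-path ./mpt-30b-orca-1ep_flan3m.yaml --num-devices 8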
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Optimization
scheduler:
  name: linear_decay_with_warmup # linear decay with no warmup is the HF default that Dolly used
  t_warmup: 100ba # add some warmup, though; it seems to help with MPT
  alpha_f: 0
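  # For reference: alpha_f is the final LR multiplier, so the schedule warms up over
  # t_warmup and then decays linearly from lr to lr * alpha_f (0 here, i.e. 2.0e-6 -> 0).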

optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 2.0e-6
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-8
  weight_decay: 0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep # 2-3 epochs seems like the sweet spot
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
global_train_batch_size: 8 # somewhere in the 6-8 * num_gpus range seems good

# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
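# For reference, Composer derives gradient accumulation from the values above:
#   global_train_batch_size = device_train_microbatch_size * num_gpus * grad_accum_steps
# e.g. with 4 GPUs: 8 = 2 * 4 * 1 (illustrative; the trainer computes this automatically).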
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
# save_interval: 5000ba
save_num_checkpoints_to_keep: 1 # Important: this cleans up older checkpoints saved to local disk
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints