name: large_lm | |
dump_dir: ./dump_dir_llama1b2-mla-nope | |
seed: 777 | |
grad_acc_steps: 4 | |
gc_collect_freq: 1000 | |
probe_freq: null | |
steps: 60000 | |
data: | |
root_dir: ./data | |
sources: | |
fineweb_edu_10bt_shuffled: 100.0 | |
batch_size: 4 | |
seq_len: 4096 | |
n_views: 2 | |
seed: 42 | |
add_bos: true | |
add_eos: true | |
load_async: true | |
prefetch_size: 1024 | |
tokenizer: | |
name: tiktoken | |
path: tokenizers/cl100k_base.tiktoken | |
optim: | |
lr: 0.003 | |
weight_decay: 0.033 | |
epsilon: 1.0e-08 | |
beta1: 0.9 | |
beta2: 0.95 | |
clip: 1.0 | |
scheduler: cosine | |
warmup: 5000 | |
lr_min_ratio: 1.0e-06 | |
cycle_length: 1.0 | |
cosine_theta: 1.0 | |
annealing_step: 1000 | |
decay_fraction: 0.1 | |
exp_factor: 0.5 | |
model: | |
dim: 2048 | |
n_layers: 25 | |
head_dim: 128 | |
n_heads: 48 | |
n_kv_heads: 48 | |
ffn_dim_multiplier: null | |
multiple_of: 256 | |
norm_eps: 1.0e-05 | |
rope_theta: 10000.0 | |
init_base_std: null | |
init_std_factor: disabled | |
rope_type: none | |
rope_inv_freq_learnable: false | |
max_seqlen: 4096 | |
use_mla: simple | |
q_lora_rank: 1536 | |
kv_lora_rank: 512 | |
seed: 42 | |
vocab_size: 100512 | |
weight_tying: false | |
sliding_window: null | |
distributed: | |
dp_shard: 1 | |
dp_replicate: 4 | |
tp_size: 1 | |
selective_activation_checkpointing: false | |
compile: true | |
fsdp_type: full_shard | |
model_dtype: bf16 | |
float8_recipe: null | |
float8_filter: layers\.[0-9]+\. | |
matmul_allow_tf32: true | |
detect_anomaly: false | |
compile_cache_size_limit: 8 | |
spawn_method: forkserver | |
env: | |
MKL_SERVICE_FORCE_INTEL: GNU | |
OMP_NUM_THREADS: '1' | |
MKL_NUM_THREADS: '1' | |
ENABLE_INTRA_NODE_COMM: '1' | |
TORCH_NCCL_AVOID_RECORD_STREAMS: '1' | |
NCCL_IB_TIMEOUT: '22' | |
NCCL_DEBUG: INFO | |
TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' | |
checkpoint: | |
dump: | |
every: 2500 | |
keep: 3 | |
eval: | |
every: 5000000000 | |
keep: -1 | |
path: dump_dir_llama1b2-mla-nope/checkpoints | |
init_ckpt_path: null | |
continue_training_from_init: false | |
profiling: | |
run: true | |
trace_folder: profiling | |
mem_warmup: 0 | |
mem_steps: 4 | |
profile_warmup: 100 | |
profile_steps: 4 | |
logging: | |
freq: 1 | |
acc_freq: null | |
wandb: null | |
async_eval_gpus: 1 | |
eval: | |
harness: | |
tasks: | |
- hellaswag | |
- task: boolq | |
dataset_kwargs: | |
trust_remote_code: true | |
- piqa | |
- task: social_iqa | |
dataset_kwargs: | |
trust_remote_code: true | |
- winogrande | |
- openbookqa | |
- arc_easy | |
- arc_challenge | |
- race | |
- commonsense_qa | |
- copa | |
validation: | |
max_steps: 1000 | |
generator: | |
max_tokens: 16384 | |
dtype: bf16 | |