# lingua-run10-mla-nope / config.yaml
# Uploaded by ttj using the upload-large-folder tool; commit cfabcfa (verified).
# Run identity and top-level training loop settings.
name: large_lm
dump_dir: ./dump_dir_llama1b2-mla-nope
seed: 777
grad_acc_steps: 4
# gc_collect_freq: run garbage collection every N steps — presumably; confirm against trainer.
gc_collect_freq: 1000
probe_freq: null
steps: 60000
# Dataloader configuration. Indentation restored: sources and tokenizer are
# nested mappings under `data`, not top-level keys.
data:
  root_dir: ./data
  # Relative sampling weights per source (single source at 100%).
  sources:
    fineweb_edu_10bt_shuffled: 100.0
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  tokenizer:
    name: tiktoken
    path: tokenizers/cl100k_base.tiktoken
# Optimizer and LR-schedule configuration. Indentation restored: all keys
# below belong under `optim`.
optim:
  lr: 0.003
  weight_decay: 0.033
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  # clip: gradient clipping threshold — presumably grad-norm; confirm against optimizer code.
  clip: 1.0
  scheduler: cosine
  warmup: 5000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
# Model architecture. Indentation restored: all keys below belong under `model`.
# rope_type `none` plus use_mla `simple` indicate an MLA, NoPE variant
# (matches the run name "mla-nope").
model:
  dim: 2048
  n_layers: 25
  head_dim: 128
  n_heads: 48
  n_kv_heads: 48
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  rope_type: none
  rope_inv_freq_learnable: false
  max_seqlen: 4096
  use_mla: simple
  # Low-rank projection ranks for MLA queries / shared KV.
  q_lora_rank: 1536
  kv_lora_rank: 512
  seed: 42
  vocab_size: 100512
  weight_tying: false
  sliding_window: null
# Distributed-training settings. Indentation restored: all keys below belong
# under `distributed`.
distributed:
  dp_shard: 1
  dp_replicate: 4
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Regex selecting modules for float8; quoted so the backslashes and brackets
  # are unambiguously a literal string (parsed value unchanged).
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: true
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
# Environment variables exported for the run. Indentation restored; values
# quoted consistently as strings (env vars are always strings to the consumer).
env:
  MKL_SERVICE_FORCE_INTEL: 'GNU'
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: 'INFO'
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
# Checkpointing. Indentation restored: `dump` and `eval` are nested mappings
# each holding an every/keep pair.
checkpoint:
  dump:
    every: 2500
    keep: 3
  eval:
    # A 5e9-step interval effectively disables eval checkpoints.
    every: 5000000000
    keep: -1
  path: dump_dir_llama1b2-mla-nope/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false
# Profiler settings. Indentation restored: all keys below belong under `profiling`.
profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
# Logging settings. Indentation restored: all keys below belong under `logging`.
logging:
  freq: 1
  acc_freq: null
  wandb: null
# async_eval_gpus: GPUs reserved for asynchronous evaluation — presumably; confirm against trainer.
async_eval_gpus: 1
# Evaluation configuration. Indentation restored: the task list mixes plain
# task names with per-task mappings that carry dataset_kwargs.
eval:
  harness:
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - copa
  validation:
    max_steps: 1000
  generator:
    max_tokens: 16384
    dtype: bf16