#!/bin/bash
# dsv3_0.5b / launch_script / run_1node_hybrid_mamba_pretrain.sh
# ------------------
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export USER=$(whoami)
source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate
# ------------------
set -eo pipefail
# ------------------
cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/YuLan-Pretrain/scripts/pretrain
LAUNCH_SCRIPT_PATH="$(realpath "$0")" \
DATA_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/datasets/huggingface/Teaven/combine_2B_0908/binidx" \
OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace" \
BATCH_SIZE=1 GLOBAL_BATCH_SIZE=1024 \
TRAIN_TOKENS=2_000_000_000 LR_WARMUP_TOKENS=100_000_000 SAVE_TOKENS=1_000_000_000 \
LR_DECAY_STYLE='linear' LR_DECAY_TOKENS=2_000_000_000 \
LR=2e-5 MIN_LR=7e-7 \
MP_SIZE=2 PP_SIZE=1 CP_SIZE=1 \
TOKENIZER_TYPE="hf_tokenizer_yulan_mini" \
ACTIVATION_CHECKPOINT='true' \
NAME_PREFIX='dev-' \
HYBRID_ATTN=0.0625 \
HYBRID_MLP_RATIO=0.5 \
MAMBA_HEAD_DIM=64 \
MAMBA_NUM_GROUPS=6 \
MAMBA_STATE_DIM=320 \
MAMBA_EXPAND=1 \
NUM_LAYERS=112 \
MODEL_SIZE='2.9b' \
HIDDEN_SIZE=1920 \
NUM_ATTN_HEADS=30 \
NUM_QUERY_GROUPS=6 \
ROTARY_BASE=10000 \
MOE_FFN_HIDDEN_SIZE=4800 \
NUM_EXPERTS=0 \
SEQ_LEN=4096 \
TIE_EMBEDDING=false \
FREEZE_NON_MAMBA=false \
LOAD_FROM_CHECKPOINT='attn_mamba' \
HYBRID_OVERRIDE_PATTERN_TYPE='A0' \
CHECKPOINT_LOAD_PATH='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/distill/L56-D1920-qwen_mamba2_qwen2-e1-i1920-s320-hd64-gn6-A0-S512-step1/rwkv-final-hf-A7-0_8_16_24_32_40_48/megatron-pp1-tp2' \
EXTRA_ARGS="--log-params-norm --no-save-step-one --ckpt-format torch --encoder-tensor-model-parallel-size $MP_SIZE --no-load-optim --no-load-rng" \
bash mamba_moe_0.5b_pretrain_template.sh
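# The token budgets above use underscore separators (e.g. 2_000_000_000); the
# template is assumed to strip them before converting tokens to iterations.
# A minimal sketch of that conversion (variable names here are illustrative,
# not read by the template):
#   train_tokens=${TRAIN_TOKENS//_/}                   # drop underscores
#   tokens_per_iter=$(( GLOBAL_BATCH_SIZE * SEQ_LEN )) # 1024 * 4096 = 4194304
#   train_iters=$(( train_tokens / tokens_per_iter ))  # 2e9 tokens -> ~476 iters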
# Variables to double-check per run: SEQ_LEN, ROTARY_BASE, MAMBA_STATE_DIM, MODEL_SIZE, LOAD_FROM_CHECKPOINT, CHECKPOINT_LOAD_PATH, HYBRID_ATTN
# LOAD_FROM_CHECKPOINT = none / attn_only / attn_mamba
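# (interpretation, not verified against the template: 'none' trains from
# scratch, 'attn_only' initializes only attention weights from
# CHECKPOINT_LOAD_PATH, 'attn_mamba' initializes attention and Mamba weights)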
# argparse help for --hybrid-override-pattern (quoted from the training code):
# group.add_argument('--hybrid-override-pattern', type=str, default=None,
#                    help='Force a specific hybrid layer pattern. The value '
#                         'should be a string of characters chosen from '
#                         'core.ssm.mamba_hybrid_layer_allocation.Symbols. '
#                         'If a value greater than 0.0 is supplied to any of the '
#                         'hybrid ratio arguments, then the number of each type '
#                         'of layer in the override pattern must match the number '
#                         'in the overridden pattern')
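# Pattern symbols (per Megatron's core.ssm.mamba_hybrid_layer_allocation.Symbols):
#   'M' = Mamba layer, '*' = attention layer, '-' = MLP layer.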
# M0 type:
# M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-
# A0 type:
# *-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# A01 type:
# *-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# M01 type:
# M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-
# Nemo_A7_M49_F49
# M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-
# yulanmini
# *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# otherwise, or if the argument is not set:
# No override
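# Sanity-check sketch for a custom pattern (commented out so it never runs as
# part of the launch; 'pattern' is a hypothetical local variable):
#   pattern='*-M-M-M-M-M-M-M-'   # paste a full pattern string here
#   echo "length: ${#pattern} (should equal NUM_LAYERS)"
#   echo "attention layers: $(tr -cd '*' <<< "$pattern" | wc -c)"
#   echo "mamba layers:     $(tr -cd 'M' <<< "$pattern" | wc -c)"
#   echo "mlp layers:       $(tr -cd '-' <<< "$pattern" | wc -c)"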