# ------------------
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export USER=$(whoami)
source /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/.venv/bin/activate
# ------------------
set -eo pipefail
# ------------------
cd /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/YuLan-Pretrain/scripts/pretrain
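# ------------------
# Rough pre-launch sanity check (an assumption-level sketch, not read from the
# template): if mamba_moe_0.5b_pretrain_template.sh strips the '_' digit separators
# and schedules optimizer steps as TOKENS / (GLOBAL_BATCH_SIZE * SEQ_LEN), the token
# budgets below imply roughly these step counts.
echo "train steps  ~ $(( 2000000000 / (1024 * 4096) ))"  # TRAIN_TOKENS     -> ~476
echo "warmup steps ~ $((  100000000 / (1024 * 4096) ))"  # LR_WARMUP_TOKENS -> ~23
echo "save every   ~ $(( 1000000000 / (1024 * 4096) ))"  # SAVE_TOKENS      -> ~238
# ------------------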
LAUNCH_SCRIPT_PATH="$(realpath "$0")" \
DATA_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/datasets/huggingface/Teaven/combine_2B_0908/binidx" \
OUTPUT_CHECKPOINT_PATH="/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/megatron_lm_workspace" \
BATCH_SIZE=1 GLOBAL_BATCH_SIZE=1024 \
TRAIN_TOKENS=2_000_000_000 LR_WARMUP_TOKENS=100_000_000 SAVE_TOKENS=1_000_000_000 \
LR_DECAY_STYLE='linear' LR_DECAY_TOKENS=2_000_000_000 \
LR=2e-5 MIN_LR=7e-7 \
MP_SIZE=2 PP_SIZE=1 CP_SIZE=1 \
TOKENIZER_TYPE="hf_tokenizer_yulan_mini" \
ACTIVATION_CHECKPOINT='true' \
NAME_PREFIX='dev-' \
HYBRID_ATTN=0.0625 \
HYBRID_MLP_RATIO=0.5 \
MAMBA_HEAD_DIM=64 \
MAMBA_NUM_GROUPS=6 \
MAMBA_STATE_DIM=320 \
MAMBA_EXPAND=1 \
NUM_LAYERS=112 \
MODEL_SIZE='2.9b' \
HIDDEN_SIZE=1920 \
NUM_ATTN_HEADS=30 \
NUM_QUERY_GROUPS=6 \
ROTARY_BASE=10000 \
MOE_FFN_HIDDEN_SIZE=4800 \
NUM_EXPERTS=0 \
SEQ_LEN=4096 \
TIE_EMBEDDING=false \
FREEZE_NON_MAMBA=false \
LOAD_FROM_CHECKPOINT='attn_mamba' \
HYBRID_OVERRIDE_PATTERN_TYPE='A0' \
CHECKPOINT_LOAD_PATH='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe-dev/cache/models/distill/L56-D1920-qwen_mamba2_qwen2-e1-i1920-s320-hd64-gn6-A0-S512-step1/rwkv-final-hf-A7-0_8_16_24_32_40_48/megatron-pp1-tp2' \
EXTRA_ARGS="--log-params-norm --no-save-step-one --ckpt-format torch --encoder-tensor-model-parallel-size $MP_SIZE --no-load-optim --no-load-rng" \
bash mamba_moe_0.5b_pretrain_template.sh
# Variables most often adjusted between runs: SEQ_LEN, ROTARY_BASE, MAMBA_STATE_DIM,
# MODEL_SIZE, LOAD_FROM_CHECKPOINT, CHECKPOINT_LOAD_PATH, HYBRID_ATTN
# LOAD_FROM_CHECKPOINT accepts: none / attn_only / attn_mamba
# Note: $MP_SIZE inside EXTRA_ARGS resolves correctly because bash performs the
# command-prefix assignments above left to right.
# --hybrid-override-pattern, from the Megatron-LM argument parser:
# group.add_argument('--hybrid-override-pattern', type=str, default=None,
#                    help='Force a specific hybrid layer pattern. The value '
#                         'should be a string of characters chosen from '
#                         'core.ssm.mamba_hybrid_layer_allocation.Symbols. '
#                         'If a value greater than 0.0 is supplied to any of the '
#                         'hybrid ratio arguments, then the number of each type '
#                         'of layer in the override pattern must match the number '
#                         'in the overridden pattern')
# Pattern symbols (core.ssm.mamba_hybrid_layer_allocation.Symbols), one character per
# layer: M = Mamba, * = attention, - = MLP.
# M0 type:
# M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-
# A0 type:
# *-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# A01 type:
# *-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-M-M-M-M-M-M-M-
# M01 type:
# M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-*-M-M-M-M-M-M-M-M-M-M-M-M-M-M-*-
# Nemo_A7_M49_F49 type:
# M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M*-M-M-M-M-M-M-M-
# yulanmini type:
# *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
# any other value, or the variable unset:
# no override
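# ------------------
# A minimal standalone sanity check (a sketch under the symbol assumptions above, not
# part of the template): the A0 pattern should line up with NUM_LAYERS=112,
# HYBRID_ATTN=0.0625 (7/112 attention layers), and HYBRID_MLP_RATIO=0.5 (56/112 MLP).
pattern=$(printf '*-M-M-M-M-M-M-M-%.0s' {1..7})      # 7 blocks of 16 layer symbols
echo "layers: ${#pattern}"                           # 112 == NUM_LAYERS
echo "attn  : $(tr -dc '*' <<< "$pattern" | wc -c)"  # 7   -> 7/112  = 0.0625
echo "mamba : $(tr -dc 'M' <<< "$pattern" | wc -c)"  # 49
echo "mlp   : $(tr -dc '-' <<< "$pattern" | wc -c)"  # 56  -> 56/112 = 0.5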