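# Web-page header captured together with the script; kept as comments so the
# shell does not try to execute it.
# 0420upload / 0417train.sh
# Prummn's picture
# Add files using upload-large-folder tool
# 03cb542 verified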
######################
# Dataset locations (JSONL files consumed by --dataset / --val_dataset below)
######################
TRAIN_DATA="/mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload/lora_0323_10w+55w+error+syn_with_domain_train90_targeted_rl_train90_loramerged_basewer_3suppress_server.jsonl"
VAL_DATA="/mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload/lora_0323_10w+55w+error+syn_with_domain_train90_targeted_rl_val5_sample5p_server.jsonl" # <- the newly added validation set

# Address/port exported for the launcher.
# NOTE(review): presumably the torch.distributed rendezvous endpoint for a
# single-node run (loopback address) - confirm against the launcher.
export MASTER_PORT=29540
export MASTER_ADDR=127.0.0.1

######################
# 0. Basic environment variables (wandb)
######################
export WANDB_BASE_URL="https://api.wandb.ai"

# SECURITY: the wandb API key must never be committed to this script. Supply it
# from the calling environment (export WANDB_API_KEY=...) or via a prior
# `wandb login`. The key that used to be hard-coded here has been published
# and should be revoked.
if [ -z "${WANDB_API_KEY:-}" ]; then
  echo "warning: WANDB_API_KEY is not set; wandb will need credentials from 'wandb login'" >&2
else
  export WANDB_API_KEY
fi

export WANDB_PROJECT="qwen3_asr_swift_dapo" # matches the project name in the screenshot
export WANDB_ENTITY="pang_kaiyu-none" # matches the Entity in the screenshot

# Original note: have wandb use only one process to write logs during
# multi-GPU training (optional).
# NOTE(review): WANDB_MODE=online appears to only select online syncing -
# confirm that it has the single-writer effect described above.
export WANDB_MODE=online

# NOTE(review): presumably the number of worker processes per node; the run
# name used below ends in "4GPU" - confirm.
export NPROC_PER_NODE=4
export SWIFT_SINGLE_DEVICE_MODE=1

# PyTorch CUDA caching-allocator option.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Reward debugging switches.
# NOTE(review): presumably read by the external reward plugin passed to
# --external_plugins; the path sits under this run's output directory.
export ASR_REWARD_DEBUG=1
export ASR_REWARD_DEBUG_PATH=/mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload/out/qwen3asr_dapo_reward56_4x4x12_12gen_4GPU/reward_debug
export ASR_REWARD_DEBUG_MAX_ROWS=1000
######################
# Launch training: `swift rlhf` with --rlhf_type grpo and --loss_type dapo,
# LoRA training (--train_type lora) on the model given by --model.
#
# - The two files passed to --external_plugins are loaded as plugins
#   (NOTE(review): presumably they register the my_qwen3_asr_rl model
#   type/template and the reward function named in --reward_funcs - confirm).
# - TRAIN_DATA and VAL_DATA must be set earlier in this script. They are
#   quoted so the path is passed as a single argument, and guarded with :?
#   so the launch aborts instead of running with an empty dataset path.
# - Checkpoints are saved every 20 steps (--save_steps) and metrics are
#   logged every 5 steps (--logging_steps), reported to wandb.
#
# Comments cannot be placed between the continuation lines below: a comment
# would end the command at that point.
######################
swift rlhf \
  --rlhf_type grpo \
  --external_plugins /mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload/my_qwen3_asr_dapo_register.py /mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload/0417_reward.py \
  --model /mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/qwen3-asr-merged \
  --model_type my_qwen3_asr_rl \
  --template my_qwen3_asr_rl \
  --dataset "${TRAIN_DATA:?TRAIN_DATA must be set to the training JSONL path}" \
  --val_dataset "${VAL_DATA:?VAL_DATA must be set to the validation JSONL path}" \
  --reward_funcs asr_wer_sub_len_cmp_hallu_dirty_v56 \
  --train_type lora \
  --use_vllm false \
  --log_completions true \
  --loss_type dapo \
  --advantage_estimator grpo \
  --scale_rewards group \
  --num_iterations 2 \
  --beta 0.04 \
  --epsilon_high 0.28 \
  --dynamic_sample true \
  --max_resample_times 4 \
  --overlong_filter true \
  --truncation_strategy delete \
  --num_generations 12 \
  --generation_batch_size 48 \
  --per_device_train_batch_size 4 \
  --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 12 \
  --num_generations_eval 4 \
  --max_completion_length 256 \
  --temperature 0.50 \
  --top_p 0.95 \
  --top_k 50 \
  --repetition_penalty 1.08 \
  --learning_rate 5e-5 \
  --lr_scheduler_type cosine \
  --warmup_ratio 0.03 \
  --report_to wandb \
  --run_name qwen3asr_dapo_reward56_4x4x12_12gen_4GPU \
  --output_dir /mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload/out/qwen3asr_dapo_reward56_4x4x12_12gen_4GPU \
  --save_strategy steps \
  --save_steps 20 \
  --logging_steps 5 \
  --freeze_llm false \
  --freeze_vit false \
  --freeze_aligner false \
  --remove_unused_columns false \
  --padding_side left