CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model "Qwen/Qwen2.5-0.5B-Instruct" \
    --train_type "lora" \
    --dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \
    --torch_dtype "bfloat16" \
    --num_train_epochs "1" \
    --per_device_train_batch_size "1" \
    --learning_rate "1e-4" \
    --lora_rank "8" \
    --lora_alpha "32" \
    --target_modules "all-linear" \
    --gradient_accumulation_steps "16" \
    --save_steps "50" \
    --save_total_limit "5" \
    --logging_steps "5" \
    --max_length "2048" \
    --eval_strategy "steps" \
    --eval_steps "5" \
    --per_device_eval_batch_size "5" \
    --eval_use_evalscope \
    --eval_dataset "gsm8k" \
    --eval_dataset_args '{"gsm8k": {"few_shot_num": 0}}' \
    --eval_limit "10"
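
The same run can also be driven from Python via ms-swift's `sft_main` entry point in `swift.llm`. Below is a minimal sketch, assuming the CLI flags above map one-to-one onto `TrainArguments` fields of the same name (true for recent ms-swift releases, but check your installed version with `swift sft --help` if a field is rejected):

# Minimal sketch: the same LoRA + EvalScope run from Python instead of the CLI.
# Assumption: every CLI flag corresponds to a TrainArguments field of the same
# name; the exact accepted types (list vs. str, dict vs. JSON string) may vary
# by ms-swift version.
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # pin the run to GPU 0, as in the shell command

from swift.llm import sft_main, TrainArguments

result = sft_main(TrainArguments(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    train_type='lora',
    dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100'],  # '#100' samples 100 rows
    torch_dtype='bfloat16',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    learning_rate=1e-4,
    lora_rank=8,
    lora_alpha=32,
    target_modules=['all-linear'],
    gradient_accumulation_steps=16,
    save_steps=50,
    save_total_limit=5,
    logging_steps=5,
    max_length=2048,
    eval_strategy='steps',
    eval_steps=5,
    per_device_eval_batch_size=5,
    eval_use_evalscope=True,             # run EvalScope benchmarks at each eval step
    eval_dataset=['gsm8k'],
    eval_dataset_args={'gsm8k': {'few_shot_num': 0}},  # zero-shot GSM8K
    eval_limit=10,                       # evaluate on only 10 samples for speed
))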