#!/bin/bash
#
# Launches a one-step, full-parameter `swift sft` fine-tuning run of
# Qwen/Qwen2.5-VL-7B-Instruct across 8 local GPUs. Results are written under
# output/timing_test_single_sample.
#
# Usage: run from the directory that contains datasets/ and output/, since
# every path below is relative.
#
# The script's exit status is the exit status of `swift sft`.

# --- Environment handed to the training processes ---------------------------

# PyTorch CUDA allocator option. This must be exported: a plain assignment
# stays private to this shell and would never reach the child processes.
export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'

# Single node, eight GPUs, one process per GPU.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NPROC_PER_NODE=8

# All three pixel limits share one value (50176 = 224 * 224).
# NOTE(review): presumably these cap per-image / per-frame resolution inside
# the multimodal preprocessor -- confirm against the ms-swift docs.
export VIDEO_MAX_PIXELS=50176
export IMAGE_MAX_PIXELS=50176
export MAX_PIXELS=50176

# Rendezvous settings for the distributed launch.
# NOTE(review): WORLD_SIZE and RANK are normally assigned per process by the
# launcher; verify that exporting them here is actually required.
export MASTER_ADDR=localhost
export MASTER_PORT=29500
export WORLD_SIZE=8
export RANK=0

# Sequence-parallel tuning knobs read from the environment.
# NOTE(review): semantics of CELOSS_PARALLEL_SIZE are not visible from this
# script -- check the ms-swift sequence-parallel documentation.
export SEQUENCE_PARALLEL_IMPL=ulysses
export CELOSS_PARALLEL_SIZE=16384

# --- `swift sft` arguments --------------------------------------------------
# Kept in an array so each flag/value pair stays one word and the list can
# carry comments (impossible inside a backslash-continued command line).
swift_args=(
  # Model, training mode and parallelism.
  --model Qwen/Qwen2.5-VL-7B-Instruct
  --train_type full
  --deepspeed zero3
  --attn_impl flash_attn
  --sequence_parallel_size 4

  # Nothing is frozen: vision tower, aligner and LLM are all trainable.
  --freeze_vit false
  --freeze_aligner false
  --freeze_llm false
  --use_hf true

  # Dataset. The '#32' suffix is part of the dataset spec and must stay quoted
  # together with the path.
  --dataset 'datasets/jsonl/filter_llava_350_550#32'
  --dataset_num_proc 4
  --split_dataset_ratio 0.001

  # Schedule: a single step, checkpointed immediately.
  --save_strategy steps
  --save_steps 1
  --torch_dtype bfloat16
  --max_steps 1
  --per_device_train_batch_size 1
  --per_device_eval_batch_size 1
  --learning_rate 5e-6
  --gradient_accumulation_steps 1
  --eval_steps 1000
  --save_total_limit 5
  --logging_steps 1
  --max_length 76000
  --warmup_ratio 0.05
  --dataloader_num_workers 1

  # Memory / kernel options.
  --gradient_checkpointing true
  --max_grad_norm 1.0
  --use_liger_kernel true
  --loss_scale default
  --load_from_cache_file true
  --save_safetensors true

  # Logging and output locations (relative to the current directory).
  --report_to tensorboard
  --logging_dir output/runs
  --output_dir output/timing_test_single_sample
)

swift sft "${swift_args[@]}"