#!/usr/bin/env bash
# Environment setup for a single airoboros-lmoe expert QLoRA training run.
# Usage: <script> EXPERT MODEL_SIZE BATCH_SIZE CUDA_DEVICES
#   EXPERT       - expert/adapter name (selects expert_$EXPERT.jsonl dataset)
#   MODEL_SIZE   - llama-2 size tag, e.g. "7b" or "13b"
#   BATCH_SIZE   - per-device train batch size
#   CUDA_DEVICES - value for CUDA_VISIBLE_DEVICES, e.g. "0" or "0,1"
set -euo pipefail

export EXPERT="${1-}"
export MODEL_SIZE="${2-}"
export BATCH_SIZE="${3-}"
export CUDA_VISIBLE_DEVICES="${4-}"
export BASE_DIR=/workspace
# NOTE(review): avoid hardcoding secrets in scripts — prefer exporting
# WANDB_API_KEY from the calling environment or a secrets file.
export WANDB_API_KEY="[redacted]"
# W&B project doubles as the output directory name below.
export WANDB_PROJECT="airoboros-lmoe-$MODEL_SIZE-2.1-$EXPERT"
# Launch QLoRA fine-tuning for this expert.
# Fixes vs. original: "pyt" -> "python" (presumably a typo or a private
# alias — confirm if "pyt" was intentional), and
# --per_device_train_batch_size used $BASE_SIZE, which is never defined;
# the intended variable is $BATCH_SIZE ($3).
python qlora.py \
    --model_name_or_path "$BASE_DIR/llama-2-$MODEL_SIZE-hf" \
    --output_dir "$BASE_DIR/$WANDB_PROJECT" \
    --num_train_epochs 3 \
    --logging_steps 1 \
    --save_strategy steps \
    --save_steps 100 \
    --save_total_limit 1 \
    --data_seed 11422 \
    --evaluation_strategy no \
    --eval_dataset_size 2 \
    --max_new_tokens 4096 \
    --dataloader_num_workers 3 \
    --logging_strategy steps \
    --remove_unused_columns False \
    --do_train \
    --lora_r 64 \
    --lora_alpha 16 \
    --lora_modules all \
    --bf16 \
    --bits 4 \
    --double_quant \
    --quant_type nf4 \
    --warmup_ratio 0.03 \
    --lr_scheduler_type constant \
    --dataset "airoboros-lmoe-2.1/expert_$EXPERT.jsonl" \
    --dataset_format airoboros \
    --model_max_len 4096 \
    --per_device_train_batch_size "$BATCH_SIZE" \
    --learning_rate 0.00017 \
    --adam_beta2 0.999 \
    --max_grad_norm 0.3 \
    --lora_dropout 0.05 \
    --weight_decay 0.0 \
    --seed 11422 \
    --report_to wandb \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing