QLoRA fine-tuning

by sivan22

I was trying to fine-tune the model with LoRA and 4-bit quantization, using PEFT and SFTTrainer from TRL.
However, I wasn't sure which layers I should pick for training with LoRA (target_modules in the code) and which should be trained directly (modules_to_save in the code).
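
(For context: as I understand the PEFT API, target_modules receive low-rank adapter weights, while modules_to_save are trained in full precision and saved alongside the adapter. A minimal sketch of the distinction, using placeholder module names that are not verified against DictaLM's actual layers:)

from peft import LoraConfig

# Hypothetical illustration -- "query_key_value" and "lm_head" are placeholder
# names, not checked against DictaLM's module tree
sketch_config = LoraConfig(
    r=8,
    lora_alpha=16,
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],  # wrapped with low-rank A/B matrices
    modules_to_save=["lm_head"],         # trained directly and saved in full
)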

@Shaltiel , can you please direct me on this?

thanks!

The code:


import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
 
# Dataset
data_name = "Norod78/hewiki-20220901-articles-dataset"
training_data = load_dataset(data_name, split='train[0:1000]')
# Model and tokenizer names
base_model_name = "dicta-il/dictalm-7b"
refined_model = "dictalm-7b-finetuned"
 
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # SFTTrainer needs a pad token for batching
 
# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # NF4 is the QLoRA paper's default 4-bit type
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
 
# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0},
    trust_remote_code=True
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1
 
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    # PEFT matches these as name suffixes, so they cover every layer index;
    # a [0-31] regex would be a character class, not the range 0-31
    target_modules=["self_attention.dense", "mlp.dense_h_to_4h",
                    "mlp.dense_4h_to_h", "self_attention.query_key_value"],
    modules_to_save=None  # the correct keyword is modules_to_save, not save_modules
)
peft_model = get_peft_model(base_model, peft_parameters)
peft_model.print_trainable_parameters()
 
# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)
 
# Trainer
fine_tuning = SFTTrainer(
    model=peft_model,  # already wrapped with LoRA above, so no peft_config here
    train_dataset=training_data,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=train_params
)
 
# Training
fine_tuning.train()
 
# Save Model
fine_tuning.model.save_pretrained(refined_model)
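
For completeness, a minimal sketch of reloading the saved adapter for inference, assuming the same quantized base model and the variables defined above (PeftModel.from_pretrained attaches the adapter saved in refined_model):

# Reload the 4-bit base model and attach the trained LoRA adapter
from peft import PeftModel

inference_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0},
    trust_remote_code=True
)
inference_model = PeftModel.from_pretrained(inference_model, refined_model)
inference_model.eval()
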
DICTA: The Israel Center for Text Analysis org

When I fine-tuned the model using QLoRA, I used the default settings from the QLoRA repository, and it worked quite well. The target_modules were every Linear layer in the model.

Found using this code:

import torch
import bitsandbytes as bnb

def find_all_linear_names(args, model):
    # Pick the Linear class that matches the quantization level
    cls = bnb.nn.Linear4bit if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
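
For reference, a minimal sketch of wiring that helper into the config from the question (assuming the 4-bit base_model loaded above; the args namespace is a stand-in for the qlora repository's argparse object):

import bitsandbytes as bnb
from types import SimpleNamespace
from peft import LoraConfig

args = SimpleNamespace(bits=4)  # stand-in: only the .bits attribute is used
linear_names = find_all_linear_names(args, base_model)

# Target every Linear layer, per the QLoRA defaults
qlora_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=linear_names
)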
