supervised fine-tuning error

#96
by AlexMercerXX - opened

I am new to working with Hugging Face models and LLMs in general, so any help would be appreciated.

I am trying to run a supervised fine-tuning experiment with phi-2 on my custom dataset. I have collected data samples of the form {"instruction": ..., "input": ..., "output": ...}; an example record is included below the traceback.
During training I keep hitting the error below and I cannot work out where it is coming from. The model starts training, but it crashes with this error every time after processing 2-3 input sequences.

File "/huggingface/modules/transformers_modules/phi-2/modeling_phi.py", line 158, in _apply_rotary_emb_qkv
q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
~~~^~~
RuntimeError: The size of tensor a (328) must match the size of tensor b (319) at non-singleton dimension 1
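
For context, each record in sft_dataset.json looks roughly like this (the field values here are illustrative placeholders, not my real data):

{"instruction": "Summarize the following paragraph.", "input": "The quick brown fox jumps over the lazy dog. ...", "output": "A fox jumps over a dog."}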

I am attaching my code for supervised fine-tuning:

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from datasets import load_dataset
import peft
from peft import LoraConfig
from trl import SFTTrainer
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "phi-2", quantization_config=bnb_config, trust_remote_code=True,
    torch_dtype=torch.float16, revision="refs/pr/1",
)  # load_in_8bit=True removed: it conflicts with the 4-bit quantization_config
model.config.use_cache = False
print(model)

peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["Wqkv"],  # optionally also "out_proj", "fc1", "fc2"
)

model = peft.get_peft_model(model, peft_config)
model = accelerator.prepare_model(model)

if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

custom_dataset = load_dataset("json",data_files="sft_dataset.json",split='train')

def formatting_prompts_func(examples):
    # Build one Alpaca-style prompt string per example in the batch.
    output_text = []
    for i in range(len(examples["instruction"])):
        instruction = examples["instruction"][i]
        input_text = examples["input"][i]
        response = examples["output"][i]

        if len(input_text) >= 2:  # treat very short inputs as empty
            text = (
                "Below is an instruction that describes a task, paired with an input that provides further context. "
                "Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Input:\n{input_text}\n\n"
                f"### Response:\n{response}\n"
            )
        else:
            text = (
                "Below is an instruction that describes a task, paired with an input that provides further context. "
                "Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Response:\n{response}\n"
            )
        output_text.append(text)
    return output_text

training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=1,
    remove_unused_columns=True,
    gradient_accumulation_steps=4,
    # gradient_checkpointing=True,
    bf16=False,
    fp16=True,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
    learning_rate=2.5e-5,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=custom_dataset,
    packing=False,
    max_seq_length=2048,
    # eval_dataset=custom_dataset,
    # peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("fine_tuned_model")

Another question: is it possible to run this experiment on two 8 GB GPUs? I have been trying to set up another script based on a notebook I found online, but one of the GPUs keeps running out of memory.
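
For reference, this is roughly the kind of device placement I have been experimenting with for the two GPUs, using the same bnb_config as in my script above (the max_memory limits below are my own guesses, not values from the notebook):

model = AutoModelForCausalLM.from_pretrained(
    "phi-2",
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",                  # let accelerate shard the layers across both GPUs
    max_memory={0: "7GiB", 1: "7GiB"},  # leave some headroom on each 8 GB card
)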
