Multi-gpu fine-tuning

#30

by matbee - opened Apr 24, 2024

Discussion

matbee

Apr 24, 2024

Is it possible to utilize FSDP / Deepspeed for finetuning this?

VictorSanh

Apr 24, 2024

Hi @matbee , yes it is totally possible.
One low-barrier entry point to do so is through the HF trainer: https://huggingface.co/docs/transformers/main/en/trainer
you can for instance adapt https://colab.research.google.com/drive/1rm3AGquGEYXfeeizE40bbDtcWh5S4Nlq?authuser=1#scrollTo=nlEpIG4UBmoH which provides some code to fine-tune on a single GPU. the main work would be to adapt the accelerate config.
let me know if you need reviews/help!

matbee

Apr 24, 2024

•

edited Apr 24, 2024

Here's what I've got so far. It IS training, unsure if it's absolutely correct though. I had to set 'mixed_precision' to no in the config.

I dont think its properly sharding it, as when training via 1 gpu, it uses the same amount of VRAM on each.

import torch
import random
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoProcessor, AutoTokenizer, BitsAndBytesConfig, Idefics2ForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset
from accelerate import PartialState

DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=False
)

# Three options for training, from the lowest precision training to the highest precision training:
# - QLora
# - Standard Lora
# - Full fine-tuning
IDEFICS2_CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""

if USE_QLORA or USE_LORA:
    peft_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        init_lora_weights="gaussian"
    )
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_storage=torch.bfloat16,
        )
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceM4/idefics2-8b", use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.chat_template = IDEFICS2_CHAT_TEMPLATE

    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config if USE_QLORA else None,
        attn_implementation="flash_attention_2",
        device_map={"": PartialState().process_index}
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    if USE_LORA:
        model = prepare_model_for_kbit_training(model)
    # model.add_adapter(peft_config)
    # model.enable_adapters()
else:
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2", # Only available on A100 or H100
    ).to(DEVICE)

### Load Dataset

train_dataset = load_dataset("nielsr/docvqa_1200_examples", split="train")
train_dataset = train_dataset.remove_columns(['id', 'words', 'bounding_boxes', 'answer'])

eval_dataset = load_dataset("nielsr/docvqa_1200_examples", split="test")
eval_dataset = eval_dataset.remove_columns(['id', 'words', 'bounding_boxes', 'answer'])

#### Dataset Formatting

class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            image = example["image"]
            question = example["query"]["en"]
            answer = random.choice(example["answers"])
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "image"},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer}
                    ]
                }
            ]
            text = processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image])

        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch

data_collator = MyDataCollator(processor)

### Training

training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=25,
    output_dir="./docvqa_ft_tutorial",
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    # evaluation_strategy="epoch",
    remove_unused_columns=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

compute_environment: LOCAL_MACHINE                                                                                                                                           
debug: false                                                                                                                                                                 
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: true
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: false
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

CUDA_VISIBLE_DEVICES=0,1 accelerate launch --multi_gpu --config_file fsdp_config_qlora.yaml idefics2.py --train_type qlora --use_flash_attn true --use_peft_lora True --use_reentrant True --use_4bit_quantization True --bf16 true

matbee

Apr 25, 2024

Yeah I think this might be beyond my current abilities

VictorSanh

Apr 25, 2024

i'll allocate some time to dig in, looking at the config and the script, you should be pretty close I think

matbee

Apr 25, 2024

I believe one source of problems is that I need to parse the arguments with hf_argparser and pass those values into Trainer({args:...}).
The current state DOES seem to work with DDP, but not FSDP. It's likely some combination of arguments I'm using. It's very close though.

VictorSanh

Apr 25, 2024

if it is of any help in the meantime, here's the config I used to train on multiple gpus with deepspeed (not fsdp)
I don't think it matters but I passed all the parameters inside the TrainingArgs.

compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  deepspeed_config_file: deepspeed_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: $MASTER_ADDR
main_process_port: $MASTER_PORT
main_training_function: main
num_machines: 1
num_processes: $NUM_GPUS
use_cpu: false

and the content of deepspeed_config.json

{
    "communication_data_type": "fp32",
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "none"
        },
        "offload_optimizer": {
            "device": "none"
        }
    },
    "fp16": {
        "enabled": false
    },
    "bf16": {
        "enabled": true
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto"
}

the training script:

import torch
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
import safetensors

DEVICE = "cuda:0"
USE_4_BIT = False
RESUME_FROM_CHECKPOINT = False

processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=True,
)
if USE_4_BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )
else:
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
    )#.to(DEVICE)

##
from peft import LoraConfig
from peft import get_peft_model

lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    lora_dropout=0.1,
    target_modules='all-linear',
    use_dora=True,
    init_lora_weights="gaussian"
)

model = get_peft_model(model, lora_config)


##
from datasets import load_dataset, disable_caching
disable_caching()

train_dataset = load_dataset("HuggingFaceM4/DocumentVQA", split="train") # TO CHANGE with nielsr/docvqa_1200_examples_donut
train_dataset = train_dataset.remove_columns(['questionId', 'question_types', 'docId', 'ucsf_document_id', 'ucsf_document_page_no'])
eval_dataset = load_dataset("HuggingFaceM4/DocumentVQA", split="validation") # TO CHANGE with nielsr/docvqa_1200_examples_donut
eval_dataset = eval_dataset.remove_columns(['questionId', 'question_types', 'docId', 'ucsf_document_id', 'ucsf_document_page_no'])

##
import random

class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            image = example["image"]
            if image is None:
                continue
            question = example["question"]
            answer = random.choice(example["answers"])
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "image"},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer}
                    ]
                }
            ]
            text = processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image])

        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch

data_collator = MyDataCollator(processor)

##
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    warmup_steps=100,
    learning_rate=5e-5,
    weight_decay=0.1,
    logging_steps=10,
    output_dir="./docvqa_ft_tutorial",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    bf16=True,
    push_to_hub_model_id="test-victor",
    remove_unused_columns=False,
    report_to="none",
    deepspeed="deepspeed_config.json",
    save_safetensors=False,
    neftune_noise_alpha=5.0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

trainer.push_to_hub()

the launch command:

accelerate launch \
    --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
    --config_file $ACCELERATE_CONFIG_FILE \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank \$SLURM_PROCID \
    --role \$(hostname -s): --tee 3 \
    docvqa_ft.py

matbee

Apr 26, 2024

•

edited Apr 26, 2024

Alright I believe I've been able to take what you gave and get a working Zero3 deepspeed fine-tune. Would definitely need a little bit of lovin' to make it universal/shippable- but it seems to work!

https://gist.github.com/matbee-eth/466ec56c9fc82a15ac7ea0a1ba5df29c

VictorSanh

Apr 26, 2024

let's go! i am glad this unblocked you!

joaomsimoes

Apr 26, 2024

By the way, how can I train it with AWS Sagemaker? Do I need to do any changes in the code?

matbee

Apr 26, 2024

By the way, how can I train it with AWS Sagemaker? Do I need to do any changes in the code?

Ive never used it, but a quick peruse looks like it should work. It doesn't use accelerate+deepspeed config, it uses its own accelerate+ sagemaker config. Atleast from my inexperienced view, it should be worth a shot.

as8311

Apr 29, 2024

i'm trying to finetune on a multi-GPU node with 2 NVIDIA A100 GPUs, using the training script provided by Victor.

getting a runtime error indicating a mismatch in device allocation between cuda:1 and cuda:0.

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!

 [WARNING]  using untested triton version (2.3.0), only 1.0.0 is known to be compatible
[2024-04-29 01:59:03,406] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-04-29 01:59:03,406] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-04-29 01:59:03,485] [INFO] [comm.py:637:init_distributed] cdb=None
Parameter Offload: Total persistent parameters: 30988016 in 1060 params
  0%|                                                                                                                                                  | 0/40000 [00:00<?, ?it/s]
No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

[rank1]: Traceback (most recent call last):
[rank1]:   File "/home/idefics2/deepspeed/idefics2_deepspeed_script.py", line 180, in <module>
[rank1]:     trainer.train()
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/transformers/trainer.py", line 1859, in train
[rank1]:     return inner_training_loop(
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/transformers/trainer.py", line 2203, in _inner_training_loop
[rank1]:     tr_loss_step = self.training_step(model, inputs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/transformers/trainer.py", line 3138, in training_step
[rank1]:     loss = self.compute_loss(model, inputs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/transformers/trainer.py", line 3161, in compute_loss
[rank1]:     outputs = model(**inputs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank1]:     return forward_call(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
[rank1]:     ret_val = func(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1855, in forward
[rank1]:     loss = self.module(*inputs, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1582, in _call_impl
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/peft/peft_model.py", line 563, in forward
[rank1]:     return self.get_base_model()(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1582, in _call_impl
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
[rank1]:     output = module._old_forward(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/transformers/models/idefics2/modeling_idefics2.py", line 1823, in forward
[rank1]:     outputs = self.model(
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1582, in _call_impl
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
[rank1]:     output = module._old_forward(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/transformers/models/idefics2/modeling_idefics2.py", line 1602, in forward
[rank1]:     inputs_embeds = self.text_model.get_input_embeddings()(input_ids)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1582, in _call_impl
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
[rank1]:     output = module._old_forward(*args, **kwargs)
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/modules/sparse.py", line 163, in forward
[rank1]:     return F.embedding(
[rank1]:   File "/home/idefics2/idefics2_venv/lib/python3.10/site-packages/torch/nn/functional.py", line 2264, in embedding
[rank1]:     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
[rank1]: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
W0429 01:59:14.322000 140198700533568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 874502 closing signal SIGTERM
E0429 01:59:14.536000 140198700533568 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 874503) of binary: /home/idefics2/idefics2_venv/bin/python3

default_config.yaml

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_config_file: /home/idefics2/deepspeed/deepspeed_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

deepspeed_config.json

{
    "communication_data_type": "fp32",
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "none"
        },
        "offload_optimizer": {
            "device": "none"
        }
    },
    "fp16": {
        "enabled": "auto"
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto"
}

VictorSanh

Apr 29, 2024

hi @as8311
i usually encountered such error when I was handling the model device placement or input device placement myself instead of handing it off to the trainer (and accelerate in the backend).
any chance you are doing something similar?
btw, I would recommend in the ds config to not set fp16 AND bf16 to auto, but rather to true/false depending on which mixed precision you are using. i don't think it's related to this but it is less error prone

as8311

Apr 30, 2024

hey @VictorSanh
thanks for getting back to me. i'll update the ds config accordingly. i went through my code and didn't find any explicit handling for model or input device placement. here's the code for reference -

import safetensors
import torch
import random
import pandas as pd
from peft import LoraConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
from datasets import Dataset
from PIL import Image
from transformers.image_utils import load_image
from peft import get_peft_model

RESUME_FROM_CHECKPOINT = False

processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=False,
)

USE_LORA = True
USE_QLORA = True

if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules = 'all-linear',
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        quantization_config=bnb_config if USE_QLORA else None,
        torch_dtype=torch.bfloat16, 
    )
else:
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.bfloat16,
    )

model = get_peft_model(model, lora_config)


##
from datasets import load_dataset, disable_caching
disable_caching()

train = pd.read_csv("/home/idefics2/data/caption/train.csv", encoding_errors='ignore',lineterminator='\n',
                    on_bad_lines='skip')
train_dataset = Dataset.from_pandas(train)

##
import random

class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            image = load_image(example["main_image_url"])
            prompt = example["prompt"]
            answer = example["attribute_value_pair"]
            messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image"},
                        ]
                    },
                    {
                        "role": "assistant",
                        "content": [
                            {"type": "text", "text": answer}
                        ]
                    }
            ]
            text = processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image if image else None])

        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch

data_collator = MyDataCollator(processor)

##
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.1,
    logging_steps=25,
    output_dir="caption/",
    save_strategy="steps",
    save_steps=4000,
    save_total_limit=8,
    bf16=True,
    remove_unused_columns=False,
    report_to="tensorboard",
    deepspeed="deepspeed_config.json",
    save_safetensors=False,
    neftune_noise_alpha=5.0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

schwarzwalder

May 15, 2024

•

edited May 15, 2024

@VictorSanh @matbee I followed your examples above to train on multiple gpus with deepspeed. I get the following error and unable to resolve it:

 File "/home/user/envs/vlm/lib/python3.8/site-packages/transformers/models/mistral/modeling_mistral.py", line 677, in forward
     raise ValueError(
 ValueError: Attention mask should be of size (2, 1, 96, 192), but is torch.Size([2, 1, 96, 96])

However, when I set the per_device_train_batch_size=1 the code runs successfully (previously per_device_train_batch_size=2).

Here is my environment details:

torch==2.3.0
torchvision==0.18.0
transformers==4.40.2
tokenizers==0.19.1
deepspeed==0.14.2
accelerate==0.30.1
peft==0.10.0

schwarzwalder

May 22, 2024

•

edited May 22, 2024

@VictorSanh @matbee I am still unable to fix this issue. Can you please share the versions of the above listed packages and nvidia-cuda-* packages of your python environment ?

Leyo

May 27, 2024

•

edited May 27, 2024

I think a quick fix with your version of transformers would be:
model.model.text_model._attn_implementation = "flash_attention_2"

There has been 2 fixes recently: https://github.com/huggingface/transformers/pull/30320 and https://github.com/huggingface/transformers/pull/30507 I think either of those changes should help with your issue. One fixes the past_key_values that lead to the having the wrong shapes in the standard attention implementation and the other helps to pass pass the _attn_implementation correctly for the text_model

schwarzwalder

Jun 2, 2024

@Leyo Thanks, that fixed the error.

ben81828

Jun 12, 2024

I followed @matbee code for training, and everything seemed to be working fine with the loss decreasing steadily. However, I found that all the LoRA B values ended up being zero. Has anyone else experienced this issue?

Leyo

Jun 12, 2024

Is it really all the LoRA B or just those of the vision model LoRAs? Also, are you using gradient checkpointing?
It may be linked to this: https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032

It would still be a surprise if that's the case because Idefics2 specifically adds hooks on both embeddings to make sure not to run in this issue. That's the only one I can think of from the information given though.

ben81828

Jun 13, 2024

@Leyo , thank you very much for your reply.
From my observation, it's not only the Lora B matrix of the vision_model that is zero (as shown in the image below).

I used the following scripts on Colab with 4 T4 GPUs.

code:

import torch
import safetensors
import random
import sys
import os
from transformers import HfArgumentParser, TrainingArguments, AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration, TrainingArguments, Trainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset, disable_caching
from dataclasses import dataclass, field
from typing import Optional

# DEVICE = "cuda:0"
USE_4_BIT = True
RESUME_FROM_CHECKPOINT = False

# Define and parse arguments.
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: Optional[str] = field(
        default="HuggingFaceM4/idefics2-8b",
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    chat_template_format: Optional[str] = field(
        default="none",
        metadata={
            "help": "chatml|zephyr|none. Pass `none` if the dataset is already formatted with the chat template."
        },
    )
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)
    lora_target_modules: Optional[str] = field(
        default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
        metadata={
            "help": "comma separated list of target modules to apply LoRA layers to"
        },
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="bfloat16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_storage_dtype: Optional[str] = field(
        default="uint8",
        metadata={"help": "Quantization storage dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )
    use_flash_attn: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables Flash attention for training."},
    )
    use_peft_lora: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables PEFT LoRA for training."},
    )
    use_8bit_qunatization: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables loading model in 8bit."},
    )
    use_4bit_quantization: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables loading model in 4bit."},
    )
    use_reentrant: Optional[bool] = field(
        default=False,
        metadata={"help": "Gradient Checkpointing param. Refer the related docs"},
    )
    use_unsloth: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables UnSloth for training."},
    )
    use_loftq: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables LoftQ init for the LoRA adapters when using QLoRA."},
    )
    use_loftq_callback: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. Provides better init."
        },
    )
    moe_layer_name: Optional[str] = field(
        default=None,
        metadata={"help": "MOE layer name"},
    )

@dataclass
class DataTrainingArguments:
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The preference dataset to use."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    dataset_text_field: str = field(
        default="text", metadata={"help": "Dataset field to use as input text."}
    )
    max_seq_length: Optional[int] = field(default=512)
    append_concat_token: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If True, appends `eos_token_id` at the end of each sample being packed."
        },
    )
    add_special_tokens: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If True, tokenizers adds special tokens to each sample being packed."
        },
    )
    splits: Optional[str] = field(
        default="train,test",
        metadata={"help": "Comma separate list of the splits to use from the dataset."},
    )


class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token_id = self.processor.tokenizer.additional_special_tokens_ids[
            self.processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            image = example["image"]
            if image is None:
                continue
            question = example["query"]["en"]
            answer = random.choice(example["answers"])
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "image"},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer}
                    ]
                }
            ]
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            print("texts", texts)
            images.append([image])

        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch

def main(model_args, data_args, training_args):

    processor = AutoProcessor.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        do_image_splitting=False,
        # do_image_splitting=True,
    )
    if USE_4_BIT:
        compute_dtype = getattr(torch, "bfloat16")
        quant_storage_stype = getattr(torch, "bfloat16")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_storage=torch.bfloat16,
            llm_int8_skip_modules=["lm_head", "embed_tokens"],
        )
        model = Idefics2ForConditionalGeneration.from_pretrained(
            "HuggingFaceM4/idefics2-8b",
            torch_dtype = getattr(torch, "bfloat16"),
            quantization_config=bnb_config,
            low_cpu_mem_usage=True,
            # attn_implementation="flash_attention_2",
            use_cache=False,
        )
        # model.gradient_checkpointing_enable()
        # model = prepare_model_for_kbit_training(model)
    else:
        model = Idefics2ForConditionalGeneration.from_pretrained(
            "HuggingFaceM4/idefics2-8b",
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            low_cpu_mem_usage=True
        )#.to(DEVICE)

    ##

    lora_config = LoraConfig(
        r=4,
        lora_alpha=4,
        lora_dropout=0.1,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj"],
        task_type="CAUSAL_LM",
        use_dora=False
    )

    model = get_peft_model(model, lora_config)
    ##
    # disable_caching()
    train_dataset = load_dataset("nielsr/docvqa_1200_examples_donut", split="train") # TO CHANGE with nielsr/docvqa_1200_examples_donut
    # train_dataset = train_dataset.remove_columns(['questionId', 'question_types', 'docId', 'ucsf_document_id', 'ucsf_document_page_no'])
    eval_dataset = load_dataset("nielsr/docvqa_1200_examples_donut", split="test") # TO CHANGE with nielsr/docvqa_1200_examples_donut
    # eval_dataset = eval_dataset.remove_columns(['questionId', 'question_types', 'docId', 'ucsf_document_id', 'ucsf_document_page_no'])

    ##
    import random

    data_collator = MyDataCollator(processor)

    ##

    training_args = TrainingArguments(
        num_train_epochs=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        warmup_steps=100,
        learning_rate=5e-5,
        weight_decay=0.1,
        logging_steps=10,
        output_dir="./docvqa_ft_tutorial",
        save_strategy="steps",
        save_steps=10,
        save_total_limit=3,
        bf16=True,
        remove_unused_columns=False,
        report_to="none",
        deepspeed="zero_stage3_config.json",
        save_safetensors=False,
        neftune_noise_alpha=5.0,
        per_device_train_batch_size=1,
        gradient_checkpointing_kwargs = {"use_reentrant": True}
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

    # trainer.push_to_hub()

if __name__ == "__main__":
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    main(model_args, data_args, training_args)

zero_stage3_config.json

{
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "none"
        },
        "offload_optimizer": {
            "device": "none"
        }
    },
    "fp16": {
        "enabled": "auto"
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto"
}

launch.sh

!CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --standalone --nnodes=1 --nproc_per_node=2 idefics-8b-deepspeed.py --output_dir "./output"

ben81828

Jun 19, 2024

Hi, @Leyo . Sorry to bother you again.
I still haven't found a way to fix this. I'm wondering if you have any thoughts on this? Could you please provide some guidance?

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

Your need to confirm your account before you can post a new comment.

· Sign up or log in to comment