Error with finetuning

#12
by kshetrajna12

@liuhaotian Thanks for this great model!
I am trying to finetune the HF version of LLaVA instead of the original implementation in haotian-liu/LLaVA.

The script roughly looks like this:

```python
from transformers import pipeline
from PIL import Image
import requests
from datasets import load_dataset
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import TrainingArguments, Trainer


PATH_TO_SAVE = "........"

model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
)

processor = AutoProcessor.from_pretrained(model_id)

def preprocess_data(examples):
    images = examples['image']
    texts = ['USER: <image>\n' + x + '\nASSISTANT:' for x in examples['text']]

    outputs = [x for x in examples['answer']]
    encoding = processor(texts, images, padding=True, truncation=True, return_tensors="pt")

    for k, v in encoding.items():
        encoding[k] = v.squeeze()

    targets = [processor.tokenizer.encode(x, add_special_tokens=False) + [processor.tokenizer.eos_token_id] for x in outputs]

    encoding["labels"] = targets
    return encoding

dataset = load_dataset('.....', split='train')
processed_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['image', 'text', 'answer'])


training_args = TrainingArguments(
    output_dir=PATH_TO_SAVE,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_steps=200,
    logging_steps=50,
    learning_rate=5e-5,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=processor.tokenizer,
)

trainer.train()
```

I keep running into this error
```

ValueError Traceback (most recent call last)
Cell In[285], line 1
----> 1 trainer.train()

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
-> 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/transformers/trainer.py:2735, in Trainer.training_step(self, model, inputs)
2732 return loss_mb.reduce_mean().detach().to(self.args.device)
2734 with self.compute_loss_context_manager():
-> 2735 loss = self.compute_loss(model, inputs)
2737 if self.args.n_gpu > 1:
2738 loss = loss.mean() # mean() to average on multi-gpu parallel training

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/transformers/trainer.py:2758, in Trainer.compute_loss(self, model, inputs, return_outputs)
2756 else:
2757 labels = None
-> 2758 outputs = model(**inputs)
2759 # Save past state if it exists
2760 # TODO: this needs to be fixed and made cleaner later.
2761 if self.args.past_index >= 0:

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:405, in LlavaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, vision_feature_select_strategy, labels, use_cache, output_attentions, output_hidden_states, return_dict)
400 raise ValueError(
401 f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
402 )
404 image_features = self.multi_modal_projector(selected_image_feature)
--> 405 inputs_embeds, attention_mask, position_ids = self._merge_input_ids_with_image_features(
406 image_features, inputs_embeds, input_ids, attention_mask, position_ids
407 )
408 if labels is None:
409 labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)

File /opt/homebrew/Caskroom/miniconda/base/envs/llava/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:312, in LlavaForConditionalGeneration._merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, position_ids)
309 image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]
311 if image_to_overwrite.sum() != image_features.shape[:-1].numel():
--> 312 raise ValueError(
313 f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
314 f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
315 )
317 final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim)
318 final_attention_mask |= image_to_overwrite

ValueError: The input provided to the model are wrong. The number of image tokens is 1 while the number of image given to the model is 1. This prevents correct indexing and breaks batch generation.
```

Is anyone having similar issues?

This might be an Apple Silicon specific issue, since I get the same error with the generation script when I move the model and tensors `.to('mps')`.
Running on a Linux machine instead gives me:
```
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

IndexError Traceback (most recent call last)
Cell In[10], line 1
----> 1 trainer.train()

File /opt/conda/envs/llava_hf/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
-> 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )

File /opt/conda/envs/llava_hf/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /opt/conda/envs/llava_hf/lib/python3.10/site-packages/transformers/trainer.py:2735, in Trainer.training_step(self, model, inputs)
2732 return loss_mb.reduce_mean().detach().to(self.args.device)
2734 with self.compute_loss_context_manager():
-> 2735 loss = self.compute_loss(model, inputs)
2737 if self.args.n_gpu > 1:
2738 loss = loss.mean() # mean() to average on multi-gpu parallel training

File /opt/conda/envs/llava_hf/lib/python3.10/site-packages/transformers/trainer.py:2758, in Trainer.compute_loss(self, model, inputs, return_outputs)
2756 else:
2757 labels = None
-> 2758 outputs = model(**inputs)
2759 # Save past state if it exists
2760 # TODO: this needs to be fixed and made cleaner later.
2761 if self.args.past_index >= 0:

File /opt/conda/envs/llava_hf/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/llava_hf/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:452, in LlavaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, vision_feature_select_strategy, labels, use_cache, output_attentions, output_hidden_states, return_dict)
450 shift_attention_mask = attention_mask[..., 1:]
451 shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
--> 452 shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
453 else:
454 shift_logits = logits[..., :-1, :].contiguous()

IndexError: The shape of the mask [1, 700] at index 1 does not match the shape of the indexed tensor [1, 1] at index 1
```
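
For anyone hitting the same IndexError: the mask shape `[1, 700]` vs. the labels shape `[1, 1]` suggests that `labels` ends up with a different sequence length than the (image-expanded) `input_ids`/`attention_mask`. A quick shape check on one processed example makes the mismatch visible; this is just a debugging sketch using the variables defined in the script above:

```python
import torch

# Debugging sketch: dataset.map stores columns as plain Python lists,
# so wrap one example back into tensors with a batch dimension.
example = processed_dataset[0]
input_ids = torch.tensor(example["input_ids"]).unsqueeze(0)
labels = torch.tensor(example["labels"]).unsqueeze(0)

num_image_tokens = (input_ids == model.config.image_token_index).sum().item()
print("input_ids length:", input_ids.shape[-1])
print("labels length:   ", labels.shape[-1])
print("<image> tokens:  ", num_image_tokens)

# labels are expected to line up with input_ids, so any length mismatch
# printed here will surface as the shape errors shown in the traceback.
```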

Llava Hugging Face org

Hi,

This was fixed in https://github.com/huggingface/transformers/pull/28333. For now, you will need to install Transformers from source to use it: `pip install git+https://github.com/huggingface/transformers.git`.

Thanks! Is there an example of how to process the dataset? I still run into shape mismatch errors for the labels, probably due to this method:

```python
from torch.nn.utils.rnn import pad_sequence

def preprocess_data(examples):
    images = examples['image']
    texts = ['USER: <image>\n' + x + '\nASSISTANT:' for x in examples['text']]

    outputs = [x for x in examples['answer']]
    encoding = processor(texts, images, padding=True, truncation=True, return_tensors="pt")

    for k, v in encoding.items():
        encoding[k] = v.squeeze()

    targets = [torch.tensor(processor.tokenizer.encode(x, add_special_tokens=False) + [processor.tokenizer.eos_token_id]) for x in outputs]
    targets = pad_sequence(targets, batch_first=True, padding_value=model.config.ignore_index)

    encoding["labels"] = targets
    return encoding
```
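
For comparison, a pattern that avoids the label/length mismatch (just a sketch of one option, not something confirmed in this thread) is to run the prompt and answer through the processor together and derive `labels` from the resulting `input_ids`, masking the prompt and padding positions with -100 so they are ignored by the loss. The sketch below assumes right-padding; adjust the prompt mask if your tokenizer pads on the left:

```python
import torch

IGNORE_INDEX = -100  # positions with this value are ignored by the loss

def preprocess_data_aligned(examples):
    # Tokenize prompt + answer together so labels can be derived from the
    # exact same input_ids (and therefore always have the same padded length).
    prompts = ['USER: <image>\n' + q + '\nASSISTANT:' for q in examples['text']]
    full_texts = [p + ' ' + a for p, a in zip(prompts, examples['answer'])]

    encoding = processor(full_texts, examples['image'],
                         padding=True, truncation=True, return_tensors="pt")

    labels = encoding["input_ids"].clone()
    # Mask the prompt so only the answer tokens contribute to the loss
    # (assumes right-padding, i.e. the prompt starts at position 0; the
    # prompt/answer boundary is approximate because of tokenization merges).
    for i, prompt in enumerate(prompts):
        prompt_len = len(processor.tokenizer(prompt).input_ids)
        labels[i, :prompt_len] = IGNORE_INDEX
    # Mask padding positions as well.
    labels[encoding["attention_mask"] == 0] = IGNORE_INDEX

    encoding["labels"] = labels
    return encoding
```

Whether to mask the prompt at all is a modelling choice; the part that matters for the errors above is that `labels` and `input_ids` end up with the same padded length.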

I am able to get an output when I use

```python
processed_dataset = dataset.map(preprocess_data, batched=True, remove_columns=['image', 'text', 'answer'])
examples = processed_dataset[:2]
model.generate(**inputs, max_new_tokens=200, do_sample=False)
```

But it still fails when I use

```python
training_args = TrainingArguments(
    output_dir=PATH_TO_SAVE,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_steps=200,
    logging_steps=50,
    learning_rate=5e-5,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=processor.tokenizer,
)

trainer.train()
```

I am launching the script with `accelerate`.

@nielsr Even with the latest code from git I run into an issue, and I have more information on it now.
It looks like there is a difference in how the same input is handled by the `generate` call vs. the `forward` call:
`model.generate(**inputs)` works without any issue, but
`model.forward(**inputs)` throws this error:
```

RuntimeError Traceback (most recent call last)
Cell In[7], line 1
----> 1 model.forward(**i)

File /opt/conda/envs/llava_finetune/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:431, in LlavaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, vision_feature_select_strategy, labels, use_cache, output_attentions, output_hidden_states, return_dict)
426 raise ValueError(
427 f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
428 )
430 image_features = self.multi_modal_projector(selected_image_feature)
--> 431 inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
432 image_features, inputs_embeds, input_ids, attention_mask, labels
433 )
434 if labels is None:
435 labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)

File /opt/conda/envs/llava_finetune/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:333, in LlavaForConditionalGeneration._merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels)
330 image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
331 image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
--> 333 if image_to_overwrite.sum() != image_features.shape[:-1].numel():
334 raise ValueError(
335 f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
336 f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
337 )
339 final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
```
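
For anyone debugging the same assert: the CUDA error is reported asynchronously, so one way to see the underlying problem is to rerun the failing batch on CPU and double check the image-token bookkeeping. A rough sketch, assuming `inputs` is the batch that fails:

```python
import torch

# Reproduce on CPU, where indexing problems raise ordinary Python errors
# instead of asynchronous device-side asserts.
cpu_model = model.to("cpu")
cpu_inputs = {k: (v.to("cpu") if isinstance(v, torch.Tensor) else v)
              for k, v in inputs.items()}

# Each <image> placeholder in input_ids should correspond to exactly one
# image in pixel_values (shape [num_images, 3, H, W]).
num_image_tokens = (cpu_inputs["input_ids"] == cpu_model.config.image_token_index).sum().item()
num_images = cpu_inputs["pixel_values"].shape[0]
print("image tokens:", num_image_tokens, "| images:", num_images)

outputs = cpu_model(**cpu_inputs)  # should now fail (or succeed) with a readable message
```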

Hi Kshetrajna,
Were you able to solve the problem and run the forward pass without issues? I have the same problem...
