OWL-ViT fine-tuning code for a custom dataset in Hugging Face

#10, opened by solomonpm

Hi team,

When I try to fine-tune the OWL-ViT base-32 model on the custom cppe-5 dataset, I receive the following error when trainer.train() is called.

######################################################################################################################


ValueError Traceback (most recent call last)
Cell In[40], line 11
1 from transformers import Trainer
3 trainer = Trainer(
4 model=lora_model,
5 args=training_args,
(...)
8 tokenizer=processor,
9 )
---> 11 trainer.train()

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
-> 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:2735, in Trainer.training_step(self, model, inputs)
2732 return loss_mb.reduce_mean().detach().to(self.args.device)
2734 with self.compute_loss_context_manager():
-> 2735 loss = self.compute_loss(model, inputs)
2737 if self.args.n_gpu > 1:
2738 loss = loss.mean() # mean() to average on multi-gpu parallel training

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:2776, in Trainer.compute_loss(self, model, inputs, return_outputs)
2774 else:
2775 if isinstance(outputs, dict) and "loss" not in outputs:
-> 2776 raise ValueError(
2777 "The model did not return a loss from the inputs, only the following keys: "
2778 f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
2779 )
2780 # We don't use .loss here since the model may return tuples instead of ModelOutput.
2781 loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

ValueError: The model did not return a loss from the inputs, only the following keys: logits,pred_boxes,text_embeds,image_embeds,class_embeds,text_model_output,vision_model_output. For reference, the inputs it received are input_ids,attention_mask,pixel_values

#####################################################################################################################
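Just to confirm the behaviour outside of the Trainer: a plain forward pass also returns only detection outputs and no loss entry. A minimal sketch based on the model card example (not my actual training setup):

import requests
import torch
from PIL import Image
from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

# Example image and text queries, just to inspect the output keys.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]

inputs = processor(text=texts, images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(list(outputs.keys()))  # logits, pred_boxes, text_embeds, image_embeds, ...
print("loss" in outputs)     # False -- no loss is computed by the forward pass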

collate_fn() definition
#######################################################################################################################
import torch  # already imported earlier in the notebook; `device` is also defined there

def collate_fn(batch):
    # Batch the per-example tensors produced by the processor.
    # Note: only input_ids, attention_mask, and pixel_values are collated;
    # no labels / target boxes are included in the batch.
    input_ids = torch.Tensor([item["input_ids"].tolist() for item in batch]).int()
    input_ids = input_ids.to(device)
    attention_mask = torch.Tensor([item["attention_mask"].tolist() for item in batch]).int()
    attention_mask = attention_mask.to(device)
    pixel_values = torch.Tensor([item["pixel_values"].tolist() for item in batch])
    pixel_values = pixel_values.to(device)

    batch = {}
    batch["input_ids"] = input_ids
    batch["attention_mask"] = attention_mask
    batch["pixel_values"] = pixel_values
    print(batch)  # debug print
    return batch

####################################################################################################################
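To sanity-check the collate_fn on its own, I run it through a DataLoader like this (a sketch; `encoded_train` is a placeholder name for my preprocessed training split):

from torch.utils.data import DataLoader

loader = DataLoader(encoded_train, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print({k: v.shape for k, v in batch.items()})  # only input_ids, attention_mask, pixel_values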

I am using the cppe-5 dataset from the Hugging Face Hub for custom training and testing.
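It is loaded with the datasets library; a minimal sketch of what I am working with:

from datasets import load_dataset

# CPPE-5 from the Hugging Face Hub; each example has an image plus an
# "objects" dict with bounding boxes and category labels.
cppe5 = load_dataset("cppe-5")
print(cppe5)
print(cppe5["train"][0]["objects"])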

Please let me know your feedback and suggestions.
