Xenova (HF staff) committed
Commit 7e409e9 · 1 parent: 0abaad4

Modelling fixes

Files changed (1):
  1. modeling_florence2.py +5 -2
modeling_florence2.py CHANGED

@@ -2288,7 +2288,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
 
         image_hidden_states of the model produced by the vision encoder
     """
-
+    loss: Optional[torch.FloatTensor] = None
     last_hidden_state: torch.FloatTensor = None
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@@ -2297,6 +2297,8 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    logits: torch.FloatTensor = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
 
 
 FLORENCE2_START_DOCSTRING = r"""
@@ -2731,7 +2733,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         image_features = self._encode_image(pixel_values)
         inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
 
-        attention_mask = attention_mask.to(inputs_embeds.dtype)
+        if inputs_embeds is not None:
+            attention_mask = attention_mask.to(inputs_embeds.dtype)
         outputs = self.language_model(
             attention_mask=attention_mask,
             labels=labels,
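
For context on the first two hunks: Florence2Seq2SeqLMOutput subclasses transformers' ModelOutput, which behaves as an ordered container that skips unset (None) fields, so declaring loss, logits, and image_hidden_states as dataclass fields is what lets downstream code read outputs.loss and outputs.logits by name. A minimal runnable sketch of that behaviour, using a hypothetical TinyOutput class rather than the real one:

# Sketch of the ModelOutput semantics the new fields rely on.
# TinyOutput is hypothetical, not the Florence2 class itself.
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from transformers.utils import ModelOutput


@dataclass
class TinyOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


out = TinyOutput(logits=torch.randn(1, 4, 10))
print(out.logits.shape)  # torch.Size([1, 4, 10]) -> readable by name
print(out.loss)          # None: unset fields read as None...
print(out.to_tuple())    # ...and are skipped in the tuple view

Note that to_tuple() follows declaration order, so where a field is added in the class body affects positional unpacking of the output.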
 
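
The third hunk makes the dtype cast conditional: inputs_embeds can still be None at this point on calls that skip the image-merge path, and the old unconditional attention_mask.to(inputs_embeds.dtype) would raise AttributeError ('NoneType' object has no attribute 'dtype') there. A small self-contained sketch of the failure mode and the guard; prepare_mask is an illustrative helper, not part of the real forward():

# prepare_mask is a hypothetical stand-in for the changed lines.
from typing import Optional

import torch


def prepare_mask(
    attention_mask: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor],
) -> torch.Tensor:
    # Pre-fix code called attention_mask.to(inputs_embeds.dtype)
    # unconditionally, which crashes when inputs_embeds is None.
    if inputs_embeds is not None:
        # Cast the mask to the embedding dtype (e.g. float16) so later
        # masking arithmetic stays in a single dtype.
        attention_mask = attention_mask.to(inputs_embeds.dtype)
    return attention_mask


mask = torch.ones(1, 8, dtype=torch.long)
embeds = torch.randn(1, 8, 32, dtype=torch.float16)
print(prepare_mask(mask, embeds).dtype)  # torch.float16
print(prepare_mask(mask, None).dtype)    # torch.int64 (mask left as-is)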