babylm committed
Commit 4d7c080
1 parent: c6f1442

add support for sequence classification

Files changed (3)
  1. config.json +44 -42
  2. configuration_flamingo.py +1 -1
  3. modeling_flamingo.py +128 -26
config.json CHANGED
@@ -1,43 +1,45 @@
 {
-  "_name_or_path": "facebook/opt-125m",
-  "_remove_final_layer_norm": false,
-  "activation_dropout": 0.0,
-  "activation_function": "relu",
-  "architectures": [
-    "FlamingoForCausalLM"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_flamingo.FlamingoConfig",
-    "AutoModelForCausalLM": "modeling_flamingo.FlamingoForCausalLM"
-  },
-  "attention_dropout": 0.0,
-  "bos_token_id": 2,
-  "cross_attn_every": 2,
-  "do_layer_norm_before": true,
-  "dropout": 0.1,
-  "enable_bias": true,
-  "eos_token_id": 2,
-  "ffn_dim": 3072,
-  "finetune_LM": true,
-  "hidden_size": 768,
-  "id_perceiver": false,
-  "init_std": 0.02,
-  "inp_dim": 768,
-  "layer_norm_elementwise_affine": true,
-  "layerdrop": 0.0,
-  "max_position_embeddings": 2048,
-  "media_token_id": 32768,
-  "model_type": "opt",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "only_attend_immediate_media": true,
-  "pad_token_id": 1,
-  "perceiver_depth": 2,
-  "perceiver_num_latents": 64,
-  "prefix": "</s>",
-  "torch_dtype": "float32",
-  "transformers_version": "4.29.0",
-  "use_cache": true,
-  "vocab_size": 32778,
-  "word_embed_proj_dim": 768
-}
+  "_name_or_path": "facebook/opt-125m",
+  "_remove_final_layer_norm": false,
+  "activation_dropout": 0.0,
+  "activation_function": "relu",
+  "architectures": [
+    "FlamingoForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_flamingo.FlamingoConfig",
+    "AutoModelForCausalLM": "modeling_flamingo.FlamingoForCausalLM",
+    "AutoModelForSequenceClassification": "modeling_flamingo.FlamingoForSequenceClassification"
+  },
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "cross_attn_every": 2,
+  "do_layer_norm_before": true,
+  "dropout": 0.1,
+  "enable_bias": true,
+  "eos_token_id": 2,
+  "ffn_dim": 3072,
+  "finetune_LM": true,
+  "hidden_size": 768,
+  "id_perceiver": false,
+  "init_std": 0.02,
+  "inp_dim": 768,
+  "layer_norm_elementwise_affine": true,
+  "layerdrop": 0.0,
+  "max_position_embeddings": 2048,
+  "media_token_id": 32768,
+  "model_type": "opt",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "only_attend_immediate_media": true,
+  "pad_token_id": 1,
+  "perceiver_depth": 2,
+  "perceiver_num_latents": 64,
+  "prefix": "</s>",
+  "torch_dtype": "float32",
+  "transformers_version": "4.29.0",
+  "use_cache": true,
+  "vocab_size": 32778,
+  "word_embed_proj_dim": 768
+}
+
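With the `AutoModelForSequenceClassification` entry added to `auto_map`, the checkpoint can now be loaded through the Auto API. A minimal usage sketch, assuming the files are hosted in a Hub repo; the repo id below is a placeholder, and `num_labels=2` is an illustrative choice rather than something this commit sets:

```python
from transformers import AutoConfig, AutoModelForSequenceClassification

repo_id = "<user>/<flamingo-checkpoint>"  # placeholder for wherever these files live

# trust_remote_code makes transformers import configuration_flamingo.py and
# modeling_flamingo.py from the repo instead of a built-in architecture.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
config.num_labels = 2  # illustrative: a binary classification task

model = AutoModelForSequenceClassification.from_pretrained(
    repo_id, config=config, trust_remote_code=True
)
```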
configuration_flamingo.py CHANGED
@@ -32,4 +32,4 @@ class FlamingoConfig(configuration_opt.OPTConfig, dict):
             self, vocab_size=vocab_size, **kwargs)
         self.media_token_id = media_token_id
         self.cross_attn_every = cross_attn_every
-        dict.__init__(self, **self.__dict__)
+        dict.__init__(self, **self.__dict__)
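For context on the line above: `FlamingoConfig` inherits from both `OPTConfig` and `dict`, and `dict.__init__(self, **self.__dict__)` copies the attributes accumulated so far into the mapping side, so the config answers to both attribute and key access. A toy sketch of the pattern, not the repo's code:

```python
# Toy illustration of the config-that-is-also-a-dict pattern.
class DualAccessConfig(dict):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)          # attribute access: cfg.hidden_size
        dict.__init__(self, **self.__dict__)  # key access: cfg["hidden_size"]

cfg = DualAccessConfig(hidden_size=768, cross_attn_every=2)
assert cfg.hidden_size == cfg["hidden_size"] == 768
```

One caveat of the pattern: attributes assigned after `__init__` are not mirrored into the dict view, so the two access paths can drift apart.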
modeling_flamingo.py CHANGED
@@ -7,9 +7,9 @@ import os
 import torch
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss, MSELoss
 
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 import transformers.models.opt.modeling_opt as modeling_opt
 from transformers.models.opt.modeling_opt\
     import OPTDecoderLayer, OPTPreTrainedModel, OPTConfig
@@ -46,7 +46,6 @@ class OPTLearnedPositionalEmbedding(nn.Embedding):
 class OPTDecoder(modeling_opt.OPTDecoder):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]
-
     Args:
         config: OPTConfig
         embed_tokens (nn.Embedding): output embedding
@@ -136,35 +135,26 @@ class OPTDecoder(modeling_opt.OPTDecoder):
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                 provide it.
-
                 Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                 [`PreTrainedTokenizer.__call__`] for details.
-
                 [What are input IDs?](../glossary#input-ids)
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
                 - 1 for tokens that are **not masked**,
                 - 0 for tokens that are **masked**.
-
                 [What are attention masks?](../glossary#attention-mask)
             head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                 Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
                 - 1 indicates the head is **not masked**,
                 - 0 indicates the head is **masked**.
-
             past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                 Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                 shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-
                 Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                 cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
                 If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                 that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                 all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
@@ -405,33 +395,25 @@ class FlamingoForCausalLM(modeling_opt.OPTForCausalLM):
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                 provide it.
-
                 Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                 [`PreTrainedTokenizer.__call__`] for details.
-
                 [What are input IDs?](../glossary#input-ids)
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
                 - 1 for tokens that are **not masked**,
                 - 0 for tokens that are **masked**.
-
                 [What are attention masks?](../glossary#attention-mask)
             head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                 Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
                 - 1 indicates the head is **not masked**,
                 - 0 indicates the head is **masked**.
-
             past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                 Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                 shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                 shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
                 tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
-
                 Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                 cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
                 If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                 that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                 all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
@@ -454,20 +436,14 @@ class FlamingoForCausalLM(modeling_opt.OPTForCausalLM):
                 for more detail.
             return_dict (`bool`, *optional*):
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import GPT2Tokenizer, OPTForCausalLM
-
         >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
         >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
-
         >>> prompt = "Hey, are you consciours? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -514,3 +490,129 @@ class FlamingoForCausalLM(modeling_opt.OPTForCausalLM):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+
+class FlamingoForSequenceClassification(OPTPreTrainedModel):
+    _keys_to_ignore_on_load_missing = [
+        r"score.weight",
+    ]
+
+    def __init__(self, config: OPTConfig):
+        OPTPreTrainedModel.__init__(self, config)
+        config = setup_default_flamingo_configs(config)
+        self.num_labels = config.num_labels
+        self.model = OPTModel(config)
+
+        # the lm_head weight is automatically tied to the embed tokens weight
+        self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+        self.model.decoder.img_encoder = None
+        self.loss_fct = CrossEntropyLoss()
+        dino_model = ViTModel.from_pretrained("facebook/dino-vitb16")
+        self.setup_vis_encoder(dino_model)
+
+    def setup_vis_encoder(self, img_encoder):
+        self.model.decoder.img_encoder = img_encoder
+        freeze_all_layers_(img_encoder)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        *args, **kwargs) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            *args, **kwargs)
+
+        hidden_states = outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1
+                # logger.warning(
+                #     f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                #     "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                # )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+
+        if not return_dict:
+            output = (pooled_logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
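The new head follows the upstream `OPTForSequenceClassification` recipe: score every position, then pool the logits at each sequence's last non-padding token, located with the pad-token `argmax`-and-modulo trick commented in the code. A self-contained sketch of just that indexing, with made-up values:

```python
import torch

pad_token_id = 1  # matches "pad_token_id": 1 in config.json
input_ids = torch.tensor([
    [5, 6, 7, 1, 1],   # three real tokens, then padding
    [5, 6, 7, 8, 9],   # no padding at all
])
logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)

# argmax finds the first pad position; subtracting 1 gives the last real token.
# With no pad token, argmax returns 0, and (0 - 1) % 5 == 4 selects the final position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
pooled_logits = logits[torch.arange(2), sequence_lengths]  # shape (2, num_labels)
```

The modulo keeps the indexing free of negative values, which is why the original comment mentions ONNX compatibility.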