ydshieh committed
Commit 7f266a7
1 Parent(s): 06bcf58
revert to original flax gpt2
- vit_gpt2/modeling_flax_gpt2.py +36 -185

vit_gpt2/modeling_flax_gpt2.py
CHANGED
@@ -23,11 +23,11 @@ from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from …
-from …
-from …
-from …
-from …
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPast, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import logging
+from .configuration_gpt2 import GPT2Config
 
 
 logger = logging.get_logger(__name__)
@@ -117,8 +117,6 @@ class FlaxConv1D(nn.Module):
 class FlaxGPT2Attention(nn.Module):
     config: GPT2Config
     dtype: jnp.dtype = jnp.float32
-    causal: bool = True
-    self_attn: bool = True
 
     def setup(self):
         config = self.config
@@ -126,18 +124,10 @@ class FlaxGPT2Attention(nn.Module):
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
 
-
-        self.c_attn = FlaxConv1D(features=factor * self.embed_dim, dtype=self.dtype)
+        self.c_attn = FlaxConv1D(features=3 * self.embed_dim, dtype=self.dtype)
         self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)
-
-        if not self.self_attn:
-            self.c_query_attn = FlaxConv1D(features=1 * self.embed_dim, dtype=self.dtype)
-
         self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
-
-        self.causal_mask = make_causal_mask(
-            jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool"
-        )
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
 
     def _split_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
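For reference, the reverted setup() goes back to a single fused c_attn projection of width 3 * embed_dim and precomputes the causal mask once at construction time. A minimal, standalone sketch of what make_causal_mask produces (the tiny max_pos value is chosen only for illustration):

```python
import jax.numpy as jnp
from flax.linen import make_causal_mask

# Precompute a (1, 1, max_pos, max_pos) lower-triangular boolean mask once,
# as the reverted setup() does with config.max_position_embeddings.
max_pos = 4
causal_mask = make_causal_mask(jnp.ones((1, max_pos), dtype="bool"), dtype="bool")

print(causal_mask.shape)              # (1, 1, 4, 4)
print(causal_mask[0, 0].astype(int))
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]
```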
@@ -180,30 +170,13 @@ class FlaxGPT2Attention(nn.Module):
     def __call__(
         self,
         hidden_states,
-        key_value_states: Optional[jnp.ndarray] = None,
         attention_mask=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
-
-
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-
-        if not is_cross_attention:
-            # self_attention
-            assert self.self_attn
-            qkv_out = self.c_attn(hidden_states)
-            query, key, value = jnp.split(qkv_out, 3, axis=2)
-        else:
-            # cross_attentions
-            assert not self.self_attn
-            assert not self.causal
-            q_out = self.c_query_attn(hidden_states)
-            (query,) = jnp.split(q_out, 1, axis=2)
-            kv_out = self.c_attn(key_value_states)
-            key, value = jnp.split(kv_out, 2, axis=2)
+        qkv_out = self.c_attn(hidden_states)
+        query, key, value = jnp.split(qkv_out, 3, axis=2)
 
         query = self._split_heads(query)
         key = self._split_heads(key)
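With the cross-attention branch removed, the reverted __call__ always splits a single fused projection into query, key and value before reshaping per head. A self-contained sketch of that split (batch, sequence and embedding sizes are made up for illustration):

```python
import jax.numpy as jnp

batch, seq, embed_dim, num_heads = 2, 5, 8, 2
head_dim = embed_dim // num_heads

# stand-in for the output of FlaxConv1D(features=3 * embed_dim)
qkv_out = jnp.arange(batch * seq * 3 * embed_dim, dtype=jnp.float32).reshape(batch, seq, 3 * embed_dim)

query, key, value = jnp.split(qkv_out, 3, axis=2)   # each (batch, seq, embed_dim)

# _split_heads: (batch, seq, embed_dim) -> (batch, seq, num_heads, head_dim)
query = query.reshape(query.shape[:2] + (num_heads, head_dim))
print(query.shape)  # (2, 5, 2, 4)
```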
@@ -211,27 +184,20 @@ class FlaxGPT2Attention(nn.Module):
 
         query_length, key_length = query.shape[1], key.shape[1]
 
-        if self.…
-        …
-        …
-        …
-        causal_mask…
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        if attention_mask is not None and self.causal:
-            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
-            attention_mask = combine_masks(attention_mask, causal_mask)
-        elif self.causal:
-            attention_mask = causal_mask
-        elif attention_mask is not None:
-            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+        if self.has_variable("cache", "cached_key"):
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+        attention_mask = combine_masks(attention_mask, causal_mask)
 
         dropout_rng = None
         if not deterministic and self.config.attn_pdrop > 0.0:
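The reverted masking path always combines the padding mask with the precomputed causal mask; during cached, step-by-step decoding the relevant rows are cut out of that mask with lax.dynamic_slice. A standalone sketch of the same slicing (cache position and lengths are invented for illustration):

```python
import jax.numpy as jnp
from jax import lax
from flax.linen import combine_masks, make_causal_mask

max_positions = 8
causal_mask = make_causal_mask(jnp.ones((1, max_positions), dtype="bool"), dtype="bool")

# one query token at cache position 3, attending over the full decoder length
query_length, max_decoder_length, mask_shift = 1, 8, 3
step_mask = lax.dynamic_slice(
    causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
)
print(step_mask[0, 0, 0])  # [ True  True  True  True False False False False]

# the padding mask (1 = attend) is broadcast to the same shape and ANDed in
attention_mask = jnp.ones((1, max_decoder_length), dtype="i4")
attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), step_mask.shape)
attention_mask = combine_masks(attention_mask, step_mask)
```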
@@ -239,18 +205,15 @@ class FlaxGPT2Attention(nn.Module):
 
         # During fast autoregressive decoding, we feed one position at a time,
         # and cache the keys and values step by step.
-        if self.…
+        if self.has_variable("cache", "cached_key") or init_cache:
             key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
 
         # transform boolean mask into float mask
-        …
-        …
-        …
-        …
-        …
-            )
-        else:
-            attention_bias = None
+        attention_bias = lax.select(
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, -1e4).astype(self.dtype),
+        )
 
         # usual dot product attention
         attn_weights = dot_product_attention_weights(
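Before being fed to dot_product_attention_weights, the combined boolean mask is turned into an additive bias: 0 where attention is allowed, a large negative value where it is masked. A minimal sketch of that conversion:

```python
import jax.numpy as jnp
from jax import lax

attention_mask = jnp.array([[True, True, False, False]])
attention_bias = lax.select(
    attention_mask > 0,
    jnp.full(attention_mask.shape, 0.0).astype(jnp.float32),
    jnp.full(attention_mask.shape, -1e4).astype(jnp.float32),
)
print(attention_bias)  # [[     0.      0. -10000. -10000.]]
```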
@@ -298,23 +261,11 @@ class FlaxGPT2Block(nn.Module):
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
-
         hidden_size = self.config.hidden_size
         inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
         self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
-
-        if self.config.add_cross_attention:
-            self.ln_cross_attn = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
-            # [IMPORTANT] Cross attention requires ``causal=False``! This is a bug I made previously.
-            self.crossattention = FlaxGPT2Attention(config=self.config, dtype=self.dtype, causal=False, self_attn=False)
-
-        project_encoder = getattr(self.config, "project_encoder", None)
-        if project_encoder:
-            self.encoder_projection_ln = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
-            self.encoder_projection_mlp = FlaxGPT2MLP(self.config, self.config.hidden_size, dtype=self.dtype)
-
         self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
         self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype)
 
@@ -322,8 +273,6 @@ class FlaxGPT2Block(nn.Module):
         self,
         hidden_states,
         attention_mask=None,
-        encoder_hidden_states: Optional[jnp.ndarray] = None,
-        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -341,61 +290,13 @@ class FlaxGPT2Block(nn.Module):
         attn_output = outputs[0]
         hidden_states = attn_output + residual
 
-        # Cross-Attention Block
-        cross_attn_weights = None
-        if encoder_hidden_states is not None:
-
-            # add one self-attention block for cross-attention
-            if not hasattr(self, "crossattention"):
-                raise ValueError(
-                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
-                    "cross-attention layers by setting `config.add_cross_attention=True`"
-                )
-
-            project_encoder = getattr(self.config, "project_encoder", None)
-            if project_encoder:
-                residual = encoder_hidden_states
-                encoder_hidden_states = self.encoder_projection_ln(encoder_hidden_states)
-                feed_forward_hidden_states = self.encoder_projection_mlp(
-                    encoder_hidden_states, deterministic=deterministic
-                )
-                # residual connection
-                encoder_hidden_states = residual + feed_forward_hidden_states
-
-            residual = hidden_states
-            hidden_states = self.ln_cross_attn(hidden_states)
-
-            cross_attn_outputs = self.crossattention(
-                hidden_states=hidden_states,
-                key_value_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                deterministic=deterministic,
-                # `init_cache` is only for decoder's `self_attn`
-                init_cache=False,
-                output_attentions=output_attentions,
-            )
-            # residual connection
-            cross_attn_output = cross_attn_outputs[0]
-            hidden_states = cross_attn_output + residual
-
-            if output_attentions:
-                cross_attn_weights = cross_attn_outputs[1]
-
         residual = hidden_states
         hidden_states = self.ln_2(hidden_states)
         feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
         # residual connection
         hidden_states = residual + feed_forward_hidden_states
 
-
-
-        if output_attentions:
-            self_attn_weights = attn_output[1]
-            outputs += (self_attn_weights,)
-            if cross_attn_weights is not None:
-                outputs += (cross_attn_weights,)
-
-        return outputs
+        return (hidden_states,) + outputs[1:]
 
 
 class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
@@ -427,24 +328,7 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
-        …
-            encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
-            encoder_attention_mask = attention_mask
-            module_init_outputs = self.module.init(
-                rngs, input_ids, attention_mask, position_ids,
-                encoder_hidden_states, encoder_attention_mask, return_dict=False
-            )
-        else:
-            module_init_outputs = self.module.init(
-                rngs, input_ids, attention_mask, position_ids, return_dict=False
-            )
-
-        return module_init_outputs["params"]
-
-    # TODO: Remove if OK
-    # @classmethod
-    # def _from_config(cls, config, **kwargs):
-    #     return super()._from_config(config, **kwargs)
+        return self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
 
     def init_cache(self, batch_size, max_length):
         r"""
@@ -471,8 +355,6 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         input_ids,
         attention_mask=None,
         position_ids=None,
-        encoder_hidden_states: Optional[jnp.ndarray] = None,
-        encoder_attention_mask: Optional[jnp.ndarray] = None,
         params: dict = None,
         past_key_values: dict = None,
         dropout_rng: jax.random.PRNGKey = None,
@@ -487,10 +369,6 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.return_dict
 
-        if encoder_hidden_states is not None and encoder_attention_mask is None:
-            batch_size, sequence_length = encoder_hidden_states.shape[:2]
-            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
-
         batch_size, sequence_length = input_ids.shape
 
         if position_ids is None:
@@ -521,8 +399,6 @@ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
             jnp.array(input_ids, dtype="i4"),
             jnp.array(attention_mask, dtype="i4"),
             jnp.array(position_ids, dtype="i4"),
-            encoder_hidden_states,
-            encoder_attention_mask,
             not train,
             False,
             output_attentions,
@@ -557,8 +433,6 @@ class FlaxGPT2BlockCollection(nn.Module):
         self,
         hidden_states,
         attention_mask=None,
-        encoder_hidden_states: Optional[jnp.ndarray] = None,
-        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -567,7 +441,6 @@ class FlaxGPT2BlockCollection(nn.Module):
     ):
         all_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
-        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
 
         for block in self.blocks:
             if output_hidden_states:
@@ -576,8 +449,6 @@ class FlaxGPT2BlockCollection(nn.Module):
             layer_outputs = block(
                 hidden_states,
                 attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
                 deterministic=deterministic,
                 init_cache=init_cache,
                 output_attentions=output_attentions,
@@ -587,25 +458,19 @@ class FlaxGPT2BlockCollection(nn.Module):
             if output_attentions:
                 all_attentions += (layer_outputs[1],)
 
-                if encoder_hidden_states is not None:
-                    all_cross_attentions += (layer_outputs[2],)
-
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-
-        outputs = [hidden_states, all_hidden_states, all_attentions, all_cross_attentions]
+        outputs = (hidden_states,)
 
         if not return_dict:
            return tuple(v for v in outputs if v is not None)
 
-
-        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+        return FlaxBaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=None,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
-            cross_attentions=all_cross_attentions,
        )
 
 
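With the cross-attention plumbing removed, the block collection again returns a plain FlaxBaseModelOutputWithPast (or a tuple when return_dict=False). A small usage sketch, assuming the transformers package that provides this output class is installed; the shapes are chosen only for illustration:

```python
import jax.numpy as jnp
from transformers.modeling_flax_outputs import FlaxBaseModelOutputWithPast

hidden = jnp.zeros((1, 4, 8))  # (batch, seq, hidden)
out = FlaxBaseModelOutputWithPast(
    last_hidden_state=hidden,
    past_key_values=None,
    hidden_states=None,
    attentions=None,
)

print(out.last_hidden_state.shape)  # (1, 4, 8)
print(out[0].shape)                 # ModelOutput also allows tuple-style indexing
```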
@@ -637,8 +502,6 @@ class FlaxGPT2Module(nn.Module):
         input_ids,
         attention_mask,
         position_ids,
-        encoder_hidden_states: Optional[jnp.ndarray] = None,
-        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic=True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -654,8 +517,6 @@ class FlaxGPT2Module(nn.Module):
         outputs = self.h(
             hidden_states,
             attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
@@ -669,11 +530,10 @@ class FlaxGPT2Module(nn.Module):
         if not return_dict:
             return (hidden_states,) + outputs[1:]
 
-        return …
+        return FlaxBaseModelOutput(
             last_hidden_state=hidden_states,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-            cross_attentions=outputs.cross_attentions,
         )
 
 
@@ -708,8 +568,6 @@ class FlaxGPT2LMHeadModule(nn.Module):
         input_ids,
         attention_mask,
         position_ids,
-        encoder_hidden_states: Optional[jnp.ndarray] = None,
-        encoder_attention_mask: Optional[jnp.ndarray] = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -720,8 +578,6 @@ class FlaxGPT2LMHeadModule(nn.Module):
             input_ids,
             attention_mask,
             position_ids,
-            encoder_hidden_states,
-            encoder_attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
@@ -740,13 +596,8 @@ class FlaxGPT2LMHeadModule(nn.Module):
         if not return_dict:
             return (lm_logits,) + outputs[1:]
 
-        return …
-            …
-            past_key_values=None,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            cross_attentions=outputs.cross_attentions
-        )
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
 
 @add_start_docstrings(
     """