boris committed on
Commit f139b0b
1 Parent(s): 69bcbeb

feat: add sinkformer + custom final ln + pre-ln (#151)

README.md CHANGED
@@ -124,8 +124,9 @@ Sequence to sequence model based on "[BART: Denoising Sequence-to-Sequence Pre-t
  - "[Deepnet: Scaling Transformers to 1,000 Layers](https://arxiv.org/abs/2203.00555)"
  - "[NormFormer: Improved Transformer Pretraining with Extra Normalization](https://arxiv.org/abs/2110.09456)"
  - "[Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)"
- - "[CogView: Mastering Text-to-Image Generation via Transformers](https://arxiv.org/abs/2105.13290v2)
+ - "[CogView: Mastering Text-to-Image Generation via Transformers](https://arxiv.org/abs/2105.13290v2)"
  - "[Root Mean Square Layer Normalization](https://arxiv.org/abs/1910.07467)"
+ - "[Sinkformers: Transformers with Doubly Stochastic Attention](https://arxiv.org/abs/2110.11773)"

  Main optimizer (Distributed Shampoo) from "[Scalable Second Order Optimization for Deep Learning](https://arxiv.org/abs/2002.09018)".

@@ -247,3 +248,12 @@ Main optimizer (Distributed Shampoo) from "[Scalable Second Order Optimization f
  primaryClass = {cs.LG}
  }
  ```
+
+ ```text
+ @misc{title = {Sinkformers: Transformers with Doubly Stochastic Attention},
+ url = {https://arxiv.org/abs/2110.11773},
+ author = {Sander, Michael E. and Ablin, Pierre and Blondel, Mathieu and Peyré, Gabriel},
+ publisher = {arXiv},
+ year = {2021},
+ }
+ ```

src/dalle_mini/model/configuration.py CHANGED
@@ -59,37 +59,39 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
          do_sample=True,
          # transformer variants
          ln_type="layernorm", # layer normalization type, "rmsnorm", "layernorm"
-         ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "deepnet" (same as postln)
-         head_scale=False, # used in NormFormer
+         ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "preln", "deepnet" (same as postln)
+         use_head_scale=False, # used in NormFormer
          use_cosine_attention=False, # used in Swin v2
          tau_init=0.05, # used only in cosine attention (Swin v2)
          use_deepnet_scaling=False, # used in Deepnet
          use_glu=False, # "GLU Variants Improve Transformer"
          use_alibi=False, # from "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation"
-         sink_iters=1, # used in SinkFormers
+         sinkhorn_iters=1, # used in SinkFormers
+         use_final_ln_encoder=False, # final layer normalization in encoder
+         use_final_ln_decoder=False, # final layer normalization in decoder
          # parameters that should not be necessary but could affect results
-         force_ln_scale=True, # force scale in layernorm even when followed by dense layers
-         force_final_ln_encoder=False, # force layer normalization in encoder final layer even when followed by dense layers
+         force_ln_scale=False, # force scale in layernorm even when followed by dense layers
          **kwargs,
      ):
          # text normalizer
          self.normalize_text = normalize_text

          # transformer variants
-         self.head_scale = head_scale # per Normformer
+         self.use_head_scale = use_head_scale # per Normformer
          assert ln_type in [
              "rmsnorm",
              "layernorm",
          ], "ln_type must be 'rmsnorm' or 'layernorm'"
          self.ln_type = ln_type
+         if ln_positions == "deepnet":
+             ln_positions = "postln"
          assert ln_positions in [
              "normformer",
              "swinv2",
              "cogview",
-             "deepnet",
-         ], "ln_positions must be 'normformer', 'swinv2' or 'deepnet'"
-         if ln_positions == "deepnet":
-             ln_positions = "postln"
+             "postln",
+             "preln",
+         ], "ln_positions must be 'normformer', 'swinv2', 'cogview', 'postln', 'preln'"
          assert use_alibi is False, "use_alibi is not supported yet"
          self.ln_positions = ln_positions
          self.use_cosine_attention = use_cosine_attention
@@ -97,9 +99,17 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
          self.use_deepnet_scaling = use_deepnet_scaling
          self.use_glu = use_glu
          self.use_alibi = use_alibi
-         self.sink_iters = sink_iters
+         self.sinkhorn_iters = sinkhorn_iters
+         if ln_positions == "postln":
+             assert (
+                 use_final_ln_encoder
+             ), "use_final_ln_encoder must be True when ln_positions is 'postln'"
+             assert (
+                 use_final_ln_decoder
+             ), "use_final_ln_decoder must be True when ln_positions is 'postln'"
+         self.use_final_ln_encoder = use_final_ln_encoder
+         self.use_final_ln_decoder = use_final_ln_decoder
          self.force_ln_scale = force_ln_scale
-         self.force_final_ln_encoder = force_final_ln_encoder

          # common parameters
          self.encoder_vocab_size = encoder_vocab_size
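
For reference, here is a minimal sketch of how the updated options fit together once this commit is applied. It assumes the package is importable as `dalle_mini` and that the remaining `DalleBartConfig` arguments keep their defaults; the specific values below are purely illustrative.

```python
# Illustrative sketch only: exercises the options added or renamed in this commit.
from dalle_mini.model.configuration import DalleBartConfig

config = DalleBartConfig(
    ln_type="layernorm",
    ln_positions="preln",  # new option: a single layer norm before attention / FFN
    use_head_scale=False,  # renamed from head_scale
    sinkhorn_iters=3,  # renamed from sink_iters; > 1 enables Sinkformer-style attention
    use_final_ln_encoder=True,  # new flag: layer norm after the last encoder layer
    use_final_ln_decoder=True,  # new flag: layer norm after the last decoder layer
)

# "deepnet" is still accepted and is mapped to "postln"; with "postln",
# both use_final_ln_encoder and use_final_ln_decoder must be True,
# otherwise the constructor raises an AssertionError.
```
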
src/dalle_mini/model/modeling.py CHANGED
@@ -28,7 +28,7 @@ import msgpack.exceptions
  from flax.core.frozen_dict import unfreeze
  from flax.linen import combine_masks, make_causal_mask
  from flax.linen import partitioning as nn_partitioning
- from flax.linen.attention import dot_product_attention_weights
+ from flax.linen.linear import PrecisionLike
  from flax.serialization import from_bytes
  from flax.traverse_util import flatten_dict, unflatten_dict
  from jax import lax
@@ -175,6 +175,66 @@ def norm(type, *args, **kwargs):
          raise ValueError(f"Unknown norm type {type}")


+ def dot_product_attention_weights(
+     query: Any,
+     key: Any,
+     bias: Optional[Any] = None,
+     mask: Optional[Any] = None,
+     broadcast_dropout: bool = True,
+     dropout_rng: Optional[PRNGKey] = None,
+     dropout_rate: float = 0.0,
+     deterministic: bool = False,
+     dtype: Any = jnp.float32,
+     precision: PrecisionLike = None,
+     sinkhorn_iters: int = 1,
+ ):
+     """
+     Computes dot-product attention weights given query and key.
+
+     Adapted from flax.linen.attention.dot_product_attention_weights"
+     """
+     assert query.ndim == key.ndim, "q, k must have same rank."
+     assert query.shape[:-3] == key.shape[:-3], "q, k batch dims must match."
+     assert query.shape[-2] == key.shape[-2], "q, k num_heads must match."
+     assert query.shape[-1] == key.shape[-1], "q, k depths must match."
+
+     # calculate attention matrix
+     depth = query.shape[-1]
+     query = query / jnp.sqrt(depth).astype(dtype)
+     # attn weight shape is (batch..., num_heads, q_length, kv_length)
+     attn_weights = jnp.einsum("...qhd,...khd->...hqk", query, key, precision=precision)
+
+     # apply attention bias: masking, dropout, proximity bias, etc.
+     if bias is not None:
+         attn_weights = attn_weights + bias
+     # apply attention mask
+     if mask is not None:
+         big_neg = jnp.finfo(dtype).min
+         attn_weights = jnp.where(mask, attn_weights, big_neg)
+
+     # normalize the attention weights
+     attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
+     for i in range(sinkhorn_iters - 1):
+         axis = -2 if i % 2 == 0 else -1
+         attn_weights /= 1e-8 + jnp.sum(attn_weights, axis=axis, keepdims=True)
+
+     # apply attention dropout
+     if not deterministic and dropout_rate > 0.0:
+         keep_prob = 1.0 - dropout_rate
+         if broadcast_dropout:
+             # dropout is broadcast across the batch + head dimensions
+             dropout_shape = tuple([1] * (key.ndim - 2)) + attn_weights.shape[-2:]
+             keep = jax.random.bernoulli(dropout_rng, keep_prob, dropout_shape)
+         else:
+             keep = jax.random.bernoulli(dropout_rng, keep_prob, attn_weights.shape)
+         multiplier = keep.astype(attn_weights.dtype) / jnp.asarray(
+             keep_prob, dtype=dtype
+         )
+         attn_weights = attn_weights * multiplier
+
+     return attn_weights
+
+
  class FlaxBartAttention(FlaxBartAttention):
      """
      Edits:
@@ -225,7 +285,7 @@ class FlaxBartAttention(FlaxBartAttention):
          )
          self.dropout_layer = nn.Dropout(rate=self.dropout)

-         if self.config.head_scale:
+         if self.config.use_head_scale:
              self.head_scale = self.param(
                  "head_scale", jax.nn.initializers.ones, (1, 1, self.num_heads, 1)
              )
@@ -342,13 +402,14 @@ class FlaxBartAttention(FlaxBartAttention):
              deterministic=deterministic,
              dtype=self.dtype,
              precision=None,
+             sinkhorn_iters=self.config.sinkhorn_iters,
          )
          if self.config.use_cosine_attention:
              # divide by tau
              attn_weights = attn_weights / jnp.maximum(self.tau, 0.01)

          attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
-         if self.config.head_scale:
+         if self.config.use_head_scale:
              # per Normformer
              attn_output = attn_output * self.head_scale
          attn_output = self._merge_heads(attn_output)
@@ -373,7 +434,7 @@ class GLU(nn.Module):
              self.config
          )

-         if self.config.ln_positions in ["normformer", "cogview"]:
+         if self.config.ln_positions in ["normformer", "cogview", "preln"]:
              x = norm(
                  self.config.ln_type,
                  dtype=self.dtype,
@@ -438,7 +499,7 @@ class FFN(nn.Module):
          gain = deepnet_gain["encoder" if self.is_encoder else "decoder"]["beta"](
              self.config
          )
-         if self.config.ln_positions in ["normformer", "cogview"]:
+         if self.config.ln_positions in ["normformer", "cogview", "preln"]:
              x = norm(
                  self.config.ln_type,
                  dtype=self.dtype,
@@ -507,7 +568,7 @@ class FlaxBartEncoderLayer(nn.Module):

          embed_dim = self.config.d_model
          residual = hidden_states
-         if self.config.ln_positions in ["normformer", "cogview"]:
+         if self.config.ln_positions in ["normformer", "cogview", "preln"]:
              hidden_states = norm(
                  self.config.ln_type,
                  dtype=self.dtype,
@@ -612,7 +673,7 @@ class FlaxBartDecoderLayer(nn.Module):
          residual = hidden_states

          # Self Attention
-         if self.config.ln_positions in ["normformer", "cogview"]:
+         if self.config.ln_positions in ["normformer", "cogview", "preln"]:
              hidden_states = norm(
                  self.config.ln_type,
                  dtype=self.dtype,
@@ -651,7 +712,7 @@
          cross_attn_weights = None
          if encoder_hidden_states is not None:
              residual = hidden_states
-             if self.config.ln_positions in ["normformer", "cogview"]:
+             if self.config.ln_positions in ["normformer", "cogview", "preln"]:
                  hidden_states = norm(
                      self.config.ln_type,
                      dtype=self.dtype,
@@ -759,12 +820,9 @@ class FlaxBartEncoderLayerCollection(nn.Module):
                  all_hidden_states += (hidden_states,)
              # final layernorm on the output of the last layer
              # or every 6 layers for Swin v2
-             # not needed for other models which use layernorm before x-attention
-             # ignored args for deepnet which always add a norm with scale
-             add_norm = self.config.force_final_ln_encoder or (
-                 self.config.ln_positions == "swinv2"
-                 and ((i == n_layers - 1) or ((i + 1) % 6 == 0))
-             )
+             add_norm = (
+                 self.config.ln_positions == "swinv2" and ((i + 1) % 6 == 0)
+             ) or (self.config.use_final_ln_encoder and (i == n_layers - 1))
              # we don't need to scale the norm for the last layer
              use_scale = i != n_layers - 1
              layer_outputs = layer(
@@ -839,9 +897,9 @@ class FlaxBartDecoderLayerCollection(nn.Module):
                  all_hidden_states += (hidden_states,)
              # final layernorm on the output of the last layer
              # or every 6 layers for Swin v2
-             add_norm = (i == n_layers - 1) or (
-                 (self.config.ln_positions == "swinv2") and ((i + 1) % 6 == 0)
-             )
+             add_norm = (
+                 self.config.ln_positions == "swinv2" and ((i + 1) % 6 == 0)
+             ) or (self.config.use_final_ln_decoder and (i == n_layers - 1))
              # we don't need to scale the norm for the last layer
              use_scale = i != n_layers - 1
              layer_outputs = layer(
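
To make the `sinkhorn_iters` loop in `dot_product_attention_weights` above concrete, here is a small self-contained sketch (toy shapes, no bias, mask or dropout; the `sinkhorn_attention` helper is illustrative and not part of the repository). Extra Sinkhorn iterations push the softmax attention matrix toward a doubly stochastic one, with rows and columns both summing to roughly 1, as in the Sinkformers paper.

```python
import jax
import jax.numpy as jnp


def sinkhorn_attention(scores, sinkhorn_iters=1):
    # Mirrors the normalization step of dot_product_attention_weights above:
    # a softmax over keys, then alternating column/row renormalization.
    attn = jax.nn.softmax(scores, axis=-1)
    for i in range(sinkhorn_iters - 1):
        axis = -2 if i % 2 == 0 else -1
        attn = attn / (1e-8 + jnp.sum(attn, axis=axis, keepdims=True))
    return attn


# toy attention logits: (batch, heads, q_length, kv_length)
scores = jax.random.normal(jax.random.PRNGKey(0), (1, 8, 16, 16))

plain = sinkhorn_attention(scores, sinkhorn_iters=1)  # standard softmax attention
sink = sinkhorn_attention(scores, sinkhorn_iters=5)   # Sinkformer-style attention

print(jnp.abs(plain.sum(axis=-2) - 1.0).max())  # column sums can be far from 1
print(jnp.abs(sink.sum(axis=-2) - 1.0).max())   # column sums close to 1
print(jnp.abs(sink.sum(axis=-1) - 1.0).max())   # row sums stay (essentially) 1
```

With the default `sinkhorn_iters=1` the loop body never executes, so the model falls back to ordinary softmax attention.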