Fraser committed on
Commit 1d30073
1 Parent(s): cecc83b

cope with submodules not being allowed

app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
  import jax.numpy as jnp
  from transformers import AutoTokenizer
  from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
- from t5_vae_flax.src.t5_vae import FlaxT5VaeForAutoencoding
+ from t5_vae_flax_alt.src.t5_vae import FlaxT5VaeForAutoencoding


  st.title('T5-VAE')
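
Since the package now lives in-tree as `t5_vae_flax_alt` rather than as a git submodule, the app imports the model class directly from that directory. A minimal sketch of driving the model with the new import path, assuming a placeholder checkpoint path (the actual checkpoint used by app.py is not shown in this commit):

from transformers import AutoTokenizer
from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
from t5_vae_flax_alt.src.t5_vae import FlaxT5VaeForAutoencoding

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = FlaxT5VaeForAutoencoding.from_pretrained("path/to/t5-vae-checkpoint")  # placeholder, not from this commit

# Every input must be padded to the configured block size (config.set_seq_size).
inputs = tokenizer("A sample sentence.", padding="max_length",
                   max_length=model.config.set_seq_size, return_tensors="jax")
decoder_input_ids = shift_tokens_right(
    inputs.input_ids, model.config.pad_token_id, model.config.decoder_start_token_id
)
outputs = model(inputs.input_ids, attention_mask=inputs.attention_mask,
                decoder_input_ids=decoder_input_ids)
print(outputs.latent_codes.shape)  # (batch, n_latent_tokens, latent_token_size)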
t5_vae_flax_alt/.gitignore ADDED
@@ -0,0 +1,3 @@
+ *.pyc
+ venv
+ .vscode
t5_vae_flax_alt/README.md ADDED
@@ -0,0 +1,3 @@
+ # t5-vae-flax
+
+ Model code for running a T5-VAE with flax.
t5_vae_flax_alt/__init__.py ADDED
File without changes
t5_vae_flax_alt/src/__init__.py ADDED
File without changes
t5_vae_flax_alt/src/config.py ADDED
@@ -0,0 +1,137 @@
+ import copy
+ from transformers.utils import logging
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers import AutoConfig, T5Config
+
+ from t5_vae_flax_alt.src.encoders import VAE_ENCODER_MODELS
+ from t5_vae_flax_alt.src.decoders import VAE_DECODER_MODELS
+ from t5_vae_flax_alt.src.utils import assertEqual, assertIn
+
+ logger = logging.get_logger(__name__)
+
+
+ class T5VaeConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of :class:`FlaxT5VAE`.
+     It is used to instantiate a T5-VAE model according to the specified arguments, defining the model architecture.
+     Instantiating a configuration with the defaults will yield a similar configuration to that of the T5 `t5-vae-base` architecture.
+
+     To be able to use `transformers.trainer.Trainer` we need some specific training logic & config in the model.
+
+     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+     outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+     Arguments:
+         n_latent_tokens (:obj:`int`, `optional`, defaults to 6):
+             Number of latent tokens (must be less than seq length).
+         latent_token_size (:obj:`int`, `optional`, defaults to 32):
+             Number of dimensions to use for each latent token.
+         t5_model_name_or_path (:obj:`str`, `optional`, defaults to :obj:`None`):
+             Name of the Transformer model to use as a decoder.
+         block_size (:obj:`int`, `optional`, defaults to 60):
+             NOTE: Every input sequence must be padded to be equal to this length.
+     """
+     model_type = "transformer_vae"
+     is_composition = True
+
+     def __init__(
+         self,
+         t5_model_name_or_path=None,
+         n_latent_tokens=6,  # set to -1 for full sequence
+         latent_token_size=32,
+         vae_encoder_model='',
+         vae_decoder_model='',
+         block_size=60,
+         decoder_start_token_id=0,
+         cache_dir=None,
+         tie_word_embeddings=True,
+         # T5 config
+         t5=dict(),
+         vocab_size=32128,
+         d_model=512,
+         d_kv=64,
+         d_ff=2048,
+         num_layers=6,
+         num_decoder_layers=None,
+         num_heads=8,
+         relative_attention_num_buckets=32,
+         dropout_rate=0.1,
+         layer_norm_epsilon=1e-6,
+         initializer_factor=1.0,
+         feed_forward_proj="relu",
+         is_encoder_decoder=True,
+         use_cache=True,
+         pad_token_id=0,
+         eos_token_id=1,
+         gradient_checkpointing=False,
+         # end
+         **kwargs,
+     ):
+         assertIn(vae_encoder_model, VAE_ENCODER_MODELS.keys(), "Unexpected VAE encoder.")
+         assertIn(vae_decoder_model, VAE_DECODER_MODELS.keys(), "Unexpected VAE decoder.")
+
+         super().__init__(**kwargs)
+
+         self.set_seq_size = block_size
+
+         # VAE
+         self.vae_encoder_model = vae_encoder_model
+         self.vae_decoder_model = vae_decoder_model
+
+         self.latent_token_size = latent_token_size
+         assert n_latent_tokens <= self.set_seq_size, 'Cannot use more latent tokens than input tokens.'
+         self.n_latent_tokens = n_latent_tokens
+         self.use_cache = use_cache
+
+         # T5
+         if t5_model_name_or_path:
+             self.t5 = AutoConfig.from_pretrained(t5_model_name_or_path, cache_dir=cache_dir)
+             assertEqual(self.t5.model_type, "t5", "Need t5 model type for transformer_decoder.")
+             self.t5.decoder_start_token_id = decoder_start_token_id
+         elif t5:
+             # use for loading a config
+             self.t5 = T5Config(**t5)
+         else:
+             self.t5 = T5Config(
+                 vocab_size=vocab_size,
+                 d_model=d_model,
+                 d_kv=d_kv,
+                 d_ff=d_ff,
+                 num_layers=num_layers,
+                 num_decoder_layers=num_decoder_layers,
+                 num_heads=num_heads,
+                 relative_attention_num_buckets=relative_attention_num_buckets,
+                 dropout_rate=dropout_rate,
+                 layer_norm_epsilon=layer_norm_epsilon,
+                 initializer_factor=initializer_factor,
+                 feed_forward_proj=feed_forward_proj,
+                 is_encoder_decoder=is_encoder_decoder,
+                 use_cache=use_cache,
+                 pad_token_id=pad_token_id,
+                 eos_token_id=eos_token_id,
+                 gradient_checkpointing=gradient_checkpointing,
+                 **kwargs
+             )
+
+         if self.t5.d_model < self.latent_token_size:
+             raise Exception('Using a larger latent token dimension than the T5 hidden dimension.')
+
+         # Add t5 config options
+         self.tie_word_embeddings = tie_word_embeddings
+         self.t5.tie_word_embeddings = self.tie_word_embeddings
+         self.t5.use_cache = self.use_cache
+         self.pad_token_id = pad_token_id
+         self.eos_token_id = eos_token_id
+         self.decoder_start_token_id = self.t5.decoder_start_token_id
+
+     def to_dict(self):
+         """
+         Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+
+         Returns:
+             :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+         """
+         output = copy.deepcopy(self.__dict__)
+         output["model_type"] = self.__class__.model_type
+         output['t5'] = self.t5.to_dict()
+         return output
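
The config can be built either from a T5 checkpoint name or from the explicit T5 keyword arguments above. A minimal sketch, assuming the defaults shown in this file (none of the values below come from the Space itself):

from t5_vae_flax_alt.src.config import T5VaeConfig

# Pulls the matching T5 sub-config via AutoConfig; block_size becomes set_seq_size.
config = T5VaeConfig(
    t5_model_name_or_path="t5-base",
    n_latent_tokens=6,
    latent_token_size=32,
    block_size=60,
)
assert config.t5.model_type == "t5"
print(config.n_latent_tokens, config.latent_token_size, config.set_seq_size)  # 6 32 60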
t5_vae_flax_alt/src/decoders.py ADDED
@@ -0,0 +1,23 @@
+ import logging
+ import flax.linen as nn
+
+ logger = logging.getLogger(__name__)
+
+
+ class Decoder(nn.Module):
+     '''
+     Converts latent code -> transformer encoding.
+     '''
+     dim_model: int
+     n_latent_tokens: int
+
+     @nn.compact
+     def __call__(self, latent_code):  # (batch, latent_tokens_per_sequence, latent_token_dim)
+         raw_latent_tokens = nn.Dense(self.dim_model)(latent_code)
+         latent_tokens = nn.LayerNorm()(raw_latent_tokens)
+         return latent_tokens  # (batch, latent_tokens_per_sequence, dim_model)
+
+
+ VAE_DECODER_MODELS = {
+     '': Decoder,
+ }
t5_vae_flax_alt/src/encoders.py ADDED
@@ -0,0 +1,26 @@
+ import logging
+ import jax.numpy as jnp
+ import flax.linen as nn
+
+ logger = logging.getLogger(__name__)
+
+
+ class Encoder(nn.Module):
+     '''
+     Converts N hidden tokens into N separate latent codes.
+     '''
+     latent_token_size: int
+     n_latent_tokens: int
+
+     @nn.compact
+     def __call__(self, encoding):
+         latent_tokens = nn.Dense(self.latent_token_size)(encoding)
+         raw_latent_code = latent_tokens[:, : self.n_latent_tokens, :]
+         # TODO does this just apply tanh to each latent token? Or across the whole batch?
+         latent_code = jnp.tanh(raw_latent_code)
+         return latent_code  # (batch, latent_tokens_per_sequence, latent_token_dim)
+
+
+ VAE_ENCODER_MODELS = {
+     '': Encoder,
+ }
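
The Encoder and Decoder above are pure shape adapters around the T5 hidden states. A minimal sketch of their shape contract, using illustrative sizes (batch 2, 60 hidden tokens, model dim 512) that are not taken from the commit itself:

import jax
import jax.numpy as jnp
from t5_vae_flax_alt.src.encoders import Encoder
from t5_vae_flax_alt.src.decoders import Decoder

hidden = jnp.zeros((2, 60, 512))  # stand-in for T5 encoder output

encoder = Encoder(latent_token_size=32, n_latent_tokens=6)
enc_params = encoder.init(jax.random.PRNGKey(0), hidden)
latent = encoder.apply(enc_params, hidden)   # (2, 6, 32)

decoder = Decoder(dim_model=512, n_latent_tokens=6)
dec_params = decoder.init(jax.random.PRNGKey(1), latent)
remade = decoder.apply(dec_params, latent)   # (2, 6, 512)

print(latent.shape, remade.shape)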
t5_vae_flax_alt/src/generate.py ADDED
@@ -0,0 +1,185 @@
+ from typing import Dict, Optional
+
+ import jax
+ import jax.numpy as jnp
+ import jaxlib.xla_extension as jax_xla
+
+ from transformers.generation_flax_utils import FlaxGenerationMixin
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class VaeFlaxGenerationMixin(FlaxGenerationMixin):
+     def generate(
+         self,
+         latent_codes: jax_xla.DeviceArray,
+         max_length: Optional[int] = None,
+         pad_token_id: Optional[int] = None,
+         bos_token_id: Optional[int] = None,
+         eos_token_id: Optional[int] = None,
+         decoder_start_token_id: Optional[int] = None,
+         do_sample: Optional[bool] = None,
+         prng_key: Optional[jax_xla.DeviceArray] = None,
+         top_k: Optional[int] = None,
+         top_p: Optional[float] = None,
+         temperature: Optional[float] = None,
+         num_beams: Optional[int] = None,
+         no_repeat_ngram_size: Optional[int] = None,
+         min_length: Optional[int] = None,
+         forced_bos_token_id: Optional[int] = None,
+         forced_eos_token_id: Optional[int] = None,
+         length_penalty: Optional[float] = None,
+         early_stopping: Optional[bool] = None,
+         trace: bool = True,
+         params: Optional[Dict[str, jax_xla.DeviceArray]] = None,
+         **model_kwargs,
+     ):
+         r"""
+         Generates sequences for models with a language modeling head. The method currently supports greedy decoding
+         and multinomial sampling.
+
+         Apart from :obj:`latent_codes`, all the arguments below will default to the value of the attribute of the same
+         name inside the :class:`~transformers.PretrainedConfig` of the model. The default values indicated are the
+         default values of that config.
+
+         Most of these parameters are explained in more detail in `this blog post
+         <https://huggingface.co/blog/how-to-generate>`__.
+
+         Parameters:
+
+             latent_codes (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, n_latent_tokens, latent_token_dim)`, `optional`):
+                 The sequence used as a prompt for the generation.
+             max_length (:obj:`int`, `optional`, defaults to 20):
+                 The maximum length of the sequence to be generated.
+             do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                 Whether or not to use sampling; use greedy decoding otherwise.
+             temperature (:obj:`float`, `optional`, defaults to 1.0):
+                 The value used to module the next token probabilities.
+             top_k (:obj:`int`, `optional`, defaults to 50):
+                 The number of highest probability vocabulary tokens to keep for top-k-filtering.
+             top_p (:obj:`float`, `optional`, defaults to 1.0):
+                 If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or
+                 higher are kept for generation.
+             pad_token_id (:obj:`int`, `optional`):
+                 The id of the `padding` token.
+             bos_token_id (:obj:`int`, `optional`):
+                 The id of the `beginning-of-sequence` token.
+             eos_token_id (:obj:`int`, `optional`):
+                 The id of the `end-of-sequence` token.
+             num_beams (:obj:`int`, `optional`, defaults to 1):
+                 Number of beams for beam search. 1 means no beam search.
+             decoder_start_token_id (:obj:`int`, `optional`):
+                 If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
+             trace (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                 Whether to trace generation. Setting ``trace=False`` should only be used for debugging and will lead to
+                 a considerably slower runtime.
+             params (:obj:`Dict[str, jax_xla.DeviceArray]`, `optional`):
+                 Optionally the model parameters can be passed. Can be useful for parallelized generation.
+             model_kwargs:
+                 Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
+
+         Return:
+             :class:`~transformers.file_utils.ModelOutput`.
+
+         Examples::
+             >>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM
+
+             >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+             >>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
+             >>> input_context = "The dog"
+             >>> # encode input context
+             >>> input_ids = tokenizer(input_context, return_tensors="jax").input_ids
+             >>> # generate candidates using sampling
+             >>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
+             >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+         """
+         # set init values
+         max_length = max_length if max_length is not None else self.config.max_length
+         bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
+         pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
+         eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
+         decoder_start_token_id = (
+             decoder_start_token_id if decoder_start_token_id else self.config.decoder_start_token_id
+         )
+         prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
+
+         if decoder_start_token_id is None and self.config.is_encoder_decoder:
+             raise ValueError("`decoder_start_token_id` has to be defined for encoder-decoder generation.")
+
+         model_kwargs['latent_codes'] = latent_codes
+
+         if self.config.is_encoder_decoder:
+             # add encoder_outputs to model_kwargs
+             # NOTE: Don't prepare encoder outputs, instead rely on latent_codes.
+             # model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)
+             # prepare decoder_input_ids for generation
+             input_ids = jnp.ones((latent_codes.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+         do_sample = do_sample if do_sample is not None else self.config.do_sample
+         num_beams = num_beams if num_beams is not None else self.config.num_beams
+
+         if not do_sample and num_beams == 1:
+             logits_processor = self._get_logits_processor(
+                 no_repeat_ngram_size, min_length, max_length, eos_token_id, forced_bos_token_id, forced_eos_token_id
+             )
+             return self._greedy_search(
+                 input_ids,
+                 max_length,
+                 pad_token_id,
+                 eos_token_id,
+                 logits_processor=logits_processor,
+                 trace=trace,
+                 params=params,
+                 model_kwargs=model_kwargs,
+             )
+         elif do_sample and num_beams == 1:
+             logits_warper = self._get_logits_warper(top_k=top_k, top_p=top_p, temperature=temperature)
+             logits_processor = self._get_logits_processor(
+                 no_repeat_ngram_size, min_length, max_length, eos_token_id, forced_bos_token_id, forced_eos_token_id
+             )
+             return self._sample(
+                 input_ids,
+                 max_length,
+                 pad_token_id,
+                 eos_token_id,
+                 prng_key,
+                 logits_warper=logits_warper,
+                 logits_processor=logits_processor,
+                 trace=trace,
+                 params=params,
+                 model_kwargs=model_kwargs,
+             )
+         elif not do_sample and num_beams > 1:
+             # broadcast input_ids & encoder_outputs
+             input_ids = self._expand_to_num_beams(input_ids, num_beams=num_beams)
+
+             if "encoder_outputs" in model_kwargs:
+                 model_kwargs["encoder_outputs"]["last_hidden_state"] = self._expand_to_num_beams(
+                     model_kwargs["encoder_outputs"]["last_hidden_state"], num_beams=num_beams
+                 )
+
+             if "attention_mask" in model_kwargs:
+                 model_kwargs["attention_mask"] = self._expand_to_num_beams(
+                     model_kwargs["attention_mask"], num_beams=num_beams
+                 )
+
+             logits_processor = self._get_logits_processor(
+                 no_repeat_ngram_size, min_length, max_length, eos_token_id, forced_bos_token_id, forced_eos_token_id
+             )
+
+             return self._beam_search(
+                 input_ids,
+                 max_length,
+                 pad_token_id,
+                 eos_token_id,
+                 length_penalty=length_penalty,
+                 early_stopping=early_stopping,
+                 logits_processor=logits_processor,
+                 trace=trace,
+                 params=params,
+                 model_kwargs=model_kwargs,
+             )
+         else:
+             raise NotImplementedError("Beam sampling is currently not implemented.")
t5_vae_flax_alt/src/outputs.py ADDED
@@ -0,0 +1,74 @@
+ from typing import Optional, Tuple
+
+ import flax
+ import jaxlib.xla_extension as jax_xla
+
+ from transformers.file_utils import ModelOutput
+
+
+ @flax.struct.dataclass
+ class TransformerVaeOutput(ModelOutput):
+     """
+     Base class for a Transformer-VAE's outputs.
+
+     Args:
+         latent_codes (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, n_latent_tokens, latent_token_size)`):
+             Latent codes representing encoded sequences.
+         remade_encoder_hidden_state (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, n_tokens, model_dim)`):
+             Reconstructed encoder hidden states representing sequences.
+
+     (std Seq2Seq) Args:
+         logits (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+         past_key_values (:obj:`tuple(tuple(jax_xla.DeviceArray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+             Tuple of :obj:`tuple(jax_xla.DeviceArray)` of length :obj:`config.n_layers`, with each tuple having 2
+             tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+             tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+             blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+         last_hidden_state (:obj:`tuple(jax_xla.DeviceArray)`):
+             Last model hidden state.
+         decoder_hidden_states (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+             Tuple of :obj:`jax_xla.DeviceArray` (one for the output of the embeddings + one for the output of each
+             layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+             Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+         decoder_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+             Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
+             sequence_length, sequence_length)`.
+
+             Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
+             self-attention heads.
+         cross_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+             Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
+             sequence_length, sequence_length)`.
+
+             Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+             weighted average in the cross-attention heads.
+         encoder_last_hidden_state (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+             Sequence of hidden-states at the output of the last layer of the encoder of the model.
+         encoder_hidden_states (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+             Tuple of :obj:`jax_xla.DeviceArray` (one for the output of the embeddings + one for the output of each
+             layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+             Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+         encoder_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+             Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
+             sequence_length, sequence_length)`.
+
+             Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
+             self-attention heads.
+     """
+     logits: jax_xla.DeviceArray = None
+     latent_codes: jax_xla.DeviceArray = None
+     remade_encoder_hidden_state: jax_xla.DeviceArray = None
+     # seq2seq
+     past_key_values: Optional[Tuple[Tuple[jax_xla.DeviceArray]]] = None
+     decoder_hidden_states: Optional[Tuple[jax_xla.DeviceArray]] = None
+     decoder_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
+     cross_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
+     last_hidden_state: Optional[jax_xla.DeviceArray] = None
+     encoder_last_hidden_state: Optional[jax_xla.DeviceArray] = None
+     encoder_hidden_states: Optional[Tuple[jax_xla.DeviceArray]] = None
+     encoder_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
t5_vae_flax_alt/src/t5_vae.py ADDED
@@ -0,0 +1,520 @@
+ from typing import Optional, Tuple
+
+ import jax
+ import jax.numpy as jnp
+ from jax.random import PRNGKey
+ import flax.linen as nn
+ from flax.core.frozen_dict import FrozenDict, unfreeze
+
+ from transformers.modeling_flax_outputs import FlaxCausalLMOutputWithCrossAttentions
+ from transformers.file_utils import add_start_docstrings
+ from transformers.modeling_flax_utils import FlaxPreTrainedModel
+ from transformers.models.t5.modeling_flax_t5 import FlaxT5ForConditionalGenerationModule
+
+ from t5_vae_flax_alt.src.vae import VAE
+ from t5_vae_flax_alt.src.generate import VaeFlaxGenerationMixin
+ from t5_vae_flax_alt.src.outputs import TransformerVaeOutput
+ from t5_vae_flax_alt.src.config import T5VaeConfig
+
+
+ @add_start_docstrings("""T5 Model with a `language modeling` head on top converted into a VAE.""")
+ class FlaxT5VaeForAutoencodingModule(nn.Module):
+     config: T5VaeConfig
+     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+     def _get_encoder_module(self):
+         return self.t5.encoder
+
+     def _get_vae_encoder_module(self):
+         return self.vae.encoder
+
+     def _get_vae_decoder_module(self):
+         return self.vae.decoder
+
+     def _get_decoder_module(self):
+         return self.t5.decoder
+
+     def setup(self):
+         self.t5 = FlaxT5ForConditionalGenerationModule(self.config.t5)
+         self.vae = VAE(self.config)
+
+     def __call__(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         decoder_input_ids=None,
+         decoder_attention_mask=None,
+         encoder_outputs=None,
+         latent_codes=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+         deterministic: bool = True,
+     ):
+         """
+         Adapted from `FlaxT5ForConditionalGenerationModule`
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # Encode
+         encoder_outputs = self.t5.encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             deterministic=deterministic,
+         )
+
+         hidden_states = encoder_outputs[0]
+
+         # Autoencode
+         hidden_states, latent_codes = self.vae(hidden_states, latent_codes)
+         encoder_attention_mask = jnp.ones((hidden_states.shape[0], hidden_states.shape[1]))
+
+         # Decode
+         decoder_outputs = self.t5.decoder(
+             input_ids=decoder_input_ids,
+             attention_mask=decoder_attention_mask,
+             encoder_hidden_states=hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             deterministic=deterministic,
+         )
+
+         sequence_output = decoder_outputs[0]
+
+         if self.config.tie_word_embeddings:
+             # Rescale output before projecting on vocab
+             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+             sequence_output = sequence_output * (self.config.t5.d_model ** -0.5)
+
+         if self.t5.config.tie_word_embeddings:
+             shared_embedding = self.t5.shared.variables["params"]["embedding"]
+             lm_logits = self.t5.lm_head.apply({"params": {"kernel": shared_embedding.T}}, sequence_output)
+         else:
+             lm_logits = self.t5.lm_head(sequence_output)
+
+         if not return_dict:
+             return [lm_logits, latent_codes] + decoder_outputs[1:] + encoder_outputs
+
+         return TransformerVaeOutput(
+             logits=lm_logits,
+             latent_codes=latent_codes,
+             last_hidden_state=decoder_outputs.last_hidden_state,
+             past_key_values=decoder_outputs.past_key_values,
+             decoder_hidden_states=decoder_outputs.hidden_states,
+             decoder_attentions=decoder_outputs.attentions,
+             cross_attentions=decoder_outputs.cross_attentions,
+             encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+             encoder_hidden_states=encoder_outputs.hidden_states,
+             encoder_attentions=encoder_outputs.attentions,
+         )
+
+
+ class FlaxT5VaePreTrainedModel(FlaxPreTrainedModel, VaeFlaxGenerationMixin):
+     """
+     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+     models.
+     """
+
+     config_class = T5VaeConfig
+     base_model_prefix = "transformer"
+     module_class: nn.Module = None
+
+     def __init__(
+         self,
+         config: T5VaeConfig,
+         input_shape: Tuple[int] = (1, 1),
+         seed: int = 0,
+         dtype: jnp.dtype = jnp.float32,
+         **kwargs
+     ):
+         module = self.module_class(config=config, dtype=dtype, **kwargs)
+         super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype)
+
+     def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
+         # init input tensors
+         input_ids = jnp.zeros(input_shape, dtype="i4")
+
+         attention_mask = jnp.ones_like(input_ids)
+         decoder_input_ids = jnp.ones_like(input_ids)
+         decoder_attention_mask = jnp.ones_like(input_ids)
+
+         params_rng, dropout_rng = jax.random.split(rng)
+         rngs = {"params": params_rng, "dropout": dropout_rng}
+
+         return self.module.init(
+             rngs,
+             input_ids,
+             attention_mask,
+             decoder_input_ids,
+             decoder_attention_mask,
+         )["params"]
+
+     def __call__(
+         self,
+         input_ids: jnp.ndarray,
+         attention_mask: Optional[jnp.ndarray] = None,
+         decoder_input_ids: jnp.ndarray = None,
+         decoder_attention_mask: Optional[jnp.ndarray] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         train: bool = False,
+         params: dict = None,
+         dropout_rng: PRNGKey = None,
+     ):
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+         if decoder_input_ids is None:
+             raise ValueError(
+                 "Make sure to provide both `input_ids` and `decoder_input_ids`. `decoder_input_ids` is not passed here."
+             )
+
+         # prepare encoder inputs
+         if attention_mask is None:
+             attention_mask = jnp.ones_like(input_ids)
+
+         # prepare decoder inputs
+         if decoder_attention_mask is None:
+             decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+
+         # Handle any PRNG if needed
+         rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+         return self.module.apply(
+             {"params": params or self.params},
+             input_ids=jnp.array(input_ids, dtype="i4"),
+             attention_mask=jnp.array(attention_mask, dtype="i4"),
+             decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+             decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             deterministic=not train,
+             rngs=rngs,
+         )
+
+     def init_cache(self, batch_size, max_length, latent_codes):
+         r"""
+         Args:
+             batch_size (:obj:`int`):
+                 batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+             max_length (:obj:`int`):
+                 maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                 cache.
+             latent_codes (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+                 ``latent_codes`` consists of compressed hidden-states at the output of the last layer of the encoder.
+                 Used in the cross-attention of the decoder.
+         """
+         # init input variables to retrieve cache
+         decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+         decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+
+         def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, **kwargs):
+             decoder_module = module._get_decoder_module()
+             return decoder_module(
+                 decoder_input_ids,
+                 decoder_attention_mask,
+                 **kwargs,
+             )
+
+         init_variables = self.module.init(
+             jax.random.PRNGKey(0),
+             decoder_input_ids=decoder_input_ids,
+             decoder_attention_mask=decoder_attention_mask,
+             init_cache=True,
+             method=_decoder_forward,  # we only need to call the decoder to init the cache
+         )
+         return unfreeze(init_variables["cache"])
+
+     def encode(
+         self,
+         input_ids: jnp.ndarray,
+         attention_mask: Optional[jnp.ndarray] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         train: bool = False,
+         params: dict = None,
+         dropout_rng: PRNGKey = None,
+     ):
+         raise NotImplementedError()
+
+     def decode(
+         self,
+         decoder_input_ids,
+         latent_codes,
+         encoder_attention_mask: Optional[jnp.ndarray] = None,
+         decoder_attention_mask: Optional[jnp.ndarray] = None,
+         past_key_values: dict = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         train: bool = False,
+         params: dict = None,
+         dropout_rng: PRNGKey = None,
+     ):
+         raise NotImplementedError()
+
+
+ class FlaxT5VaeForAutoencoding(FlaxT5VaePreTrainedModel):
+     module_class = FlaxT5VaeForAutoencodingModule
+
+     def __call__(
+         self,
+         input_ids: jnp.ndarray,
+         attention_mask: Optional[jnp.ndarray] = None,
+         decoder_input_ids=None,
+         decoder_attention_mask=None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         train: bool = False,
+         params: dict = None,
+         dropout_rng: PRNGKey = None,
+     ):
+         '''
+         Adapted from `FlaxT5PreTrainedModel`
+         '''
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+         if decoder_input_ids is None:
+             raise ValueError(
+                 "Make sure to provide both `input_ids` and `decoder_input_ids`. `decoder_input_ids` is not passed here."
+             )
+
+         # prepare encoder inputs
+         if attention_mask is None:
+             attention_mask = jnp.ones_like(input_ids)
+
+         # prepare decoder inputs
+         if decoder_attention_mask is None:
+             decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+
+         # Handle any PRNG if needed
+         rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+         return self.module.apply(
+             {"params": params or self.params},
+             input_ids=jnp.array(input_ids, dtype="i4"),
+             attention_mask=jnp.array(attention_mask, dtype="i4"),
+             decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+             decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             deterministic=not train,
+             rngs=rngs,
+         )
+
+     def encode(
+         self,
+         input_ids: jnp.ndarray,
+         attention_mask: Optional[jnp.ndarray] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         train: bool = False,
+         params: dict = None,
+         dropout_rng: PRNGKey = None,
+     ):
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+         if attention_mask is None:
+             attention_mask = jnp.ones_like(input_ids)
+
+         # Handle any PRNG if needed
+         rngs = {}
+         if dropout_rng is not None:
+             rngs["dropout"] = dropout_rng
+
+         def _encoder_forward(module, input_ids, attention_mask, **kwargs):
+             encode_module = module._get_encoder_module()
+             vae_encoder_module = module._get_vae_encoder_module()
+             return vae_encoder_module(encode_module(input_ids, attention_mask, **kwargs)[0])
+
+         return self.module.apply(
+             {"params": params or self.params},
+             input_ids=jnp.array(input_ids, dtype="i4"),
+             attention_mask=jnp.array(attention_mask, dtype="i4"),
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             deterministic=not train,
+             rngs=rngs,
+             method=_encoder_forward,
+         )
+
+     def decode(
+         self,
+         decoder_input_ids,
+         latent_codes,
+         encoder_attention_mask: Optional[jnp.ndarray] = None,
+         decoder_attention_mask: Optional[jnp.ndarray] = None,
+         past_key_values: dict = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         train: bool = False,
+         params: dict = None,
+         dropout_rng: PRNGKey = None,
+     ):
+         r"""
+         Returns:
+
+         Example::
+
+             >>> model = FlaxT5VaeForAutoencoding.from_pretrained('t5-small')
+             >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+
+             >>> text = "My friends are cool but they eat too many carbs."
+             >>> inputs = tokenizer(text, max_length=512, return_tensors='jax')
+             >>> latent_codes = model.encode(**inputs)
+
+             >>> decoder_start_token_id = model.config.decoder_start_token_id
+             >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+             >>> outputs = model.decode(decoder_input_ids, latent_codes)
+             >>> last_decoder_hidden_states = outputs.last_hidden_state
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+         if encoder_attention_mask is None:
+             batch_size, sequence_length = latent_codes.shape[:2]
+             encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+         batch_size, sequence_length = decoder_input_ids.shape
+         if decoder_attention_mask is None:
+             decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+         # Handle any PRNG if needed
+         rngs = {}
+         if dropout_rng is not None:
+             rngs["dropout"] = dropout_rng
+
+         inputs = {"params": params or self.params}
+
+         # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+         # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+         # it can be changed by FlaxT5Attention module
+         if past_key_values:
+             inputs["cache"] = past_key_values
+             mutable = ["cache"]
+         else:
+             mutable = False
+
+         def _decoder_forward(module, decoder_input_ids, latent_codes, decoder_attention_mask, **kwargs):
+             vae_decoder_module = module._get_vae_decoder_module()
+             decoder_module = module._get_decoder_module()
+             decoder_outputs = decoder_module(
+                 decoder_input_ids,
+                 decoder_attention_mask,
+                 encoder_hidden_states=vae_decoder_module(latent_codes),
+                 **kwargs,
+             )
+             sequence_output = decoder_outputs[0]
+
+             if self.config.tie_word_embeddings:
+                 # Rescale output before projecting on vocab
+                 # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+                 sequence_output = sequence_output * (self.config.t5.d_model ** -0.5)
+
+             if self.config.tie_word_embeddings:
+                 shared_embedding = module.t5.shared.variables["params"]["embedding"]
+                 lm_logits = module.t5.lm_head.apply({"params": {"kernel": shared_embedding.T}}, sequence_output)
+             else:
+                 lm_logits = module.t5.lm_head(sequence_output)
+
+             return lm_logits, decoder_outputs
+
+         outputs = self.module.apply(
+             inputs,
+             decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+             latent_codes=latent_codes,
+             decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+             encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             deterministic=not train,
+             rngs=rngs,
+             mutable=mutable,
+             method=_decoder_forward,
+         )
+
+         if past_key_values is None:
+             lm_logits, decoder_outputs = outputs
+         else:
+             (lm_logits, decoder_outputs), past = outputs
+
+         if return_dict:
+             outputs = FlaxCausalLMOutputWithCrossAttentions(
+                 logits=lm_logits,
+                 hidden_states=decoder_outputs.hidden_states,
+                 attentions=decoder_outputs.attentions,
+                 cross_attentions=decoder_outputs.cross_attentions,
+             )
+         else:
+             outputs = (lm_logits,) + decoder_outputs[1:]
+
+         # add updated cache to model output
+         if past_key_values is not None and return_dict:
+             outputs["past_key_values"] = unfreeze(past["cache"])
+             return outputs
+         elif past_key_values is not None and not return_dict:
+             outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+
+         return outputs
+
+     def prepare_inputs_for_generation(
+         self,
+         decoder_input_ids,
+         max_length,
+         attention_mask: Optional[jnp.ndarray] = None,
+         decoder_attention_mask: Optional[jnp.ndarray] = None,
+         latent_codes=None,
+         **kwargs
+     ):
+         # initializing the cache
+         batch_size, seq_length = decoder_input_ids.shape
+
+         past_key_values = self.init_cache(batch_size, max_length, latent_codes)
+         # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+         # But since the decoder uses a causal mask, those positions are masked anyways.
+         # Thus we can create a single static attention_mask here, which is more efficient for compilation
+         extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+         if decoder_attention_mask is not None:
+             extended_attention_mask = jax.lax.dynamic_update_slice(
+                 extended_attention_mask, decoder_attention_mask, (0, 0)
+             )
+
+         return {
+             "past_key_values": past_key_values,
+             "latent_codes": latent_codes,
+             "encoder_attention_mask": attention_mask,
+             "decoder_attention_mask": extended_attention_mask,
+         }
+
+     def update_inputs_for_generation(self, model_outputs, model_kwargs):
+         model_kwargs["past_key_values"] = model_outputs.past_key_values
+         return model_kwargs
t5_vae_flax_alt/src/utils.py ADDED
@@ -0,0 +1,24 @@
+ from typing import Sequence
+
+ import flax.linen as nn
+
+
+ class MLP(nn.Module):
+     features: Sequence[int]
+
+     @nn.compact
+     def __call__(self, x):
+         for feat in self.features[:-1]:
+             x = nn.relu(nn.Dense(feat)(x))
+         x = nn.Dense(self.features[-1])(x)
+         return x
+
+
+ def assertEqual(actual, expected, msg, first="Got", second="Expected"):
+     if actual != expected:
+         raise ValueError(msg + f' {first}: "{actual}" {second}: "{expected}"')
+
+
+ def assertIn(actual, expected, msg, first="Got", second="Expected one of"):
+     if actual not in expected:
+         raise ValueError(msg + f' {first}: "{actual}" {second}: {expected}')
t5_vae_flax_alt/src/vae.py ADDED
@@ -0,0 +1,30 @@
+ import jax.numpy as jnp
+ import flax.linen as nn
+
+ from t5_vae_flax_alt.src.encoders import VAE_ENCODER_MODELS
+ from t5_vae_flax_alt.src.decoders import VAE_DECODER_MODELS
+ from t5_vae_flax_alt.src.config import T5VaeConfig
+
+
+ class VAE(nn.Module):
+     # see https://github.com/google/flax#what-does-flax-look-like
+     """
+     An MMD-VAE used with encoder-decoder models.
+     Encodes all token encodings into a single latent & spits them back out.
+     """
+     config: T5VaeConfig
+     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+     def setup(self):
+         self.encoder = VAE_ENCODER_MODELS[self.config.vae_encoder_model](self.config.latent_token_size, self.config.n_latent_tokens)
+         self.decoder = VAE_DECODER_MODELS[self.config.vae_decoder_model](self.config.t5.d_model, self.config.n_latent_tokens)
+
+     def __call__(self, encoding=None, latent_codes=None):
+         latent_codes = self.encode(encoding)
+         return self.decode(latent_codes), latent_codes
+
+     def encode(self, encoding):
+         return self.encoder(encoding)
+
+     def decode(self, latent):
+         return self.decoder(latent)
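
To tie the pieces together, the VAE module reads its sizes from T5VaeConfig (latent_token_size for the encoder, t5.d_model for the decoder). A minimal sketch using the config's local T5 defaults so nothing is downloaded; the sizes are illustrative and not taken from the commit itself:

import jax
import jax.numpy as jnp
from t5_vae_flax_alt.src.config import T5VaeConfig
from t5_vae_flax_alt.src.vae import VAE

config = T5VaeConfig(n_latent_tokens=6, latent_token_size=32, block_size=60)
vae = VAE(config)

hidden = jnp.zeros((2, config.set_seq_size, config.t5.d_model))  # fake T5 encoder output
params = vae.init(jax.random.PRNGKey(0), hidden)
remade, latent_codes = vae.apply(params, hidden)
print(latent_codes.shape, remade.shape)  # (2, 6, 32) (2, 6, 512)

Only the first n_latent_tokens positions survive the round trip, which is what FlaxT5VaeForAutoencodingModule then feeds to the T5 decoder as encoder_hidden_states.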