Upload 304 files

fa1a600 verified 10 months ago

29.2 kB

	# Copyright 2024 Big Vision Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Decoder-only and encoder-decoder GIVT model.

	Used abbreviations for dimension annotations:
	B: batch size.
	E: embedding size.
	L: (soft) token sequence length.
	D: soft token dimension.
	P: number of patches (extracted by a ViT encoder in GIVT-based UViM)
	"""

	import enum
	import itertools
	from typing import Literal, Optional, Sequence, Any, Mapping

	from absl import logging
	from big_vision import utils
	from big_vision.models import common
	from big_vision.models import vit
	import distrax
	import einops
	import flax.linen as nn
	from flax.linen import partitioning
	import jax
	import jax.numpy as jnp
	import numpy as np


	class _SpecialLabel(enum.Enum):

	MASK = "mask"
	NOMASK = "nomask"
	REPLACE = "replace"
	NOLABEL = "nolabel" # For CFG


	def _random_mask_with_ratios(rng, ratios: jax.Array, seq_len: int):
	"""Generates masks where a fraction of tokens is uncovered.

	Args:
	rng: RNG.
	ratios: Ratios, must be a 1D matrix of shape (B,). Values must be in
	[0, 1], and indicate at ratios[i] how many of the i-th tokens are
	uncovered (ie. equal to `True`).
	seq_len: How many tokens this mask has to cover.

	Returns:
	Mask of dtype bool, shape (B, L).

	Raises:
	ValueError: Incorrect inputs.
	"""
	if ratios.ndim != 1:
	raise ValueError("Ratios must have shape (B,)!")
	ratios = jnp.clip(ratios, 0, 1)
	indices = jnp.arange(seq_len, dtype=jnp.float32) # Shape: (L,)
	ratios = ratios[:, jnp.newaxis] * seq_len # Shape: (B, 1)
	# This is a binary array where the first ratios * seq_len positions are True
	mask = (indices < ratios).astype(jnp.bool_) # Shape: (B, L)
	# Shuffle to a actual mask.
	return jax.random.shuffle(rng, mask, axis=-1)


	def apply_mask_schedule(ratio: float \| jax.Array, method: str) -> jax.Array:
	"""Generate a mask rate by scheduling mask functions R."""
	if method == "cosine":
	mask_ratio = jax.lax.cos(jnp.pi / 2. * ratio)
	elif "pow:" in method:
	exponent = float(method.replace("pow:", ""))
	mask_ratio = 1. - ratio**exponent
	else:
	raise NotImplementedError(method)
	# Clamps mask into [epsilon, 1)
	mask_ratio = jnp.clip(mask_ratio, 1e-6, 1.)
	return mask_ratio


	class EncoderDecoderBlock(nn.Module):
	"""Transformer encoder-decoder layer."""
	mlp_dim: int
	num_heads: int
	dropout_rate: float = 0.
	decode: bool = False

	@nn.compact
	def __call__(
	self,
	targets: jax.Array,
	encoded: jax.Array \| None = None,
	decoder_mask: jax.Array \| None = None,
	deterministic: bool = True,
	) -> tuple[jax.Array, jax.Array]:
	"""Applies EncoderDecoderBlock module.

	Args:
	targets: target text embeddings [B, L, D].
	encoded: encoded image patches from encoder [B, P, E].
	decoder_mask: decoder self-attention mask.
	deterministic: bool, deterministic or not (to apply dropout).

	Returns:
	output after transformer encoder-decoder block [B, L, E].
	"""
	# Helper function for axis annotation.
	def wlc(f):
	dim_names = ("act_batch", "act_len", "act_emb")
	return nn.with_logical_constraint(f, dim_names)
	# Decoder block.
	x = wlc(nn.LayerNorm(name="LayerNorm1", use_bias=False)(targets))
	x = wlc(nn.SelfAttention(
	num_heads=self.num_heads, use_bias=False, broadcast_dropout=False,
	dropout_rate=self.dropout_rate, decode=self.decode, name="SelfAttn")(
	x, decoder_mask, deterministic=deterministic))
	x = wlc(nn.Dropout(rate=self.dropout_rate)(x, deterministic=deterministic))
	x = wlc(x + targets)

	if encoded is None:
	y = x
	else:
	# Encoder-Decoder block.
	y = wlc(nn.LayerNorm(name="LayerNorm2", use_bias=False)(x))
	y = wlc(nn.MultiHeadDotProductAttention(
	num_heads=self.num_heads, use_bias=False, broadcast_dropout=False,
	dropout_rate=self.dropout_rate, name="CrossAttn")(
	y, encoded, deterministic=deterministic))
	y = wlc(
	nn.Dropout(rate=self.dropout_rate)(y, deterministic=deterministic))
	y = wlc(y + x)

	# MLP block.
	z = wlc(nn.LayerNorm(name="LayerNorm3", use_bias=False)(y))
	z = wlc(vit.MlpBlock(mlp_dim=self.mlp_dim, dropout=self.dropout_rate,
	name="MLP")(z, deterministic=deterministic))

	# nn.scan requires a carry (second element in tuple)
	out = wlc(y + z)
	return out, out


	class Decoder(nn.Module):
	"""Transformer decoder model with optional cross-attention."""
	emb_dim: int
	mlp_dim: int
	num_heads: int
	num_layers: int
	out_dim: int
	seq_len: int
	style: Literal["ar", "masked"]
	dropout_rate: float = 0.
	zero_embedding_init: bool = False

	scan: bool = False
	remat_policy: str = "nothing_saveable"

	@nn.compact
	def __call__(
	self,
	targets: jax.Array,
	encoded: jax.Array \| None = None,
	decoder_mask: jax.Array \| None = None,
	decode: bool = False,
	deterministic: bool = True,
	return_reps: bool = False,
	) -> jax.Array \| tuple[jax.Array, Mapping[str, jax.Array]]:
	"""Applies Transformer model on the inputs.

	Args:
	targets: target text tokens [B, L].
	encoded: encoded sequence from an encoder [B, P, E].
	decoder_mask: decoder self-attention mask.
	decode: bool, whether to perform fast autoregressive decoding with cache.
	deterministic: bool, deterministic or not (to apply dropout).
	return_reps: bool, whether to return intermediate representations.

	Returns:
	output of a transformer decoder [B, L, out_dim], where out_dim is usually
	a multiple of D.
	"""
	if self.style == "masked" and decode:
	raise ValueError("Cannot run masked model in cached mode!")

	pos_emb = vit.get_posemb(
	self, "learn", self.seq_len, self.emb_dim,
	"pos_emb")

	y = common.AddPositionEmbs(
	decode=decode, name="PosEmbedTargets")(targets, pos_emb)

	out = {}
	if self.scan:
	# Mostly followed
	# https://github.com/google/maxtext/blob/4d99e30b3e0e0cb1d1aa11c7db7fffe18e301498/MaxText/layers.py#L1126
	# for the scanned version.

	# 1. remat
	enc_dec_block_remat = nn.remat(
	EncoderDecoderBlock,
	prevent_cse=False,
	static_argnums=(-1, -2),
	policy=getattr(jax.checkpoint_policies, self.remat_policy, None))
	# 2. scan
	initializing = self.is_mutable_collection("params")
	param_scan_axis = 1
	params_spec = (param_scan_axis if initializing
	else partitioning.ScanIn(param_scan_axis))
	dec_scanned = nn.scan(enc_dec_block_remat,
	variable_axes={
	"params": params_spec,
	"cache": 0,
	},
	split_rngs={"params": True, "dropout": True},
	in_axes=nn.broadcast,
	length=self.num_layers)
	# 3. fprop
	y, out = dec_scanned(num_heads=self.num_heads, mlp_dim=self.mlp_dim,
	dropout_rate=self.dropout_rate, decode=decode,
	name="EncDecBlock")(
	y, encoded, decoder_mask, deterministic)
	# Extracting the intermediate representation from the stacked activation
	# tensor `out`, which is a [num_layers, B, L, E] tensor. Indexing along
	# the first axis to extract individual layers, and then averaging across
	# the second axis, which corresponds to the sequence dimension after
	# indexing.
	assert out.shape[0] == self.num_layers and (
	decode or out.shape[2] == self.seq_len), (
	(out.shape, self.num_layers, self.seq_len))
	out = {f"block{l}_rep": jnp.mean(out[l], axis=1)
	for l in range(self.num_layers)}
	else:
	for lyr in range(self.num_layers):
	y, _ = EncoderDecoderBlock(
	num_heads=self.num_heads, mlp_dim=self.mlp_dim,
	dropout_rate=self.dropout_rate, decode=decode,
	name=f"EncDecBlock{lyr}")(y, encoded, decoder_mask=decoder_mask,
	deterministic=deterministic)
	out[f"block{lyr}_rep"] = jnp.mean(y, axis=1)
	y = nn.LayerNorm(name="LayerNorm")(y)
	out["pre_logits"] = jnp.mean(y, axis=1)

	logits = nn.Dense(
	self.out_dim,
	kernel_init=nn.initializers.zeros,
	name="LogitsDense",
	)(y)
	out["logits"] = logits
	if return_reps:
	return logits, out
	return logits


	class Model(nn.Module):
	"""GIVT model supporting decoder-only and encoder-decoder applications."""
	num_heads: int = 8
	# num_layers = 0 means no encoder
	num_layers: int = 0
	num_decoder_layers: int = 6
	mlp_dim: int = 2048
	enc_dropout_rate: float = 0.
	dec_dropout_rate: float = 0.
	# Decoder params:
	emb_dim: int = 512
	num_labels: Optional[int] = 1000
	seq_len: int = 256
	# Encoder params:
	patches: Sequence[int] = (16, 16)
	input_size: Sequence[int] = (256, 256)
	posemb_type: Literal["learn", "sincos2d"] = "learn"
	zero_decoder_seq: bool = False
	style: Literal["ar", "masked"] = "ar"

	zero_embedding_init: bool = False

	num_mixtures: int = 4
	multivariate: bool = False
	out_dim: int = 32
	scale_tol: float = 1e-6

	# Mask specific params.
	mask_schedule_train: str = "cosine"
	# Results in at least 40% masked tokens with cosine.
	min_masking_rate_training: float = 0.3

	# How to fuse mask at input:
	# - replace: replace token[masked] with lookup(MASK)
	# - concat: replace token[mask] with lookup(REPLACE) and concat either
	# lookup(NOMASK) or lookup(MASK).
	mask_style: str = "replace"

	# Set to >0 for CFG support.
	drop_labels_probability: float = 0.0

	fix_square_plus: bool = False

	# If True, and mixture >1, create a GMM per channel. Otherwise, create
	# a GMM of `dim`-dimensional Gaussians.
	per_channel_mixtures: bool = True

	scan: bool = False
	remat_policy: str = "nothing_saveable"

	@property
	def has_encoder(self) -> bool:
	return self.num_layers > 0

	@property
	def num_logits(self) -> int:
	if self.multivariate:
	assert self.num_mixtures == 1
	# d**2 covariance, d means.
	# Note: `round` makes pytype happy.
	return round(self.out_dim ** 2) + self.out_dim

	elif self.per_channel_mixtures:
	# One (mu, sigma, pi) per output dimension and mixture component.
	# Note that we predict a distribution for each output dimensions in
	# parallel.
	return 3 * self.num_mixtures * self.out_dim

	else:
	# Mixture weights plus mean/scale per mixture
	return self.num_mixtures + 2 * self.num_mixtures * self.out_dim

	def setup(self) -> None:
	assert self.posemb_type == "learn"
	assert self.num_mixtures > 0

	if self.multivariate and self.num_mixtures != 1:
	raise ValueError("Cannot do multivariate GMM!")

	if self.num_layers > 0:
	grid_size = np.array(self.input_size) // np.array(self.patches)

	self.pos_emb_for_encoder = vit.get_posemb(
	self, self.posemb_type, grid_size, self.emb_dim,
	"pos_embedding_encoder")

	self.conv = nn.Conv(self.emb_dim, self.patches, padding="VALID",
	strides=self.patches, name="EmbedPatches")

	self.encoder = vit.Encoder(
	depth=self.num_layers,
	mlp_dim=self.mlp_dim,
	num_heads=self.num_heads,
	dropout=self.enc_dropout_rate,
	scan=self.scan,
	remat_policy=self.remat_policy,)
	else:
	self.encoder = None

	# Iterator that will lead free label IDs.
	next_label = itertools.count(self.num_labels or 0)
	special_labels = {}

	if self.style == "ar":
	pass
	elif self.style == "masked":
	if self.mask_style == "replace":
	special_labels = {_SpecialLabel.MASK: next(next_label)}
	elif self.mask_style == "concat":
	special_labels = {
	_SpecialLabel.MASK: next(next_label),
	_SpecialLabel.NOMASK: next(next_label),
	_SpecialLabel.REPLACE: next(next_label),
	}
	else:
	raise NotImplementedError(self.mask_style)
	else:
	raise NotImplementedError(self.style)

	if self.drop_labels_probability > 0:
	special_labels[_SpecialLabel.NOLABEL] = next(next_label)

	self.special_labels = special_labels
	lookup_size = (self.num_labels or 1) + len(self.special_labels)

	self.labels_emb = nn.Embed(
	lookup_size,
	self.emb_dim,
	name="EmbedLabels",
	embedding_init=nn.initializers.zeros
	if self.zero_embedding_init
	else nn.initializers.normal(stddev=1.0),
	)

	self.targets_emb = nn.Dense(self.emb_dim, name="EmbedTargets")

	self.decoder = Decoder(
	num_layers=self.num_decoder_layers or self.num_layers,
	mlp_dim=self.mlp_dim,
	num_heads=self.num_heads,
	out_dim=self.num_logits,
	# In masked mode, we run with 1 more token at the input.
	seq_len=self.seq_len + int(self.style == "masked"),
	dropout_rate=self.dec_dropout_rate,
	emb_dim=self.emb_dim,
	zero_embedding_init=self.zero_embedding_init,
	style=self.style,
	scan=self.scan,
	remat_policy=self.remat_policy,
	)

	def encode(self, image: jax.Array, train: bool = False) -> jax.Array:
	"""Encodes input image or embeddings."""
	emb = self.conv(image)
	patch_embeddings = einops.rearrange(emb, "B PH PW E -> B (PH PW) E")
	encoded, _ = self.encoder(
	patch_embeddings + self.pos_emb_for_encoder, deterministic=not train)
	return encoded

	def embed_labels(
	self,
	labels: jax.Array \| None = None,
	batch_size: int \| None = None,
	) -> jax.Array:
	if labels is not None:
	# Embed class label, add a sequence dim (output shape (B, 1, E))
	return self.labels_emb(labels)[:, None, :]

	assert ((self.num_labels == 1 or self.num_labels is None)
	and batch_size is not None)
	# Create [BOS] token embedding
	return self.labels_emb(jnp.zeros((batch_size,), jnp.int32))[:, None, :]

	def prefill(
	self, labels=None, batch_size=None, encoded=None, drop_labels=None
	):
	labels = self._drop_labels(drop_labels, labels)
	labels_for_prefill = self.embed_labels(labels=labels, batch_size=batch_size)
	return self.decoder(
	labels_for_prefill,
	encoded=encoded,
	decode=True)

	def _decode_ar(
	self,
	targets: jax.Array,
	labels: jax.Array \| None = None,
	encoded: jax.Array \| None = None,
	decode: bool = False,
	train: bool = False,
	) -> tuple[jax.Array, Mapping[str, jax.Array]]:
	"""Autoregressive decoding."""
	targets_embedded = self.targets_emb(targets)

	if decode:
	decoder_mask = None
	else:
	decoder_mask = nn.make_causal_mask(targets[:, :, 0])
	b = targets.shape[0]
	labels_embedded = self.embed_labels(labels, b)
	assert labels_embedded.shape == (b, 1, self.emb_dim), (
	labels_embedded.shape, (b, 1, self.emb_dim))
	targets_embedded = jnp.concatenate(
	[labels_embedded, targets_embedded[:, : -1]], axis=1)

	logits, out = self.decoder(
	targets_embedded,
	encoded=encoded,
	decoder_mask=decoder_mask,
	decode=decode,
	deterministic=not train,
	return_reps=True)

	return logits, out

	def _get_special_label(self, size, label: _SpecialLabel):
	return self.labels_emb(
	jnp.full(size, self.special_labels[label], jnp.int32)
	)

	def _decode_masked(
	self,
	targets,
	input_mask,
	labels=None,
	encoded=None,
	train=False,
	):
	"""Masked decoding."""
	b, s, _ = targets.shape
	assert input_mask.shape == (b, s)

	if self.mask_style == "replace":
	targets_embedded = jnp.where(
	input_mask[:, :, None],
	self._get_special_label((b, s), _SpecialLabel.MASK),
	self.targets_emb(targets),
	)
	elif self.mask_style == "concat":
	masks = jnp.where(
	input_mask[:, :, None],
	self._get_special_label((b, s), _SpecialLabel.MASK),
	self._get_special_label((b, s), _SpecialLabel.NOMASK),
	)
	embedded_targets = self.targets_emb(targets)
	targets_embedded = jnp.where(
	input_mask[:, :, None],
	self._get_special_label((b, s), _SpecialLabel.REPLACE),
	embedded_targets,
	)
	# Only take half of each to get the right embedding size.
	targets_embedded = jnp.concatenate(
	[masks[..., ::2], targets_embedded[..., ::2]], axis=-1
	)
	else:
	raise ValueError(self.mask_style)

	labels_embedded = self.embed_labels(labels, b)
	assert labels_embedded.shape == (b, 1, self.emb_dim)
	# Note that we do not truncate the input here, so this has shape
	# (B, L+1, E).
	targets_embedded = jnp.concatenate(
	[labels_embedded, targets_embedded], axis=1)

	logits = self.decoder(
	targets_embedded,
	encoded=encoded,
	decoder_mask=None,
	decode=False,
	deterministic=not train)

	logits = logits[:, 1:, ...] # Remove class label
	assert logits.shape[:2] == (b, s)
	return logits

	def _drop_labels(self, drop_labels_mask, labels):
	if labels is None:
	return None
	if self.drop_labels_probability >= 0.999:
	logging.warning("Dropping all labels...")
	return jnp.full_like(labels, self.special_labels[_SpecialLabel.NOLABEL])
	if drop_labels_mask is None:
	return labels
	assert _SpecialLabel.NOLABEL in self.special_labels
	nolabel = jnp.full_like(
	labels, self.special_labels[_SpecialLabel.NOLABEL]
	)
	return jnp.where(drop_labels_mask, nolabel, labels)

	def decode(
	self,
	targets: jax.Array,
	labels: jax.Array \| None = None,
	encoded: jax.Array \| None = None,
	decode: bool = False,
	train: bool = False,
	max_decode_length: int \| None = None,
	input_mask: jax.Array \| None = None,
	drop_labels: jax.Array \| None = None,
	return_reps: bool = False,
	) -> jax.Array \| tuple[jax.Array, Mapping[str, jax.Array]]:
	"""Applies Transformer decoder-branch on encoded-input and target.

	Args:
	targets: target text tokens [B, L, out_dim].
	labels: optional class labes, [B].
	encoded: encoded image patches from encoder [B, P, E].
	decode: whether to prepare and use an autoregressive cache.
	train: whether it is training.
	max_decode_length: optional max length for positional embeddings.
	input_mask: If given, mask input. Required for style=="masked".
	Shape [B, L], bool tensor. True means the token will be removed
	from the input.
	drop_labels: Drop labels at corresponding locations [B].
	return_reps: whether to return intermediate representations.

	Returns:
	logits array from transformer decoder [B, L, 3 * num_mixtures * out_dim].
	"""
	del max_decode_length
	labels = self._drop_labels(drop_labels, labels)
	if self.style == "ar":
	logits, out = self._decode_ar(
	targets, labels, encoded, decode, train)
	if return_reps:
	return logits, out
	return logits
	elif self.style == "masked":
	assert not decode # Cache not supported.
	assert input_mask is not None
	assert not return_reps # Not implemented.
	return self._decode_masked(targets, input_mask, labels, encoded, train)
	else:
	raise NotImplementedError(self.style)

	def _square_plus(self, x):
	# Via https://twitter.com/jon_barron/status/1387167648669048833
	if self.fix_square_plus:
	return (x + jnp.sqrt(jnp.square(x) + 4)) / 2
	else:
	return x + jnp.sqrt(jnp.square(x) + 4) / 2

	def get_pdf(
	self,
	logits: jax.Array,
	temperature_scales: float \| None = None,
	temperature_probs: float \| None = None,
	) -> distrax.Distribution:
	assert logits.shape[-1] == self.num_logits
	if self.multivariate:
	scales = logits[..., :self.out_dim ** 2]
	locs = logits[..., self.out_dim ** 2:]
	assert locs.shape[-1] == self.out_dim
	scales = self._square_plus(scales)
	# Turn into a square matrix.
	*leading, _ = scales.shape
	scales = scales.reshape(*leading, self.out_dim, self.out_dim)
	# Make sure the diagonals are non zero.
	diag_scale_tol = jnp.eye(self.out_dim) * self.scale_tol
	scales = jnp.maximum(scales, diag_scale_tol)
	if (t := temperature_scales) is not None:
	scales = scales * t

	# Note that there is `tfd.MultivariateNormalFullCovariance`` but it just
	# calls linalg.cholesky on the covariance and then uses the
	# MultivariateNormalTri class. Using ... direcly avoids having to
	# construct a hermetian matrix.
	#
	# Note that only the lower triag part of `scales` is used by applying
	# jnp.tril. The other elements are replaced with zeros.
	#
	# Note on output shapes:
	# - .sample() -> shape (..., seq_len, out_dim)
	# - .prob() -> shape (..., seq_len).
	return distrax.MultivariateNormalTri(locs, scales)

	elif self.per_channel_mixtures:
	# [..., 3 * num_mixtures * out_dim] -> [..., 3 * out_dim, num_mixtures]
	logits = jnp.reshape(logits, logits.shape[: -1] + (-1, self.num_mixtures))
	# 3 tensors with shape [..., out_dim, num_mixtures]
	probs, locs, scales = jnp.split(logits, 3, axis=-2)
	if (t := temperature_probs) is not None:
	probs = probs * t

	# normalize mixture probabilities
	probs = nn.softmax(probs)
	scales = self._square_plus(scales)
	# threshold scale
	scales = jnp.maximum(scales, self.scale_tol)
	if (t := temperature_scales) is not None:
	scales = scales * t

	# Note on output shapes:
	# - .sample() -> shape (..., seq_len, out_dim)
	# - .prob() -> shape (..., seq_len, out_dim).
	return distrax.MixtureSameFamily(
	mixture_distribution=distrax.Categorical(probs=probs),
	components_distribution=distrax.Normal(loc=locs, scale=scales),
	)
	else:
	*shape, num_logits = logits.shape
	assert num_logits == self.num_logits, (num_logits, self.num_logits)
	prob_logits, other_logits = (
	logits[..., : self.num_mixtures],
	logits[..., self.num_mixtures :],
	)
	if (t := temperature_probs) is not None:
	prob_logits = prob_logits * t
	other_logits = jnp.reshape(
	other_logits, (*shape, self.num_mixtures, 2, self.out_dim)
	)
	locs = other_logits[..., 0, :]
	scales = self._square_plus(other_logits[..., 1, :])

	scales = jnp.maximum(scales, self.scale_tol) # Threshold scale
	if (t := temperature_scales) is not None:
	scales = scales * t

	# prob_logits has shape (b, seq_len, m)
	# locs/scales has shape (b, seq_len, m, d)
	assert prob_logits.ndim == locs.ndim - 1, (prob_logits.shape, locs.shape)
	assert locs.shape == scales.shape, (locs.shape, scales.shape)

	# Note on output shapes:
	# - .sample() -> shape (..., seq_len, out_dim)
	# - .prob() -> shape (..., seq_len,)
	# - .nll() -> shape (..., seq_len,)
	return distrax.MixtureSameFamily(
	mixture_distribution=distrax.Categorical(logits=prob_logits),
	components_distribution=distrax.MultivariateNormalDiag(
	loc=locs, scale_diag=scales
	),
	)

	def __call__(
	self,
	sequence: jax.Array,
	labels: jax.Array \| None = None,
	*,
	image: jax.Array \| None = None,
	decode: bool = False,
	input_mask: jax.Array \| None = None,
	drop_labels: jax.Array \| None = None,
	train: bool = False,
	) -> tuple[jax.Array, distrax.Distribution]:
	"""Applies Transformer model on the inputs.

	Args:
	sequence: batch of sequences [B, L].
	labels: class labels for class conditional generation [B].
	image: batch of images [B, H, W, 3].
	decode: whether to prepare and use an autoregressive cache.
	input_mask: If given, mask input. Required for style=="masked" [B, L].
	drop_labels: If given, drop labels of the corresponding batches [B].
	train: whether it is training.

	Returns:
	logits array from full transformer [B, L, out_dim].
	"""
	if self.style == "masked" and input_mask is None:
	raise ValueError("Cannot run masked model without input mask!")

	if self.encoder is not None:
	assert image is not None
	encoded = self.encode(image, train=train)
	else:
	assert image is None
	encoded = None

	logits = self.decode(sequence, labels=labels, encoded=encoded,
	decode=decode, input_mask=input_mask, train=train)
	pdf = self.get_pdf(logits)
	return logits, pdf

	def get_input_mask_training(
	self,
	rng: jax.Array,
	shape: tuple[int, int],
	) -> jax.Array \| None:
	"""Creates a random maask of shape (B, L) for training masked models."""
	if self.style == "ar":
	return None
	b, s = shape
	# Sample b values in [0, 1-min_mask_ratio].
	keep = jax.random.uniform(
	rng, shape=(b,), maxval=1.0 - self.min_masking_rate_training
	)
	mask_ratio = apply_mask_schedule(keep, self.mask_schedule_train)
	return _random_mask_with_ratios(rng, ratios=mask_ratio, seq_len=s)

	def get_input_mask_teacher_forced(
	self,
	shape: tuple[int, int],
	) -> jax.Array \| None:
	"""Creates a random maask of shape (B, L) for training masked models."""
	if self.style == "ar":
	return None
	return jnp.zeros(shape, dtype=jnp.bool_)

	def get_drop_labels(
	self,
	rng: jax.Array,
	batch_size: int,
	) -> jax.Array \| None:
	if (p := self.drop_labels_probability) > 0:
	return jax.random.uniform(rng, shape=(batch_size,)) <= p
	else:
	return None


	def load(
	init_params: Any,
	init_files: str \| Mapping[str, str],
	model_params: Any = None,
	dont_load: Sequence[str] = (),
	resample_encoder_posemb: bool = False,
	trim_decoder_posemb: bool = False,
	) -> Any:
	"""Loads params from init checkpoint and merges into init_params."""
	del model_params
	if isinstance(init_files, str):
	ckpt_params = utils.load_params(init_files)
	ckpt_params = common.merge_params(ckpt_params, init_params, dont_load)

	if resample_encoder_posemb:
	if init_params and "pos_embedding_encoder" in init_params:
	ckpt_params["pos_embedding_encoder"] = vit.resample_posemb(
	old=ckpt_params["pos_embedding_encoder"],
	new=init_params["pos_embedding_encoder"])

	if trim_decoder_posemb:
	if init_params and "pos_embedding_decoder" in init_params:
	ckpt_params["pos_embedding_decoder"] = (
	ckpt_params["pos_embedding_decoder"][
	:, :init_params["pos_embedding_decoder"].shape[1], :])

	else:
	init_files = {**init_files} # Shallow copy because we'll pop stuff off.

	enc_init = init_files.pop("encoder", None)
	if enc_init:
	ckpt_params = init_params.copy()
	vit_params = {
	"pos_embedding": ckpt_params["pos_embedding_encoder"],
	"Transformer": ckpt_params["encoder"],
	"embedding": ckpt_params["EmbedPatches"],
	}
	encoder_params = vit.load(
	vit_params, enc_init, model_cfg={},
	dont_load=dont_load)
	ckpt_params["encoder"] = encoder_params["Transformer"]
	ckpt_params["pos_embedding_encoder"] = encoder_params["pos_embedding"]
	ckpt_params["EmbedPatches"] = encoder_params["embedding"]
	else:
	raise ValueError("Only encoder init is supported: {}.".format(init_files))

	return ckpt_params