Spaces:

songweig
/

rich-text-to-image

Running on A10G

rich-text-to-image / models /attention.py

Songwei Ge

demo!

4c022fe about 1 year ago

No virus

39.3 kB

	# Copyright 2022 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import math
	import warnings
	from dataclasses import dataclass
	from typing import Optional

	import torch
	import torch.nn.functional as F
	from torch import nn

	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.models.modeling_utils import ModelMixin
	from diffusers.models.embeddings import ImagePositionalEmbeddings
	from diffusers.utils import BaseOutput
	from diffusers.utils.import_utils import is_xformers_available


	@dataclass
	class Transformer2DModelOutput(BaseOutput):
	    """
	    Container for the result of a `Transformer2DModel` forward pass.

	    Args:
	        sample (`torch.FloatTensor`):
	            For continuous inputs, a tensor of shape `(batch_size, num_channels, height, width)` holding the
	            hidden states conditioned on `encoder_hidden_states`. For discrete inputs, a tensor of shape
	            `(batch size, num_vector_embeds - 1, num_latent_pixels)` holding the probability distributions
	            for the unnoised latent pixels.
	    """

	    sample: torch.FloatTensor


	# xformers is an optional dependency: the module-level name `xformers` is
	# bound to the package when it is available and to None otherwise.
	xformers = None
	if is_xformers_available():
	    import xformers
	    import xformers.ops


	class Transformer2DModel(ModelMixin, ConfigMixin):
	    """
	    Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
	    embeddings) inputs.

	    When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
	    transformer action. Finally, reshape to image.

	    When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
	    embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
	    classes of unnoised image.

	    Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
	    image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.

	    Parameters:
	        num_attention_heads (`int`, optional, defaults to 16): The number of heads to use for multi-head attention.
	        attention_head_dim (`int`, optional, defaults to 88): The number of channels in each head.
	        in_channels (`int`, optional):
	            Pass if the input is continuous. The number of channels in the input and output.
	        num_layers (`int`, optional, defaults to 1): The number of layers of Transformer blocks to use.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        norm_num_groups (`int`, optional, defaults to 32):
	            The number of groups of the `GroupNorm` applied to continuous inputs.
	        cross_attention_dim (`int`, optional): The number of context dimensions to use.
	        attention_bias (`bool`, optional):
	            Configure if the TransformerBlocks' attention should contain a bias parameter.
	        sample_size (`int`, optional): Pass if the input is discrete. The width of the latent images.
	            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
	            `ImagePositionalEmbeddings`.
	        num_vector_embeds (`int`, optional):
	            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
	            Includes the class for the masked latent pixel.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	        num_embeds_ada_norm ( `int`, optional): Pass if at least one of the norm_layers is `AdaLayerNorm`.
	            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
	            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
	            up to but not more than steps than `num_embeds_ada_norm`.
	        use_linear_projection (`bool`, optional, defaults to `False`):
	            For continuous inputs, use `nn.Linear` input/output projections (applied after flattening the spatial
	            dimensions) instead of 1x1 convolutions.
	        only_cross_attention (`bool`, optional, defaults to `False`):
	            Forwarded to every `BasicTransformerBlock`.
	    """

	    @register_to_config
	    def __init__(
	        self,
	        num_attention_heads: int = 16,
	        attention_head_dim: int = 88,
	        in_channels: Optional[int] = None,
	        num_layers: int = 1,
	        dropout: float = 0.0,
	        norm_num_groups: int = 32,
	        cross_attention_dim: Optional[int] = None,
	        attention_bias: bool = False,
	        sample_size: Optional[int] = None,
	        num_vector_embeds: Optional[int] = None,
	        activation_fn: str = "geglu",
	        num_embeds_ada_norm: Optional[int] = None,
	        use_linear_projection: bool = False,
	        only_cross_attention: bool = False,
	    ):
	        super().__init__()
	        self.use_linear_projection = use_linear_projection
	        self.num_attention_heads = num_attention_heads
	        self.attention_head_dim = attention_head_dim
	        inner_dim = num_attention_heads * attention_head_dim

	        # 1. Transformer2DModel can process both standard continuous images of shape
	        # `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape
	        # `(batch_size, num_image_vectors)`.
	        # Define whether input is continuous or discrete depending on configuration
	        self.is_input_continuous = in_channels is not None
	        self.is_input_vectorized = num_vector_embeds is not None

	        if self.is_input_continuous and self.is_input_vectorized:
	            raise ValueError(
	                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
	                " sure that either `in_channels` or `num_vector_embeds` is None."
	            )
	        elif not self.is_input_continuous and not self.is_input_vectorized:
	            raise ValueError(
	                f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
	                " sure that either `in_channels` or `num_vector_embeds` is not None."
	            )

	        # 2. Define input layers
	        if self.is_input_continuous:
	            self.in_channels = in_channels

	            self.norm = torch.nn.GroupNorm(
	                num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
	            if use_linear_projection:
	                self.proj_in = nn.Linear(in_channels, inner_dim)
	            else:
	                self.proj_in = nn.Conv2d(
	                    in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
	        elif self.is_input_vectorized:
	            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
	            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"

	            self.height = sample_size
	            self.width = sample_size
	            self.num_vector_embeds = num_vector_embeds
	            self.num_latent_pixels = self.height * self.width

	            self.latent_image_embedding = ImagePositionalEmbeddings(
	                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
	            )

	        # 3. Define transformers blocks
	        self.transformer_blocks = nn.ModuleList(
	            [
	                BasicTransformerBlock(
	                    inner_dim,
	                    num_attention_heads,
	                    attention_head_dim,
	                    dropout=dropout,
	                    cross_attention_dim=cross_attention_dim,
	                    activation_fn=activation_fn,
	                    num_embeds_ada_norm=num_embeds_ada_norm,
	                    attention_bias=attention_bias,
	                    only_cross_attention=only_cross_attention,
	                )
	                for _ in range(num_layers)
	            ]
	        )

	        # 4. Define output layers
	        if self.is_input_continuous:
	            if use_linear_projection:
	                # Map the transformer width back to the image channels. The
	                # weight has the same shape as before whenever
	                # inner_dim == in_channels, which is the only configuration
	                # the previous (reversed) argument order could run with.
	                self.proj_out = nn.Linear(inner_dim, in_channels)
	            else:
	                self.proj_out = nn.Conv2d(
	                    inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
	        elif self.is_input_vectorized:
	            self.norm_out = nn.LayerNorm(inner_dim)
	            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)

	    def _set_attention_slice(self, slice_size):
	        """Forward `slice_size` to every transformer block."""
	        for block in self.transformer_blocks:
	            block._set_attention_slice(slice_size)

	    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None,
	                text_format_dict=None, return_dict: bool = True):
	        """
	        Args:
	            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
	                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
	                hidden_states
	            encoder_hidden_states (`torch.Tensor`, optional):
	                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
	                self-attention.
	            timestep ( `torch.long`, optional):
	                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
	            text_format_dict (`dict`, optional):
	                Extra keyword arguments passed unchanged to every transformer block. `None` (the default) is
	                treated as an empty dict.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~models.attention.Transformer2DModelOutput`] instead of a plain tuple.

	        Returns:
	            [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
	            if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
	            tensor.
	        """
	        # A fresh dict per call avoids sharing one mutable default between calls.
	        if text_format_dict is None:
	            text_format_dict = {}

	        # 1. Input
	        if self.is_input_continuous:
	            batch, channel, height, width = hidden_states.shape
	            residual = hidden_states

	            hidden_states = self.norm(hidden_states)
	            if not self.use_linear_projection:
	                # Conv projection acts on the image layout, then flatten to (batch, tokens, inner_dim).
	                hidden_states = self.proj_in(hidden_states)
	                inner_dim = hidden_states.shape[1]
	                hidden_states = hidden_states.permute(
	                    0, 2, 3, 1).reshape(batch, height * width, inner_dim)
	            else:
	                # Linear projection acts on the last axis, so flatten first.
	                hidden_states = hidden_states.permute(
	                    0, 2, 3, 1).reshape(batch, height * width, channel)
	                hidden_states = self.proj_in(hidden_states)
	        elif self.is_input_vectorized:
	            hidden_states = self.latent_image_embedding(hidden_states)

	        # 2. Blocks
	        for block in self.transformer_blocks:
	            hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep,
	                                  text_format_dict=text_format_dict)

	        # 3. Output
	        if self.is_input_continuous:
	            if not self.use_linear_projection:
	                hidden_states = (
	                    hidden_states.reshape(batch, height, width, inner_dim).permute(
	                        0, 3, 1, 2).contiguous()
	                )
	                hidden_states = self.proj_out(hidden_states)
	            else:
	                # proj_out already restored the channel count of the input.
	                hidden_states = self.proj_out(hidden_states)
	                hidden_states = (
	                    hidden_states.reshape(batch, height, width, channel).permute(
	                        0, 3, 1, 2).contiguous()
	                )

	            output = hidden_states + residual
	        elif self.is_input_vectorized:
	            hidden_states = self.norm_out(hidden_states)
	            logits = self.out(hidden_states)
	            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
	            logits = logits.permute(0, 2, 1)

	            # log(p(x_0))
	            output = F.log_softmax(logits.double(), dim=1).float()

	        if not return_dict:
	            return (output,)

	        return Transformer2DModelOutput(sample=output)

	    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
	        """Forward the xformers on/off flag to every transformer block."""
	        for block in self.transformer_blocks:
	            block._set_use_memory_efficient_attention_xformers(
	                use_memory_efficient_attention_xformers)


	class AttentionBlock(nn.Module):
	    """
	    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
	    to the N-d case.
	    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
	    Uses three q, k, v linear layers to compute attention.

	    Parameters:
	        channels (`int`): The number of channels in the input and output.
	        num_head_channels (`int`, optional):
	            The number of channels in each head. If None, then `num_heads` = 1.
	        norm_num_groups (`int`, optional, defaults to 32): The number of groups to use for group norm.
	        rescale_output_factor (`float`, optional, defaults to 1.0): The factor to rescale the output by.
	        eps (`float`, optional, defaults to 1e-5): The epsilon value to use for group norm.
	    """

	    def __init__(
	        self,
	        channels: int,
	        num_head_channels: Optional[int] = None,
	        norm_num_groups: int = 32,
	        rescale_output_factor: float = 1.0,
	        eps: float = 1e-5,
	    ):
	        super().__init__()
	        self.channels = channels

	        self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
	        self.num_head_size = num_head_channels
	        self.group_norm = nn.GroupNorm(
	            num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)

	        # define q,k,v as linear layers
	        self.query = nn.Linear(channels, channels)
	        self.key = nn.Linear(channels, channels)
	        self.value = nn.Linear(channels, channels)

	        self.rescale_output_factor = rescale_output_factor
	        # The third positional argument of nn.Linear is `bias`; spell it out
	        # so it cannot be mistaken for a kernel size.
	        self.proj_attn = nn.Linear(channels, channels, bias=True)

	    def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
	        """Split the last axis into heads: (B, T, H * D) -> (B, H, T, D)."""
	        new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
	        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
	        new_projection = projection.view(
	            new_projection_shape).permute(0, 2, 1, 3)
	        return new_projection

	    def forward(self, hidden_states):
	        """
	        Apply group norm, self-attention over the flattened spatial positions and a residual connection.

	        Args:
	            hidden_states (`torch.Tensor`): Input of shape `(batch, channel, height, width)`.

	        Returns:
	            `torch.Tensor` of the same shape: `(attention_output + input) / rescale_output_factor`.
	        """
	        residual = hidden_states
	        batch, channel, height, width = hidden_states.shape

	        # norm
	        hidden_states = self.group_norm(hidden_states)

	        # (B, C, H, W) -> (B, H * W, C): one token per spatial position
	        hidden_states = hidden_states.view(
	            batch, channel, height * width).transpose(1, 2)

	        # proj to q, k, v
	        query_proj = self.query(hidden_states)
	        key_proj = self.key(hidden_states)
	        value_proj = self.value(hidden_states)

	        # 1 / sqrt(per-head channel count)
	        scale = 1 / math.sqrt(self.channels / self.num_heads)

	        # get scores
	        if self.num_heads > 1:
	            query_states = self.transpose_for_scores(query_proj)
	            key_states = self.transpose_for_scores(key_proj)
	            value_states = self.transpose_for_scores(value_proj)

	            # TODO: is there a way to perform batched matmul (e.g. baddbmm) on 4D tensors?
	            #       or reformulate this into a 3D problem?
	            # TODO: measure whether on MPS device it would be faster to do this matmul via einsum
	            #       as some matmuls can be 1.94x slower than an equivalent einsum on MPS
	            #       https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
	            attention_scores = torch.matmul(
	                query_states, key_states.transpose(-1, -2)) * scale
	        else:
	            query_states, key_states, value_states = query_proj, key_proj, value_proj

	            # beta=0 means the uninitialised `torch.empty` input is ignored;
	            # the call computes alpha * (query @ key^T).
	            attention_scores = torch.baddbmm(
	                torch.empty(
	                    query_states.shape[0],
	                    query_states.shape[1],
	                    key_states.shape[1],
	                    dtype=query_states.dtype,
	                    device=query_states.device,
	                ),
	                query_states,
	                key_states.transpose(-1, -2),
	                beta=0,
	                alpha=scale,
	            )

	        # softmax is evaluated in float32 and cast back to the score dtype
	        attention_probs = torch.softmax(
	            attention_scores.float(), dim=-1).type(attention_scores.dtype)

	        # compute attention output
	        if self.num_heads > 1:
	            # TODO: is there a way to perform batched matmul (e.g. bmm) on 4D tensors?
	            #       or reformulate this into a 3D problem?
	            # TODO: measure whether on MPS device it would be faster to do this matmul via einsum
	            #       as some matmuls can be 1.94x slower than an equivalent einsum on MPS
	            #       https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
	            hidden_states = torch.matmul(attention_probs, value_states)
	            # (B, H, T, D) -> (B, T, H, D) -> (B, T, C)
	            hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
	            new_hidden_states_shape = hidden_states.size()[
	                :-2] + (self.channels,)
	            hidden_states = hidden_states.view(new_hidden_states_shape)
	        else:
	            hidden_states = torch.bmm(attention_probs, value_states)

	        # compute next hidden_states
	        hidden_states = self.proj_attn(hidden_states)
	        hidden_states = hidden_states.transpose(
	            -1, -2).reshape(batch, channel, height, width)

	        # res connect and rescale
	        hidden_states = (hidden_states + residual) / self.rescale_output_factor
	        return hidden_states


	class BasicTransformerBlock(nn.Module):
	    r"""
	    A basic Transformer block: attention (`attn1`), cross-attention (`attn2`) and a feed-forward layer, each
	    applied to a normalised input and followed by a residual connection.

	    Parameters:
	        dim (`int`): The number of channels in the input and output.
	        num_attention_heads (`int`): The number of heads to use for multi-head attention.
	        attention_head_dim (`int`): The number of channels in each head.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        cross_attention_dim (`int`, optional): The size of the context vector for cross attention.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	        num_embeds_ada_norm (`int`, optional):
	            The number of diffusion steps used during training. See `Transformer2DModel`. When given, `norm1` and
	            `norm2` are `AdaLayerNorm` layers that also receive the timestep.
	        attention_bias (`bool`, optional, defaults to `False`):
	            Configure if the attentions should contain a bias parameter.
	        only_cross_attention (`bool`, optional, defaults to `False`):
	            When `True`, `attn1` is built with `cross_attention_dim` and is called with the context as well.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_attention_heads: int,
	        attention_head_dim: int,
	        dropout=0.0,
	        cross_attention_dim: Optional[int] = None,
	        activation_fn: str = "geglu",
	        num_embeds_ada_norm: Optional[int] = None,
	        attention_bias: bool = False,
	        only_cross_attention: bool = False,
	    ):
	        super().__init__()
	        self.only_cross_attention = only_cross_attention
	        self.attn1 = CrossAttention(
	            query_dim=dim,
	            heads=num_attention_heads,
	            dim_head=attention_head_dim,
	            dropout=dropout,
	            bias=attention_bias,
	            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
	        )  # is a self-attention unless only_cross_attention is set
	        self.ff = FeedForward(dim, dropout=dropout,
	                              activation_fn=activation_fn)
	        self.attn2 = CrossAttention(
	            query_dim=dim,
	            cross_attention_dim=cross_attention_dim,
	            heads=num_attention_heads,
	            dim_head=attention_head_dim,
	            dropout=dropout,
	            bias=attention_bias,
	        )  # is self-attn if context is none

	        # layer norms
	        self.use_ada_layer_norm = num_embeds_ada_norm is not None
	        if self.use_ada_layer_norm:
	            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
	            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
	        else:
	            self.norm1 = nn.LayerNorm(dim)
	            self.norm2 = nn.LayerNorm(dim)
	        self.norm3 = nn.LayerNorm(dim)

	        # if xformers is installed try to use memory_efficient_attention by default
	        if is_xformers_available():
	            try:
	                self._set_use_memory_efficient_attention_xformers(True)
	            except Exception as e:
	                # Best effort only: fall back to regular attention.
	                warnings.warn(
	                    "Could not enable memory efficient attention. Make sure xformers is installed"
	                    f" correctly and a GPU is available: {e}"
	                )

	    def _set_attention_slice(self, slice_size):
	        """Store `slice_size` on both attention layers."""
	        self.attn1._slice_size = slice_size
	        self.attn2._slice_size = slice_size

	    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
	        """
	        Turn xformers memory efficient attention on or off for both attention layers.

	        Raises:
	            ModuleNotFoundError: when enabling and xformers is not available.
	            ValueError: when enabling and CUDA is not available.
	            Exception: whatever the xformers probe call raises when enabling.
	        """
	        # The environment only has to be validated when the feature is being
	        # switched on; switching it off must always be possible.
	        if use_memory_efficient_attention_xformers:
	            if not is_xformers_available():
	                raise ModuleNotFoundError(
	                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
	                    " xformers",
	                    name="xformers",
	                )
	            if not torch.cuda.is_available():
	                raise ValueError(
	                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
	                    " available for GPU "
	                )
	            # Make sure we can run the memory efficient attention
	            _ = xformers.ops.memory_efficient_attention(
	                torch.randn((1, 2, 40), device="cuda"),
	                torch.randn((1, 2, 40), device="cuda"),
	                torch.randn((1, 2, 40), device="cuda"),
	            )
	        self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
	        self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers

	    def forward(self, hidden_states, context=None, timestep=None, text_format_dict=None):
	        """
	        Args:
	            hidden_states: Input tokens; the output has the same shape.
	            context (optional): Conditioning passed to `attn2` (and to `attn1` when `only_cross_attention`).
	            timestep (optional): Passed to `norm1`/`norm2` when they are `AdaLayerNorm` layers.
	            text_format_dict (`dict`, optional):
	                Extra keyword arguments forwarded to the attention layers that receive the context. `None`
	                (the default) is treated as an empty dict.

	        Returns:
	            The updated hidden states.
	        """
	        # A fresh dict per call avoids sharing one mutable default between calls.
	        if text_format_dict is None:
	            text_format_dict = {}

	        # 1. Self-Attention
	        norm_hidden_states = (
	            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(
	                hidden_states)
	        )

	        # The attention layers return `(output, attention_probs)`; the
	        # residual is added exactly once, after unpacking.
	        if self.only_cross_attention:
	            attn_out, _ = self.attn1(
	                norm_hidden_states, context, text_format_dict=text_format_dict)
	        else:
	            attn_out, _ = self.attn1(norm_hidden_states)
	        hidden_states = attn_out + hidden_states

	        # 2. Cross-Attention
	        norm_hidden_states = (
	            self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(
	                hidden_states)
	        )
	        attn_out, _ = self.attn2(
	            norm_hidden_states, context=context, text_format_dict=text_format_dict)
	        hidden_states = attn_out + hidden_states

	        # 3. Feed-forward
	        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

	        return hidden_states


	class CrossAttention(nn.Module):
	    r"""
	    A cross attention layer.

	    Parameters:
	        query_dim (`int`): The number of channels in the query.
	        cross_attention_dim (`int`, optional):
	            The number of channels in the context. If not given, defaults to `query_dim`.
	        heads (`int`,  optional, defaults to 8): The number of heads to use for multi-head attention.
	        dim_head (`int`,  optional, defaults to 64): The number of channels in each head.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        bias (`bool`, optional, defaults to False):
	            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
	    """

	    def __init__(
	        self,
	        query_dim: int,
	        cross_attention_dim: Optional[int] = None,
	        heads: int = 8,
	        dim_head: int = 64,
	        dropout: float = 0.0,
	        bias=False,
	    ):
	        super().__init__()
	        inner_dim = dim_head * heads
	        # Remember whether an explicit context width was configured; the
	        # font-size reweighting in `_attention` is only applied in that case.
	        self.is_cross_attn = cross_attention_dim is not None
	        cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim

	        self.scale = dim_head**-0.5
	        self.heads = heads
	        # Set through `_set_attention_slice` of the owning block. This class
	        # has no sliced implementation, so the value is stored but `forward`
	        # always computes the full attention.
	        self._slice_size = None
	        self._use_memory_efficient_attention_xformers = False

	        self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
	        self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
	        self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)

	        self.to_out = nn.ModuleList([])
	        self.to_out.append(nn.Linear(inner_dim, query_dim))
	        self.to_out.append(nn.Dropout(dropout))

	    def reshape_heads_to_batch_dim(self, tensor):
	        """(batch, seq, heads * d) -> (batch * heads, seq, d)."""
	        batch_size, seq_len, dim = tensor.shape
	        head_size = self.heads
	        tensor = tensor.reshape(batch_size, seq_len,
	                                head_size, dim // head_size)
	        tensor = tensor.permute(0, 2, 1, 3).reshape(
	            batch_size * head_size, seq_len, dim // head_size)
	        return tensor

	    def reshape_batch_dim_to_heads(self, tensor):
	        """(batch * heads, seq, d) -> (batch, seq, heads * d)."""
	        batch_size, seq_len, dim = tensor.shape
	        head_size = self.heads
	        tensor = tensor.reshape(batch_size // head_size,
	                                head_size, seq_len, dim)
	        tensor = tensor.permute(0, 2, 1, 3).reshape(
	            batch_size // head_size, seq_len, dim * head_size)
	        return tensor

	    def reshape_batch_dim_to_heads_and_average(self, tensor):
	        """(batch * heads, seq, seq2) -> (batch, seq, seq2), averaged over the heads."""
	        batch_size, seq_len, seq_len2 = tensor.shape
	        head_size = self.heads
	        tensor = tensor.reshape(batch_size // head_size,
	                                head_size, seq_len, seq_len2)
	        return tensor.mean(1)

	    def forward(self, hidden_states, context=None, mask=None, text_format_dict=None):
	        """
	        Args:
	            hidden_states: Query tokens of shape `(batch, seq, query_dim)`.
	            context (optional): Key/value tokens; defaults to `hidden_states` (self-attention).
	            mask (optional): Accepted for interface compatibility; not used.
	            text_format_dict (mapping, optional):
	                Keyword arguments unpacked into `_attention` (it reads `word_pos` and `font_size`). `None`
	                (the default) is treated as an empty dict.

	        Returns:
	            `(hidden_states, attn_probs)`: the projected attention output and the head-averaged attention
	            probabilities. `attn_probs` is `None` when the xformers kernel was used, because that kernel does
	            not expose the probabilities.
	        """
	        # A fresh dict per call avoids sharing one mutable default between calls.
	        if text_format_dict is None:
	            text_format_dict = {}

	        query = self.to_q(hidden_states)
	        context = context if context is not None else hidden_states
	        key = self.to_k(context)
	        value = self.to_v(context)

	        query = self.reshape_heads_to_batch_dim(query)
	        key = self.reshape_heads_to_batch_dim(key)
	        value = self.reshape_heads_to_batch_dim(value)

	        # Same condition as the reweighting branch of `_attention`. The fused
	        # xformers kernel cannot rescale individual tokens, so the explicit
	        # implementation is used whenever reweighting is requested.
	        needs_reweighting = (
	            self.is_cross_attn
	            and text_format_dict.get("word_pos") is not None
	            and text_format_dict.get("font_size") is not None
	        )

	        # attention, what we cannot get enough of
	        if self._use_memory_efficient_attention_xformers and not needs_reweighting:
	            hidden_states = self._memory_efficient_attention_xformers(
	                query, key, value)
	            # Some versions of xformers return output in fp32, cast it back to the dtype of the input
	            hidden_states = hidden_states.to(query.dtype)
	            attn_probs = None
	        else:
	            # `_slice_size` is deliberately ignored: only the full attention
	            # function is implemented here.
	            hidden_states, attn_probs = self._attention(
	                query, key, value, **text_format_dict)

	        # linear proj
	        hidden_states = self.to_out[0](hidden_states)
	        # dropout
	        hidden_states = self.to_out[1](hidden_states)
	        return hidden_states, attn_probs

	    def _qk(self, query, key):
	        """Return `self.scale * query @ key^T` for head-batched `query` and `key`."""
	        # beta=0 makes baddbmm ignore the uninitialised `torch.empty` input.
	        return torch.baddbmm(
	            torch.empty(query.shape[0], query.shape[1], key.shape[1],
	                        dtype=query.dtype, device=query.device),
	            query,
	            key.transpose(-1, -2),
	            beta=0,
	            alpha=self.scale,
	        )

	    def _attention(self, query, key, value, word_pos=None, font_size=None,
	                   **kwargs):
	        """
	        Explicit attention with optional per-token reweighting.

	        Args:
	            query, key, value: Head-batched tensors of shape `(batch * heads, seq, d)`.
	            word_pos (optional): Index into the key axis selecting the tokens to reweight.
	            font_size (optional): Tensor of weights applied to the selected tokens: its absolute value scales
	                the unnormalised attention weight and its sign is applied to the normalised probability.
	            **kwargs: Ignored; lets callers unpack a larger format dict.

	        Returns:
	            `(hidden_states, attention_probs)` with shapes `(batch, seq, heads * d)` and
	            `(batch, seq, key_seq)` (probabilities averaged over the heads).
	        """
	        attention_scores = self._qk(query, key)

	        # Font size:
	        if self.is_cross_attn and word_pos is not None and font_size is not None:
	            # presumably the 77-token CLIP prompt length — confirm against the caller
	            assert key.shape[1] == 77
	            # Subtracting the row maximum leaves the normalised result
	            # unchanged but keeps exp() from overflowing in half precision.
	            attention_score_exp = (
	                attention_scores - attention_scores.amax(dim=-1, keepdim=True)
	            ).exp()
	            font_size_abs, font_size_sign = font_size.abs(), font_size.sign()
	            attention_score_exp[:, :, word_pos] = attention_score_exp[:, :, word_pos].clone(
	            )*font_size_abs
	            attention_probs = attention_score_exp / \
	                attention_score_exp.sum(-1, True)
	            attention_probs[:, :, word_pos] *= font_size_sign
	        else:
	            attention_probs = attention_scores.softmax(dim=-1)

	        hidden_states = torch.bmm(attention_probs, value)

	        # reshape hidden_states
	        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
	        attention_probs = self.reshape_batch_dim_to_heads_and_average(
	            attention_probs)
	        return hidden_states, attention_probs

	    def _memory_efficient_attention_xformers(self, query, key, value):
	        """Run the xformers attention kernel and merge the heads back into the channel axis."""
	        query = query.contiguous()
	        key = key.contiguous()
	        value = value.contiguous()
	        hidden_states = xformers.ops.memory_efficient_attention(
	            query, key, value, attn_bias=None)
	        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
	        return hidden_states


	class FeedForward(nn.Module):
	    r"""
	    A feed-forward layer: activation projection, dropout, then a linear output projection.

	    Parameters:
	        dim (`int`): The number of channels in the input.
	        dim_out (`int`, optional): The number of channels in the output. If not given, defaults to `dim`.
	        mult (`int`, optional, defaults to 4): The multiplier to use for the hidden dimension.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        activation_fn (`str`, optional, defaults to `"geglu"`):
	            Activation function to be used in feed-forward; either `"geglu"` or `"geglu-approximate"`.

	    Raises:
	        ValueError: If `activation_fn` is not one of the supported names.
	    """

	    def __init__(
	        self,
	        dim: int,
	        dim_out: Optional[int] = None,
	        mult: int = 4,
	        dropout: float = 0.0,
	        activation_fn: str = "geglu",
	    ):
	        super().__init__()
	        inner_dim = int(dim * mult)
	        dim_out = dim_out if dim_out is not None else dim

	        if activation_fn == "geglu":
	            geglu = GEGLU(dim, inner_dim)
	        elif activation_fn == "geglu-approximate":
	            geglu = ApproximateGELU(dim, inner_dim)
	        else:
	            # Fail with a clear message instead of an UnboundLocalError below.
	            raise ValueError(
	                f"Unsupported `activation_fn`: {activation_fn!r}. Expected 'geglu' or 'geglu-approximate'."
	            )

	        self.net = nn.ModuleList([])
	        # project in
	        self.net.append(geglu)
	        # project dropout
	        self.net.append(nn.Dropout(dropout))
	        # project out
	        self.net.append(nn.Linear(inner_dim, dim_out))

	    def forward(self, hidden_states):
	        """Apply the modules of `self.net` in order and return the result."""
	        for module in self.net:
	            hidden_states = module(hidden_states)
	        return hidden_states


	# feedforward
	class GEGLU(nn.Module):
	    r"""
	    Gated linear unit with a GELU gate, a variant described in https://arxiv.org/abs/2002.05202.

	    A single linear layer produces `2 * dim_out` features; the first half is the value and the second half is
	    the gate.

	    Parameters:
	        dim_in (`int`): The number of channels in the input.
	        dim_out (`int`): The number of channels in the output.
	    """

	    def __init__(self, dim_in: int, dim_out: int):
	        super().__init__()
	        self.proj = nn.Linear(dim_in, dim_out * 2)

	    def gelu(self, gate):
	        """Apply GELU to `gate`, going through float32 on `mps` devices."""
	        if gate.device.type == "mps":
	            # mps: gelu is not implemented for float16
	            upcast = gate.to(dtype=torch.float32)
	            return F.gelu(upcast).to(dtype=gate.dtype)
	        return F.gelu(gate)

	    def forward(self, hidden_states):
	        """Project the input, split it into value and gate, and return `value * gelu(gate)`."""
	        projected = self.proj(hidden_states)
	        value, gate = projected.chunk(2, dim=-1)
	        return value * self.gelu(gate)


	class ApproximateGELU(nn.Module):
	    """
	    Linear projection followed by the sigmoid approximation of the Gaussian Error Linear Unit (GELU),
	    `x * sigmoid(1.702 * x)`.

	    For more details, see section 2: https://arxiv.org/abs/1606.08415
	    """

	    def __init__(self, dim_in: int, dim_out: int):
	        super().__init__()
	        self.proj = nn.Linear(dim_in, dim_out)

	    def forward(self, x):
	        """Return `y * sigmoid(1.702 * y)` where `y = self.proj(x)`."""
	        projected = self.proj(x)
	        gate = torch.sigmoid(1.702 * projected)
	        return projected * gate


	class AdaLayerNorm(nn.Module):
	    """
	    Norm layer modified to incorporate timestep embeddings.

	    The timestep is looked up in an embedding table and mapped to a `scale` and a `shift` vector, which modulate
	    the output of a `LayerNorm` without learnable affine parameters.

	    Parameters:
	        embedding_dim (`int`): Size of the normalised (last) axis.
	        num_embeddings (`int`): Number of distinct timesteps in the embedding table.
	    """

	    def __init__(self, embedding_dim, num_embeddings):
	        super().__init__()
	        self.emb = nn.Embedding(num_embeddings, embedding_dim)
	        self.silu = nn.SiLU()
	        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
	        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)

	    def forward(self, x, timestep):
	        """
	        Args:
	            x: Tensor whose last axis has size `embedding_dim`.
	            timestep: Integer tensor. Either 0-d (one timestep shared by the whole batch) or 1-d with one
	                timestep per sample of `x`.

	        Returns:
	            `norm(x) * (1 + scale) + shift`, same shape as `x`.
	        """
	        emb = self.linear(self.silu(self.emb(timestep)))
	        # Split along the feature axis. For a 0-d timestep this is the same
	        # as the default dim=0; for batched timesteps dim=0 would split the
	        # batch instead of the features.
	        scale, shift = torch.chunk(emb, 2, dim=-1)
	        if scale.dim() == 2:
	            # (batch, dim) -> (batch, 1, ..., 1, dim) so each sample's
	            # modulation broadcasts over that sample's tokens.
	            expanded = (scale.shape[0],) + (1,) * (x.dim() - 2) + (scale.shape[1],)
	            scale = scale.reshape(expanded)
	            shift = shift.reshape(expanded)
	        x = self.norm(x) * (1 + scale) + shift
	        return x


	class DualTransformer2DModel(nn.Module):
	    """
	    Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.

	    Parameters:
	        num_attention_heads (`int`, optional, defaults to 16): The number of heads to use for multi-head attention.
	        attention_head_dim (`int`, optional, defaults to 88): The number of channels in each head.
	        in_channels (`int`, optional):
	            Pass if the input is continuous. The number of channels in the input and output.
	        num_layers (`int`, optional, defaults to 1): The number of layers of Transformer blocks to use.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        norm_num_groups (`int`, optional, defaults to 32): Forwarded to both `Transformer2DModel`s.
	        cross_attention_dim (`int`, optional): The number of context dimensions to use.
	        attention_bias (`bool`, optional):
	            Configure if the TransformerBlocks' attention should contain a bias parameter.
	        sample_size (`int`, optional): Pass if the input is discrete. The width of the latent images.
	            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
	            `ImagePositionalEmbeddings`.
	        num_vector_embeds (`int`, optional):
	            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
	            Includes the class for the masked latent pixel.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	        num_embeds_ada_norm ( `int`, optional): Pass if at least one of the norm_layers is `AdaLayerNorm`.
	            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
	            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
	            up to but not more than steps than `num_embeds_ada_norm`.
	    """

	    def __init__(
	        self,
	        num_attention_heads: int = 16,
	        attention_head_dim: int = 88,
	        in_channels: Optional[int] = None,
	        num_layers: int = 1,
	        dropout: float = 0.0,
	        norm_num_groups: int = 32,
	        cross_attention_dim: Optional[int] = None,
	        attention_bias: bool = False,
	        sample_size: Optional[int] = None,
	        num_vector_embeds: Optional[int] = None,
	        activation_fn: str = "geglu",
	        num_embeds_ada_norm: Optional[int] = None,
	    ):
	        super().__init__()
	        self.transformers = nn.ModuleList(
	            [
	                Transformer2DModel(
	                    num_attention_heads=num_attention_heads,
	                    attention_head_dim=attention_head_dim,
	                    in_channels=in_channels,
	                    num_layers=num_layers,
	                    dropout=dropout,
	                    norm_num_groups=norm_num_groups,
	                    cross_attention_dim=cross_attention_dim,
	                    attention_bias=attention_bias,
	                    sample_size=sample_size,
	                    num_vector_embeds=num_vector_embeds,
	                    activation_fn=activation_fn,
	                    num_embeds_ada_norm=num_embeds_ada_norm,
	                )
	                for _ in range(2)
	            ]
	        )

	        # Variables that can be set by a pipeline:

	        # The ratio of transformer1 to transformer2's output states to be combined during inference
	        self.mix_ratio = 0.5

	        # The shape of `encoder_hidden_states` is expected to be
	        # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
	        self.condition_lengths = [77, 257]

	        # Which transformer to use to encode which condition.
	        # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
	        self.transformer_index_for_condition = [1, 0]

	    def forward(self, hidden_states, encoder_hidden_states, timestep=None, return_dict: bool = True):
	        """
	        Args:
	            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
	                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
	                hidden_states
	            encoder_hidden_states (`torch.Tensor`):
	                Both conditions concatenated along the token axis (axis 1); the first `condition_lengths[0]`
	                tokens form the first condition and the next `condition_lengths[1]` tokens the second.
	            timestep ( `torch.long`, optional):
	                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~models.attention.Transformer2DModelOutput`] instead of a plain tuple.

	        Returns:
	            [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
	            if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
	            tensor.
	        """
	        input_states = hidden_states

	        encoded_states = []
	        tokens_start = 0
	        for i in range(2):
	            # for each of the two transformers, pass the corresponding condition tokens
	            condition_state = encoder_hidden_states[:,
	                                                    tokens_start: tokens_start + self.condition_lengths[i]]
	            transformer_index = self.transformer_index_for_condition[i]
	            # Keyword arguments are required: the fourth positional
	            # parameter of `Transformer2DModel.forward` is `text_format_dict`,
	            # not `return_dict`. A plain tuple is requested because only the
	            # sample tensor is needed here.
	            encoded_state = self.transformers[transformer_index](
	                input_states,
	                encoder_hidden_states=condition_state,
	                timestep=timestep,
	                return_dict=False,
	            )[0]
	            # keep only each transformer's contribution on top of the input
	            encoded_states.append(encoded_state - input_states)
	            tokens_start += self.condition_lengths[i]

	        output_states = encoded_states[0] * self.mix_ratio + \
	            encoded_states[1] * (1 - self.mix_ratio)
	        output_states = output_states + input_states

	        if not return_dict:
	            return (output_states,)

	        return Transformer2DModelOutput(sample=output_states)

	    def _set_attention_slice(self, slice_size):
	        """Forward `slice_size` to both wrapped transformers."""
	        for transformer in self.transformers:
	            transformer._set_attention_slice(slice_size)

	    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
	        """Forward the xformers on/off flag to both wrapped transformers."""
	        for transformer in self.transformers:
	            transformer._set_use_memory_efficient_attention_xformers(
	                use_memory_efficient_attention_xformers)