Spaces:

amphion
/

Text-to-Speech

Running

App Files Files Community

Text-to-Speech / modules /diffusion /unet /attention.py

zyingt

Upload 685 files

0d80816 about 1 year ago

raw

history blame

8.65 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from modules.general.utils import Conv1d, normalization, zero_module
	from .basic import UNetBlock


	class AttentionBlock(UNetBlock):
	r"""A spatial transformer encoder block that allows spatial positions to attend
	to each other. Reference from `latent diffusion repo
	<https://github.com/Stability-AI/generative-models/blob/main/sgm/modules/attention.py#L531>`_.

	Args:
	channels: Number of channels in the input.
	num_head_channels: Number of channels per attention head.
	num_heads: Number of attention heads. Overrides ``num_head_channels`` if set.
	encoder_channels: Number of channels in the encoder output for cross-attention.
	If ``None``, then self-attention is performed.
	use_self_attention: Whether to use self-attention before cross-attention, only applicable if encoder_channels is set.
	dims: Number of spatial dimensions, i.e. 1 for temporal signals, 2 for images.
	h_dim: The dimension of the height, would be applied if ``dims`` is 2.
	encoder_hdim: The dimension of the height of the encoder output, would be applied if ``dims`` is 2.
	p_dropout: Dropout probability.
	"""

	def __init__(
	self,
	channels: int,
	num_head_channels: int = 32,
	num_heads: int = -1,
	encoder_channels: int = None,
	use_self_attention: bool = False,
	dims: int = 1,
	h_dim: int = 100,
	encoder_hdim: int = 384,
	p_dropout: float = 0.0,
	):
	super().__init__()

	self.channels = channels
	self.p_dropout = p_dropout
	self.dims = dims

	if dims == 1:
	self.channels = channels
	elif dims == 2:
	# We consider the channel as product of channel and height, i.e. C x H
	# This is because we want to apply attention on the audio signal, which is 1D
	self.channels = channels * h_dim
	else:
	raise ValueError(f"invalid number of dimensions: {dims}")

	if num_head_channels == -1:
	assert (
	self.channels % num_heads == 0
	), f"q,k,v channels {self.channels} is not divisible by num_heads {num_heads}"
	self.num_heads = num_heads
	self.num_head_channels = self.channels // num_heads
	else:
	assert (
	self.channels % num_head_channels == 0
	), f"q,k,v channels {self.channels} is not divisible by num_head_channels {num_head_channels}"
	self.num_heads = self.channels // num_head_channels
	self.num_head_channels = num_head_channels

	if encoder_channels is not None:
	self.use_self_attention = use_self_attention

	if dims == 1:
	self.encoder_channels = encoder_channels
	elif dims == 2:
	self.encoder_channels = encoder_channels * encoder_hdim
	else:
	raise ValueError(f"invalid number of dimensions: {dims}")

	if use_self_attention:
	self.self_attention = BasicAttentionBlock(
	self.channels,
	self.num_head_channels,
	self.num_heads,
	p_dropout=self.p_dropout,
	)
	self.cross_attention = BasicAttentionBlock(
	self.channels,
	self.num_head_channels,
	self.num_heads,
	self.encoder_channels,
	p_dropout=self.p_dropout,
	)
	else:
	self.encoder_channels = None
	self.self_attention = BasicAttentionBlock(
	self.channels,
	self.num_head_channels,
	self.num_heads,
	p_dropout=self.p_dropout,
	)

	def forward(self, x: torch.Tensor, encoder_output: torch.Tensor = None):
	r"""
	Args:
	x: input tensor with shape [B x ``channels`` x ...]
	encoder_output: feature tensor with shape [B x ``encoder_channels`` x ...], if ``None``, then self-attention is performed.

	Returns:
	output tensor with shape [B x ``channels`` x ...]
	"""
	shape = x.size()
	x = x.reshape(shape[0], self.channels, -1).contiguous()

	if self.encoder_channels is None:
	assert (
	encoder_output is None
	), "encoder_output must be None for self-attention."
	h = self.self_attention(x)

	else:
	assert (
	encoder_output is not None
	), "encoder_output must be given for cross-attention."
	encoder_output = encoder_output.reshape(
	shape[0], self.encoder_channels, -1
	).contiguous()

	if self.use_self_attention:
	x = self.self_attention(x)
	h = self.cross_attention(x, encoder_output)

	return h.reshape(*shape).contiguous()


	class BasicAttentionBlock(nn.Module):
	def __init__(
	self,
	channels: int,
	num_head_channels: int = 32,
	num_heads: int = -1,
	context_channels: int = None,
	p_dropout: float = 0.0,
	):
	super().__init__()

	self.channels = channels
	self.p_dropout = p_dropout
	self.context_channels = context_channels

	if num_head_channels == -1:
	assert (
	self.channels % num_heads == 0
	), f"q,k,v channels {self.channels} is not divisible by num_heads {num_heads}"
	self.num_heads = num_heads
	self.num_head_channels = self.channels // num_heads
	else:
	assert (
	self.channels % num_head_channels == 0
	), f"q,k,v channels {self.channels} is not divisible by num_head_channels {num_head_channels}"
	self.num_heads = self.channels // num_head_channels
	self.num_head_channels = num_head_channels

	if context_channels is not None:
	self.to_q = nn.Sequential(
	normalization(self.channels),
	Conv1d(self.channels, self.channels, 1),
	)
	self.to_kv = Conv1d(context_channels, 2 * self.channels, 1)
	else:
	self.to_qkv = nn.Sequential(
	normalization(self.channels),
	Conv1d(self.channels, 3 * self.channels, 1),
	)

	self.linear = Conv1d(self.channels, self.channels)

	self.proj_out = nn.Sequential(
	normalization(self.channels),
	Conv1d(self.channels, self.channels, 1),
	nn.GELU(),
	nn.Dropout(p=self.p_dropout),
	zero_module(Conv1d(self.channels, self.channels, 1)),
	)

	def forward(self, q: torch.Tensor, kv: torch.Tensor = None):
	r"""
	Args:
	q: input tensor with shape [B, ``channels``, L]
	kv: feature tensor with shape [B, ``context_channels``, T], if ``None``, then self-attention is performed.

	Returns:
	output tensor with shape [B, ``channels``, L]
	"""
	N, C, L = q.size()

	if self.context_channels is not None:
	assert kv is not None, "kv must be given for cross-attention."

	q = (
	self.to_q(q)
	.reshape(self.num_heads, self.num_head_channels, -1)
	.transpose(-1, -2)
	.contiguous()
	)
	kv = (
	self.to_kv(kv)
	.reshape(2, self.num_heads, self.num_head_channels, -1)
	.transpose(-1, -2)
	.chunk(2)
	)
	k, v = (
	kv[0].squeeze(0).contiguous(),
	kv[1].squeeze(0).contiguous(),
	)

	else:
	qkv = (
	self.to_qkv(q)
	.reshape(3, self.num_heads, self.num_head_channels, -1)
	.transpose(-1, -2)
	.chunk(3)
	)
	q, k, v = (
	qkv[0].squeeze(0).contiguous(),
	qkv[1].squeeze(0).contiguous(),
	qkv[2].squeeze(0).contiguous(),
	)

	h = F.scaled_dot_product_attention(q, k, v, dropout_p=self.p_dropout).transpose(
	-1, -2
	)
	h = h.reshape(N, -1, L).contiguous()
	h = self.linear(h)

	x = q + h
	h = self.proj_out(x)

	return x + h