camenduru
/

text2-video-zero

Text-to-Video

Model card Files Files and versions Community

text2-video-zero / annotator /uniformer /mmcv /cnn /bricks /context_block.py

camenduru

thanks to Text2Video-Zero team ❤

b944fa1 over 1 year ago

raw

history blame contribute delete

4.68 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	import torch
	from torch import nn

	from ..utils import constant_init, kaiming_init
	from .registry import PLUGIN_LAYERS


	def last_zero_init(m):
	    """Constant-initialise a module with the value 0.

	    When ``m`` is an ``nn.Sequential`` only its final sub-module is passed
	    to ``constant_init``; any other module is passed as-is.

	    Args:
	        m (nn.Module): Module (or sequential container) to initialise.
	    """
	    target = m[-1] if isinstance(m, nn.Sequential) else m
	    constant_init(target, val=0)


	@PLUGIN_LAYERS.register_module()
	class ContextBlock(nn.Module):
	    """ContextBlock module in GCNet.

	    See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
	    (https://arxiv.org/abs/1904.11492) for details.

	    The block pools the input feature map into a per-channel context vector
	    of shape [N, C, 1, 1], passes that vector through a bottleneck transform
	    for each enabled fusion type, and fuses the result back into the input.

	    Args:
	        in_channels (int): Channels of the input feature map.
	        ratio (float): Ratio of channels of the transform bottleneck. The
	            bottleneck has ``int(in_channels * ratio)`` channels.
	        pooling_type (str): Pooling method for context modeling.
	            Options are 'att' and 'avg', which stand for attention pooling
	            and average pooling respectively. Default: 'att'.
	        fusion_types (Sequence[str]): Fusion methods for feature fusion.
	            Options are 'channel_add' and 'channel_mul', which stand for
	            channel-wise addition and multiplication respectively.
	            Default: ('channel_add',)
	    """

	    _abbr_ = 'context_block'

	    def __init__(self,
	                 in_channels,
	                 ratio,
	                 pooling_type='att',
	                 fusion_types=('channel_add', )):
	        super().__init__()
	        assert pooling_type in ['avg', 'att']
	        assert isinstance(fusion_types, (list, tuple))
	        valid_fusion_types = ['channel_add', 'channel_mul']
	        assert all(f in valid_fusion_types for f in fusion_types)
	        assert len(fusion_types) > 0, 'at least one fusion should be used'
	        self.in_channels = in_channels
	        self.ratio = ratio
	        self.planes = int(in_channels * ratio)
	        self.pooling_type = pooling_type
	        self.fusion_types = fusion_types
	        if pooling_type == 'att':
	            # 1x1 conv producing one attention logit per spatial position;
	            # the softmax runs over the flattened H * W axis.
	            self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
	            self.softmax = nn.Softmax(dim=2)
	        else:
	            self.avg_pool = nn.AdaptiveAvgPool2d(1)
	        # Disabled fusion branches are stored as None so that forward() and
	        # reset_parameters() can test for them directly.
	        if 'channel_add' in fusion_types:
	            self.channel_add_conv = self._build_channel_transform()
	        else:
	            self.channel_add_conv = None
	        if 'channel_mul' in fusion_types:
	            self.channel_mul_conv = self._build_channel_transform()
	        else:
	            self.channel_mul_conv = None
	        self.reset_parameters()

	    def _build_channel_transform(self):
	        """Build one conv -> LayerNorm -> ReLU -> conv bottleneck transform.

	        Returns:
	            nn.Sequential: Transform mapping a [N, in_channels, 1, 1] context
	            through ``self.planes`` channels and back to ``in_channels``.
	        """
	        return nn.Sequential(
	            nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
	            nn.LayerNorm([self.planes, 1, 1]),
	            nn.ReLU(inplace=True),  # yapf: disable
	            nn.Conv2d(self.planes, self.in_channels, kernel_size=1))

	    def reset_parameters(self):
	        """(Re-)initialise the attention conv and the fusion transforms."""
	        if self.pooling_type == 'att':
	            kaiming_init(self.conv_mask, mode='fan_in')
	            # NOTE(review): presumably flags the layer as already initialised
	            # for external weight-init routines -- confirm against callers.
	            self.conv_mask.inited = True

	        # Only the last layer of each enabled transform is zero-initialised.
	        if self.channel_add_conv is not None:
	            last_zero_init(self.channel_add_conv)
	        if self.channel_mul_conv is not None:
	            last_zero_init(self.channel_mul_conv)

	    def spatial_pool(self, x):
	        """Pool ``x`` over its spatial dimensions into a context tensor.

	        Args:
	            x (torch.Tensor): Input feature map of shape [N, C, H, W].

	        Returns:
	            torch.Tensor: Context of shape [N, C, 1, 1]. For 'att' pooling
	            it is the softmax-weighted sum over all H * W positions; for
	            'avg' pooling it is the output of adaptive average pooling.
	        """
	        batch, channel, height, width = x.size()
	        if self.pooling_type == 'att':
	            # reshape() instead of view(): view() raises on inputs whose
	            # H/W strides cannot be merged (e.g. a transposed feature map),
	            # while reshape() copies only when it has to.
	            # [N, C, H * W]
	            input_x = x.reshape(batch, channel, height * width)
	            # [N, 1, C, H * W]
	            input_x = input_x.unsqueeze(1)
	            # [N, 1, H, W]
	            context_mask = self.conv_mask(x)
	            # [N, 1, H * W]
	            context_mask = context_mask.reshape(batch, 1, height * width)
	            # [N, 1, H * W]
	            context_mask = self.softmax(context_mask)
	            # [N, 1, H * W, 1]
	            context_mask = context_mask.unsqueeze(-1)
	            # [N, 1, C, 1]
	            context = torch.matmul(input_x, context_mask)
	            # [N, C, 1, 1]
	            context = context.reshape(batch, channel, 1, 1)
	        else:
	            # [N, C, 1, 1]
	            context = self.avg_pool(x)

	        return context

	    def forward(self, x):
	        """Fuse the pooled context back into the input feature map.

	        Args:
	            x (torch.Tensor): Input feature map of shape [N, C, H, W].

	        Returns:
	            torch.Tensor: ``x`` scaled by the sigmoid of the multiplicative
	            term (if 'channel_mul' is enabled) and then offset by the
	            additive term (if 'channel_add' is enabled).
	        """
	        # [N, C, 1, 1]
	        context = self.spatial_pool(x)

	        out = x
	        if self.channel_mul_conv is not None:
	            # [N, C, 1, 1]
	            channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
	            out = out * channel_mul_term
	        if self.channel_add_conv is not None:
	            # [N, C, 1, 1]
	            channel_add_term = self.channel_add_conv(context)
	            out = out + channel_add_term

	        return out