Upload folder using huggingface_hub

c187b4b verified 10 months ago

13.9 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from .transformer import lang_tf_enc, TransformerEncoderLayer, TransformerEncoder
	from .position_encoding import PositionEmbeddingSine

	class SFA(nn.Module):
	def __init__(self, in_channels, out_channels, scale_factors = [1, 2, 4], fuse_type="sum"):
	super(SFA, self).__init__()
	self.stages = []
	for idx, scale in enumerate(scale_factors):
	out_dim = out_channels
	if scale == 4.0:
	layers = [
	nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2),
	nn.BatchNorm2d(
	num_features=in_channels // 2, eps=1e-5, momentum=0.999, affine=True),
	nn.GELU(),
	nn.ConvTranspose2d(in_channels // 2, in_channels // 4, kernel_size=2, stride=2),
	]
	out_dim = in_channels // 4
	elif scale == 2.0:
	layers = [nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)]
	out_dim = in_channels // 2
	elif scale == 1.0:
	layers = []
	out_dim = in_channels
	elif scale == 0.5:
	layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
	else:
	raise NotImplementedError(f"scale_factor={scale} is not supported yet.")

	layers.extend(
	[
	ConvBatchNormReLU(out_dim, out_channels, 1, 1, 0, 1, leaky=True),
	ConvBatchNormReLU(out_channels, out_channels, 3, 1, 1, 1, leaky=True),
	]
	)
	layers = nn.Sequential(*layers)
	self.stages.append(layers)

	self.stages = nn.ModuleList(self.stages)

	# 假设所有输入特征图的通道数相同
	self.lateral_convs = nn.ModuleList([
	ConvBatchNormReLU(out_channels, out_channels, 1, 1, 0, 1, leaky=True) for _ in range(3)
	])

	self.output_convs = nn.ModuleList([
	ConvBatchNormReLU(out_channels, out_channels, 3, 1, 1, 1, leaky=True) for _ in range(3)
	])

	self._fuse_type = fuse_type # or "avg"

	self.downsample = nn.MaxPool2d(kernel_size=4, stride=4, padding=0)

	def forward(self, x):
	'''
	Args:
	x: list[Tensor], T个特征图，每个特征图的尺寸和通道数相同，[x12, x9, x6]
	'''
	# 模拟bottom-up, 获取多尺度特征图
	mutil_scale_features = []
	for idx, stage in enumerate(self.stages):
	mutil_scale_features.append(stage(x[idx]))

	# top-down
	results = []
	prev_features = self.lateral_convs[0](mutil_scale_features[0])

	for idx, (lateral_conv, output_conv) in enumerate(
	zip(self.lateral_convs, self.output_convs)
	):
	# Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336
	# Therefore we loop over all modules but skip the first one
	if idx > 0:
	features = mutil_scale_features[idx]
	top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
	lateral_features = lateral_conv(features) # 1x1卷积
	prev_features = lateral_features + top_down_features
	if self._fuse_type == "avg":
	prev_features /= 2
	results.insert(0, output_conv(prev_features))

	fused_features = self.downsample(results[0]) # 1/4分辨率，需要转换为1/16分辨率

	return fused_features

	class ConvBatchNormReLU(nn.Module):
	def __init__(
	self,
	in_channels,
	out_channels,
	kernel_size,
	stride,
	padding,
	dilation,
	leaky=False,
	relu=True,
	instance=False,
	):
	super(ConvBatchNormReLU, self).__init__()
	self.conv = nn.Conv2d(
	in_channels=in_channels,
	out_channels=out_channels,
	kernel_size=kernel_size,
	stride=stride,
	padding=padding,
	dilation=dilation,
	bias=False)
	# nn.init.kaiming_normal_(self.conv.weight, mode="fan_out", nonlinearity="leaky_relu" if leaky else "relu")

	if instance:
	self.bn = nn.InstanceNorm2d(num_features=out_channels)
	else:
	self.bn = nn.BatchNorm2d(
	num_features=out_channels, eps=1e-5, momentum=0.999, affine=True
	)

	if leaky:
	self.relu = nn.LeakyReLU(0.1)
	elif relu:
	self.relu = nn.ReLU()
	def forward(self, x):
	x = self.conv(x)
	x = self.bn(x)
	x = self.relu(x)
	return x

	# class ConvBatchNormReLU(nn.Sequential):
	# def __init__(
	# self,
	# in_channels,
	# out_channels,
	# kernel_size,
	# stride,
	# padding,
	# dilation,
	# leaky=False,
	# relu=True,
	# instance=False,
	# ):
	# super(ConvBatchNormReLU, self).__init__()

	# conv = nn.Conv2d(
	# in_channels=in_channels,
	# out_channels=out_channels,
	# kernel_size=kernel_size,
	# stride=stride,
	# padding=padding,
	# dilation=dilation,
	# bias=False,
	# )
	# nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity="leaky_relu" if leaky else "relu")

	# self.add_module(
	# "conv", conv
	# )

	# if instance:
	# self.add_module(
	# "bn",
	# nn.InstanceNorm2d(num_features=out_channels),
	# )
	# else:
	# self.add_module(
	# "bn",
	# nn.BatchNorm2d(
	# num_features=out_channels, eps=1e-5, momentum=0.999, affine=True
	# ),
	# )

	# if leaky:
	# self.add_module("relu", nn.LeakyReLU(0.1))
	# elif relu:
	# self.add_module("relu", nn.ReLU())

	# def forward(self, x):
	# return super(ConvBatchNormReLU, self).forward(x)


	def concat_coord(x):
	ins_feat = x # [bt, c, h, w] [512, 26, 26]
	batch_size, c, h, w = x.size()

	float_h = float(h)
	float_w = float(w)

	y_range = torch.arange(0., float_h, dtype=torch.float32)
	y_range = 2.0 * y_range / (float_h - 1.0) - 1.0
	x_range = torch.arange(0., float_w, dtype=torch.float32)
	x_range = 2.0 * x_range / (float_w - 1.0) - 1.0
	x_range = x_range[None, :]
	y_range = y_range[:, None]
	x = x_range.repeat(h, 1)
	y = y_range.repeat(1, w)

	x = x[None, None, :, :]
	y = y[None, None, :, :]
	x = x.repeat(batch_size, 1, 1, 1)
	y = y.repeat(batch_size, 1, 1, 1)
	x = x.cuda()
	y = y.cuda()

	ins_feat_out = torch.cat((ins_feat, x, x, x, y, y, y), 1)

	return ins_feat_out


	class query_generator(nn.Module):
	def __init__(self, input, output, leaky=True):
	super(query_generator, self).__init__()
	self.proj1 = ConvBatchNormReLU(input+6, input+6, 3, 1, 1, 1, leaky=leaky)
	self.proj2 = ConvBatchNormReLU(input+6, input+6, 3, 1, 1, 1, leaky=leaky)
	self.proj3 = ConvBatchNormReLU(input+6, input+6, 3, 1, 1, 1, leaky=leaky)
	self.proj = nn.Conv2d(input+6, output, 1, 1, 0, 1)

	def forward(self, x):
	x = concat_coord(x)
	x = x + self.proj1(x)
	x = x + self.proj2(x)
	x = x + self.proj3(x)
	x = self.proj(x)
	return x


	class KLM(nn.Module):
	def __init__(self, f_dim, feat_dim):
	super(KLM, self).__init__()
	self.lang_tf_enc = lang_tf_enc(f_dim, f_dim, f_dim, head_num=8)

	self.pos_embedding = PositionEmbeddingSine(f_dim)
	encoder_layer = TransformerEncoderLayer(f_dim, nhead=8, dim_feedforward=f_dim,
	dropout=0.1, activation='relu', normalize_before=False)
	self.encoder = TransformerEncoder(encoder_layer, num_layers=2, norm=nn.LayerNorm(f_dim))

	# self.catproj = nn.Linear(f_dim * 2, f_dim)

	self.fc_ker = nn.Linear(f_dim, feat_dim + feat_dim)
	self.fc_vis = nn.Linear(f_dim, feat_dim + feat_dim)
	self.ker_norm = nn.LayerNorm(feat_dim)
	self.vis_norm = nn.LayerNorm(feat_dim)

	self.channel_fc = nn.Linear(feat_dim, feat_dim)
	self.channel_norm = nn.LayerNorm(feat_dim)

	self.spatial_fc = nn.Linear(feat_dim, feat_dim)
	self.spatial_norm = nn.LayerNorm(feat_dim)

	self.out_fc = nn.Linear(feat_dim, f_dim)
	self.out_norm = nn.LayerNorm(f_dim)

	self.d_model = f_dim
	self.feat_dim = feat_dim
	self.resolution_size = 26

	def forward(self, kernel, lang_feat, visu_feat):
	# kernel B x N x C
	# lang_feat B x T x C
	# visu_feat B x C x HW
	kernel = self.lang_tf_enc(kernel, lang_feat)
	# B x N x C
	bs, c, hw = visu_feat.shape
	bq, nq, cq = kernel.shape
	bl, ll, cl = lang_feat.shape

	# Image Attention
	visu_feat = visu_feat.permute(0, 2, 1)
	# B x HW x C
	pos_embed = self.pos_embedding(visu_feat)
	# B x HW x C

	visu_feat = visu_feat.transpose(0, 1)
	pos_embed = pos_embed.transpose(0, 1)
	visu_feat_ = self.encoder(visu_feat, pos=pos_embed) # HW x B x C
	visu_feat_ = visu_feat_.transpose(0, 1) # B x HW x C

	# repeat visual feats
	visu_feat = visu_feat_.unsqueeze(dim=1) # B x 1 x HW x C
	kernel = kernel.unsqueeze(dim=2) # B x N x 1 x C
	lang_feat = lang_feat.unsqueeze(dim=2) # B x Q x 1 x C

	kernel_in = self.fc_ker(kernel)
	kernel_out = kernel_in[:, :, :, self.feat_dim:]
	kernel_in = kernel_in[:, :, :, :self.feat_dim]

	vis_in = self.fc_vis(visu_feat)
	vis_out = vis_in[:, :, :, self.feat_dim:]
	vis_in = vis_in[:, :, :, :self.feat_dim]

	gate_feat = self.ker_norm(kernel_in) * self.vis_norm(vis_in)
	#[B N HW 64]

	channel_gate = self.channel_norm(self.channel_fc(gate_feat))
	channel_gate = channel_gate.mean(2, keepdim=True)
	channel_gate = torch.sigmoid(channel_gate)
	# B x N x 1 x C

	spatial_gate = self.spatial_norm(self.spatial_fc(gate_feat))
	# spatial_gate = spatial_gate.mean(3, keepdim=True)
	spatial_gate = torch.sigmoid(spatial_gate)
	# B x N x HW x C

	channel_gate = (1 + channel_gate) * kernel_out # B x N x 1 x C
	channel_gate = channel_gate.squeeze(2) # B x N x C

	spatial_gate = (1 + spatial_gate) * vis_out # B x N x HW x C
	spatial_gate = spatial_gate.mean(2) # B x N x C

	gate_feat = (channel_gate + spatial_gate) / 2
	# [B N 64]
	gate_feat = self.out_fc(gate_feat)
	gate_feat = self.out_norm(gate_feat)
	gate_feat = F.relu(gate_feat)
	#[B N C]

	#visu_feat_.transpose(1, 2) [B C HW]
	return gate_feat, visu_feat_.transpose(1, 2)


	class KAM(nn.Module):
	def __init__(self, f_dim, num_query):
	super(KAM, self).__init__()

	self.k_size = 1

	self.proj = nn.Linear(26*26, f_dim)

	self.fc_k = nn.Linear(f_dim, f_dim)
	self.fc_m = nn.Linear(f_dim, f_dim)
	self.fc_fus = nn.Linear(f_dim * 2, f_dim)
	self.fc_out = nn.Linear(f_dim, 1)

	self.outproj = ConvBatchNormReLU(num_query, f_dim, 3, 1, 1, 1, leaky=True)
	self.maskproj = nn.Conv2d(f_dim, 1, 3, 1, 1, 1)

	self.bn = nn.BatchNorm2d(f_dim)

	self.mask_fcs = []
	for _ in range(3):
	self.mask_fcs.append(nn.Linear(f_dim, f_dim, bias=False))
	self.mask_fcs.append(nn.LayerNorm(f_dim))
	self.mask_fcs.append(nn.ReLU())
	self.mask_fcs = nn.Sequential(*self.mask_fcs)


	def forward(self, kernel, visu_feat):
	# kernel [B N C]
	# visu_feat [B C HW]
	kernel = self.mask_fcs(kernel)

	B, N, C = kernel.shape
	kernel_ = kernel
	kernel = kernel.reshape(B, N, -1, C).permute(0, 1, 3, 2) # B x N x C x 1
	kernel = kernel.reshape(B, N, C, self.k_size, self.k_size) # B x N x C x 1 x 1
	#[B N C K K]
	visu_feat_ = visu_feat
	visu_feat = visu_feat.reshape(B, C, 26, 26) # B x C x H x W

	masks = []
	for i in range(B):
	masks.append(F.conv2d(visu_feat[i: i+1], kernel[i], padding=int(self.k_size // 2))) # 1 x N x H x W
	masks = torch.cat(masks, dim=0) # B x N x H x W

	feats = masks.reshape(B, N, -1) # B x N x HW
	feats = self.proj(feats) # B x N x C

	weights_kern = F.relu(self.fc_k(kernel_))
	weights_mask = F.relu(self.fc_m(feats))

	weights = torch.cat([weights_kern, weights_mask], dim=-1) # B x N x 2C
	weights = F.relu(self.fc_fus(weights)) # B x N x C
	weights = self.fc_out(weights) # B x N x 1
	weights = F.softmax(weights, dim=1) # B x N x 1

	weights = weights.unsqueeze(-1) # B x N x 1 x 1

	mask = weights * masks # B x N x H x W
	mask = self.outproj(mask) # B x C x H x W
	mask = self.maskproj(mask)
	mask = F.sigmoid(mask) # B x 1 x H x W

	visu_feat = visu_feat * mask # B x C x H x W

	visu_feat = self.bn(visu_feat)
	visu_feat = visu_feat.reshape(B, C, -1) + visu_feat_
	visu_feat = F.relu(visu_feat)
	return visu_feat