Spaces:

abhishekrs4
/

Handwriting_Recognition

Running

App Files Files Community

Handwriting_Recognition / iam_line_recognition /model_visual_features.py

abhishekrs4

code formatting

44066b7 2 months ago

raw history blame contribute delete

No virus

18.7 kB

	import torch
	import numpy as np
	import torch.nn as nn
	from typing import List
	from torch import Tensor
	import torch.nn.functional as F
	from torchvision.models.resnet import (
	BasicBlock,
	model_urls,
	load_state_dict_from_url,
	conv1x1,
	conv3x3,
	)

	# Device used by code in this module that needs an explicit placement.
	# Fall back to the CPU when CUDA is unavailable so that CPU-only hosts do not
	# fail the first time a tensor is moved to `device`.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	class CustomResNet(nn.Module):
	    """ResNet-style backbone whose pooling and residual stages use per-axis strides.

	    The constructor builds a stem (``conv1``/``bn1``/``relu``/``maxpool``), four
	    residual stages (``layer1`` .. ``layer4``) and an ``avgpool``/``fc`` head.
	    ``forward`` stops after ``layer4`` and returns that feature map; ``avgpool``
	    and ``fc`` are never applied here (presumably they are kept so that a stock
	    ResNet state dict loads without missing keys -- confirm).
	    """

	    def __init__(
	        self,
	        layers: List[int],
	        block=BasicBlock,
	        zero_init_residual=False,
	        groups=1,
	        num_classes=1000,
	        width_per_group=64,
	        replace_stride_with_dilation=None,
	        norm_layer=None,
	    ):
	        """
	        ---------
	        Arguments
	        ---------
	        layers : List[int]
	            number of residual blocks in each of the four stages
	        block : class (default=BasicBlock)
	            residual block type; it must expose ``expansion`` and accept the
	            arguments passed to it in ``_make_layer``
	        zero_init_residual : bool (default=False)
	            if True, the ``bn2`` weight of every ``BasicBlock`` is set to zero
	        groups : int (default=1)
	            forwarded to every residual block
	        num_classes : int (default=1000)
	            output size of the (unused in ``forward``) ``fc`` layer
	        width_per_group : int (default=64)
	            forwarded to every residual block as ``base_width``
	        replace_stride_with_dilation : sequence of 3 bools or None
	            per-stage switch (layer2, layer3, layer4) that replaces the stride
	            with dilation; ``None`` means ``[False, False, False]``
	        norm_layer : callable or None
	            factory for normalisation layers; ``nn.BatchNorm2d`` when None

	        ------
	        Raises
	        ------
	        ValueError
	            if ``replace_stride_with_dilation`` does not have 3 elements
	        """
	        super().__init__()

	        # Bug fix: the attribute used to be assigned only when `norm_layer` was
	        # None, so passing a custom layer raised AttributeError further down.
	        if norm_layer is None:
	            norm_layer = nn.BatchNorm2d
	        self._norm_layer = norm_layer

	        self.inplanes = 64
	        self.dilation = 1

	        if replace_stride_with_dilation is None:
	            # each element in the tuple indicates if we should replace
	            # the 2x2 stride with a dilated convolution instead
	            replace_stride_with_dilation = [False, False, False]

	        if len(replace_stride_with_dilation) != 3:
	            raise ValueError(
	                "replace_stride_with_dilation should be None "
	                f"or a 3-element tuple, got {replace_stride_with_dilation}"
	            )

	        self.groups = groups
	        self.base_width = width_per_group

	        # Stem. Tuple strides follow the torch.nn (height, width) convention, so
	        # (2, 1) halves the height and keeps the width.
	        self.conv1 = nn.Conv2d(
	            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
	        )
	        self.bn1 = self._norm_layer(self.inplanes)
	        self.relu = nn.ReLU(inplace=True)
	        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=(2, 1), padding=1)

	        # Residual stages.
	        self.layer1 = self._make_layer(block, 64, layers[0])
	        self.layer2 = self._make_layer(
	            block, 128, layers[1], stride=(2, 1), dilate=replace_stride_with_dilation[0]
	        )
	        self.layer3 = self._make_layer(
	            block, 256, layers[2], stride=(2, 2), dilate=replace_stride_with_dilation[1]
	        )
	        self.layer4 = self._make_layer(
	            block, 512, layers[3], stride=(2, 1), dilate=replace_stride_with_dilation[2]
	        )

	        # Classification head: constructed but not used by `forward`.
	        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
	        self.fc = nn.Linear(512 * block.expansion, num_classes)

	        for m in self.modules():
	            if isinstance(m, nn.Conv2d):
	                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
	            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
	                nn.init.constant_(m.weight, 1)
	                nn.init.constant_(m.bias, 0)

	        # Zero-initialize the last BN in each residual branch,
	        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
	        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
	        if zero_init_residual:
	            for m in self.modules():
	                if isinstance(m, BasicBlock):
	                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

	    def _make_layer(
	        self,
	        block,
	        planes,
	        blocks,
	        stride=1,
	        dilate=False,
	    ) -> nn.Sequential:
	        """Build one residual stage made of ``blocks`` instances of ``block``.

	        Only the first block receives ``stride`` and the optional ``downsample``
	        projection; the remaining blocks keep the resolution. ``self.inplanes``
	        is updated to the stage's output channel count as a side effect.
	        """
	        norm_layer = self._norm_layer
	        downsample = None
	        previous_dilation = self.dilation
	        if dilate:
	            # NOTE(review): __init__ passes tuple strides, for which `*=` is
	            # sequence repetition rather than arithmetic -- dilation combined
	            # with tuple strides looks unsupported; confirm before enabling.
	            self.dilation *= stride
	            stride = 1
	        if stride != 1 or self.inplanes != planes * block.expansion:
	            # Project the identity branch so it matches the main branch's
	            # resolution and channel count.
	            downsample = nn.Sequential(
	                conv1x1(self.inplanes, planes * block.expansion, stride),
	                norm_layer(planes * block.expansion),
	            )

	        layers = [
	            block(
	                self.inplanes,
	                planes,
	                stride,
	                downsample,
	                self.groups,
	                self.base_width,
	                previous_dilation,
	                norm_layer,
	            )
	        ]
	        self.inplanes = planes * block.expansion
	        for _ in range(1, blocks):
	            layers.append(
	                block(
	                    self.inplanes,
	                    planes,
	                    groups=self.groups,
	                    base_width=self.base_width,
	                    dilation=self.dilation,
	                    norm_layer=norm_layer,
	                )
	            )

	        return nn.Sequential(*layers)

	    def _forward_impl(self, x: Tensor) -> Tensor:
	        """Run the stem and the four residual stages; return ``layer4``'s output."""
	        x = self.conv1(x)
	        x = self.bn1(x)
	        x = self.relu(x)
	        x = self.maxpool(x)

	        x = self.layer1(x)
	        x = self.layer2(x)
	        x = self.layer3(x)
	        x = self.layer4(x)
	        return x

	    def forward(self, x: Tensor) -> Tensor:
	        """Return the ``layer4`` feature map for ``x`` (no pooling, no ``fc``)."""
	        return self._forward_impl(x)


	def _resnet(layers: List[int], pretrained=True) -> CustomResNet:
	    """Create a ``CustomResNet`` and optionally initialise it from a checkpoint.

	    Args:
	        layers: number of residual blocks per stage, passed to ``CustomResNet``.
	        pretrained: when truthy, the state dict downloaded from
	            ``model_urls["resnet34"]`` is loaded into the new model.

	    Returns:
	        The constructed (and possibly initialised) ``CustomResNet``.
	    """
	    network = CustomResNet(layers)
	    if not pretrained:
	        return network

	    # The checkpoint key is fixed to "resnet34" whatever `layers` contains.
	    checkpoint = load_state_dict_from_url(model_urls["resnet34"])
	    network.load_state_dict(checkpoint)
	    return network


	def resnet34(*, pretrained=True) -> CustomResNet:
	    """Build a ``CustomResNet`` with the ResNet-34 stage depths ``[3, 4, 6, 3]``.

	    Architecture reference: `Deep Residual Learning for Image Recognition
	    <https://arxiv.org/pdf/1512.03385.pdf>`__.

	    Args:
	        pretrained (bool, keyword-only): forwarded unchanged to ``_resnet``.
	            Defaults to True, in which case ``_resnet`` loads the state dict
	            registered under ``model_urls["resnet34"]``.

	    Returns:
	        CustomResNet: the constructed model.
	    """

	    return _resnet([3, 4, 6, 3], pretrained=pretrained)


	class ResNetFeatureExtractor(nn.Module):
	    """
	    Defines Base ResNet-34 feature extractor
	    """

	    def __init__(self, pretrained=True):
	        """
	        ---------
	        Arguments
	        ---------
	        pretrained : bool (default=True)
	            boolean to indicate whether to use a pretrained resnet model or not
	        """
	        super().__init__()
	        self.output_channels = 512
	        self.resnet34 = resnet34(pretrained=pretrained)

	    def forward(self, x):
	        """
	        Runs the backbone stage by stage and returns the ``layer4`` feature map.

	        The shape comments below assume the default strides configured in
	        ``CustomResNet`` (no dilation) and a block expansion of 1; spatial sizes
	        are approximate because of integer rounding. Note that the width is only
	        reduced by a factor of 4 overall, while the height is reduced by 32.
	        """
	        backbone = self.resnet34

	        # stem: conv1 has stride 2 on both axes
	        x = backbone.conv1(x)
	        x = backbone.bn1(x)
	        x = backbone.relu(x)  # [64, H/2, W/2]

	        # maxpool has stride (2, 1): only the height is halved
	        x = backbone.maxpool(x)  # [64, H/4, W/2]
	        x = backbone.layer1(x)  # [64, H/4, W/2]
	        x = backbone.layer2(x)  # [128, H/8, W/2]
	        x = backbone.layer3(x)  # [256, H/16, W/4]
	        resnet_features = backbone.layer4(x)  # [512, H/32, W/4]

	        # [B, 512, H/32, W/4]
	        return resnet_features


	#########################################
	### STN - Spatial Transformer Network ###
	#########################################
	class TPS_SpatialTransformerNetwork(nn.Module):
	    """Rectification Network of RARE, namely TPS based STN"""

	    def __init__(self, num_fiducial_points, I_size, I_r_size, I_channel_num=1):
	        """Based on RARE TPS
	        input:
	            batch_I: Batch Input Image [batch_size x I_channel_num x I_height x I_width]
	            I_size : (height, width) of the input image I
	            I_r_size : (height, width) of the rectified image I_r
	            I_channel_num : the number of channels of the input image I
	        output:
	            batch_I_r: rectified image [batch_size x I_channel_num x I_r_height x I_r_width]
	        """
	        super(TPS_SpatialTransformerNetwork, self).__init__()
	        self.num_fiducial_points = num_fiducial_points
	        self.I_size = I_size
	        self.I_r_size = I_r_size  # = (I_r_height, I_r_width)
	        self.I_channel_num = I_channel_num
	        self.LocalizationNetwork = LocalizationNetwork(
	            self.num_fiducial_points, self.I_channel_num
	        )
	        self.GridGenerator = GridGenerator(self.num_fiducial_points, self.I_r_size)

	    @staticmethod
	    def _version_tuple(version):
	        """Return the leading numeric ``(major, minor, patch)`` of a version string.

	        Local build tags (``+cu113``) and pre-release suffixes (``0a0``) are
	        ignored; missing or non-numeric components count as 0.
	        """
	        numbers = []
	        for piece in version.split("+")[0].split(".")[:3]:
	            digits = ""
	            for ch in piece:
	                if not ch.isdigit():
	                    break
	                digits += ch
	            numbers.append(int(digits) if digits else 0)
	        while len(numbers) < 3:
	            numbers.append(0)
	        return tuple(numbers)

	    def forward(self, batch_I):
	        """Predict fiducial points for ``batch_I`` and resample it on the TPS grid.

	        input:
	            batch_I: Batch Input Image [batch_size x I_channel_num x I_height x I_width]
	        output:
	            batch_I_r: rectified image [batch_size x I_channel_num x I_r_height x I_r_width]
	        """
	        batch_C_prime = self.LocalizationNetwork(batch_I)  # batch_size x K x 2
	        build_P_prime = self.GridGenerator.build_P_prime(
	            batch_C_prime
	        )  # batch_size x n (= I_r_width x I_r_height) x 2
	        build_P_prime_reshape = build_P_prime.reshape(
	            [build_P_prime.size(0), self.I_r_size[0], self.I_r_size[1], 2]
	        )

	        # Comparing raw version strings is lexicographic ("1.10.0" < "1.2.0" as
	        # plain strings), so compare the numeric components instead.
	        sample_kwargs = {"padding_mode": "border"}
	        if self._version_tuple(str(torch.__version__)) > (1, 2, 0):
	            # `align_corners` is only passed on versions newer than 1.2.0.
	            sample_kwargs["align_corners"] = True

	        batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, **sample_kwargs)

	        return batch_I_r


	class LocalizationNetwork(nn.Module):
	    """Localization Network of RARE, which predicts C' (K x 2) from I (I_width x I_height)"""

	    def __init__(self, num_fiducial_points, I_channel_num):
	        """
	        input:
	            num_fiducial_points : number of fiducial points (K); must be a positive
	                even integer because the points are split into a top and a bottom row
	            I_channel_num : the number of channels of the input image I
	        raises:
	            ValueError if num_fiducial_points is not a positive even number
	        """
	        super(LocalizationNetwork, self).__init__()
	        # An odd count would yield an initial bias with fewer entries than
	        # fc2's output size, which only surfaces later as an opaque failure.
	        if num_fiducial_points < 2 or num_fiducial_points % 2 != 0:
	            raise ValueError(
	                "num_fiducial_points must be a positive even number, "
	                f"got {num_fiducial_points}"
	            )
	        self.num_fiducial_points = num_fiducial_points
	        self.I_channel_num = I_channel_num
	        # Keep this as one flat nn.Sequential in this exact order: the state
	        # dict keys of the sub-modules are their positions in the sequence.
	        self.conv = nn.Sequential(
	            nn.Conv2d(
	                in_channels=self.I_channel_num,
	                out_channels=64,
	                kernel_size=3,
	                stride=1,
	                padding=1,
	                bias=False,
	            ),
	            nn.BatchNorm2d(64),
	            nn.ReLU(True),
	            nn.MaxPool2d(2, 2),  # batch_size x 64 x I_height/2 x I_width/2
	            nn.Conv2d(64, 128, 3, 1, 1, bias=False),
	            nn.BatchNorm2d(128),
	            nn.ReLU(True),
	            nn.MaxPool2d(2, 2),  # batch_size x 128 x I_height/4 x I_width/4
	            nn.Conv2d(128, 256, 3, 1, 1, bias=False),
	            nn.BatchNorm2d(256),
	            nn.ReLU(True),
	            nn.MaxPool2d(2, 2),  # batch_size x 256 x I_height/8 x I_width/8
	            nn.Conv2d(256, 512, 3, 1, 1, bias=False),
	            nn.BatchNorm2d(512),
	            nn.ReLU(True),
	            nn.AdaptiveAvgPool2d(1),  # batch_size x 512
	        )

	        self.localization_fc1 = nn.Sequential(nn.Linear(512, 256), nn.ReLU(True))
	        self.localization_fc2 = nn.Linear(256, self.num_fiducial_points * 2)

	        # Init fc2 in LocalizationNetwork: with zero weights the network initially
	        # outputs exactly the bias, i.e. the fixed control points built below.
	        self.localization_fc2.weight.data.fill_(0)
	        # see RARE paper Fig. 6 (a)
	        points_per_row = num_fiducial_points // 2
	        ctrl_pts_x = np.linspace(-1.0, 1.0, points_per_row)
	        ctrl_pts_y_top = np.linspace(0.0, -1.0, num=points_per_row)
	        ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=points_per_row)
	        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
	        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
	        initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
	        self.localization_fc2.bias.data = (
	            torch.from_numpy(initial_bias).float().view(-1)
	        )

	    def forward(self, batch_I):
	        """
	        input: batch_I : Batch Input Image [batch_size x I_channel_num x I_height x I_width]
	        output: batch_C_prime : Predicted coordinates of fiducial points for input batch [batch_size x F x 2]
	        """
	        batch_size = batch_I.size(0)
	        features = self.conv(batch_I).view(batch_size, -1)
	        hidden = self.localization_fc1(features)
	        batch_C_prime = self.localization_fc2(hidden).view(
	            batch_size, self.num_fiducial_points, 2
	        )
	        return batch_C_prime


	class GridGenerator(nn.Module):
	    """Grid Generator of RARE, which produces P_prime by multipling T with P"""

	    def __init__(self, num_fiducial_points, I_r_size):
	        """Generate P_hat and inv_delta_C for later

	        input:
	            num_fiducial_points : number of fiducial points (F); must be a positive
	                even integer (half on the top edge, half on the bottom edge)
	            I_r_size : (height, width) of the rectified image I_r
	        raises:
	            ValueError if num_fiducial_points is not a positive even number
	        """
	        super(GridGenerator, self).__init__()
	        if num_fiducial_points < 2 or num_fiducial_points % 2 != 0:
	            raise ValueError(
	                "num_fiducial_points must be a positive even number, "
	                f"got {num_fiducial_points}"
	            )
	        self.eps = 1e-6
	        self.I_r_height, self.I_r_width = I_r_size
	        self.num_fiducial_points = num_fiducial_points
	        self.C = self._build_C(self.num_fiducial_points)  # F x 2
	        self.P = self._build_P(self.I_r_width, self.I_r_height)
	        ## for multi-gpu, you need register buffer
	        self.register_buffer(
	            "inv_delta_C",
	            torch.tensor(
	                self._build_inv_delta_C(self.num_fiducial_points, self.C)
	            ).float(),
	        )  # F+3 x F+3
	        self.register_buffer(
	            "P_hat",
	            torch.tensor(
	                self._build_P_hat(self.num_fiducial_points, self.C, self.P)
	            ).float(),
	        )  # n x F+3
	        ## for fine-tuning with different image width, you may use below instead of self.register_buffer
	        # self.inv_delta_C = torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float().cuda() # F+3 x F+3
	        # self.P_hat = torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float().cuda() # n x F+3

	    def _build_C(self, F):
	        """Return coordinates of fiducial points in I_r; C"""
	        points_per_row = F // 2
	        ctrl_pts_x = np.linspace(-1.0, 1.0, points_per_row)
	        ctrl_pts_y_top = -1 * np.ones(points_per_row)
	        ctrl_pts_y_bottom = np.ones(points_per_row)
	        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
	        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
	        C = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
	        return C  # F x 2

	    def _build_inv_delta_C(self, F, C):
	        """Return inv_delta_C which is needed to calculate T"""
	        # Pairwise Euclidean distances between fiducial points, F x F.
	        hat_C = np.linalg.norm(C[:, None, :] - C[None, :, :], axis=2)
	        # The diagonal is set to 1 so that log() below yields 0 there
	        # instead of log(0).
	        np.fill_diagonal(hat_C, 1)
	        # Radial basis r^2 * log(r). The original line had lost its operators
	        # and was not valid Python.
	        hat_C = (hat_C ** 2) * np.log(hat_C)
	        delta_C = np.concatenate(  # F+3 x F+3
	            [
	                np.concatenate([np.ones((F, 1)), C, hat_C], axis=1),  # F x F+3
	                np.concatenate([np.zeros((2, 3)), np.transpose(C)], axis=1),  # 2 x F+3
	                np.concatenate([np.zeros((1, 3)), np.ones((1, F))], axis=1),  # 1 x F+3
	            ],
	            axis=0,
	        )
	        inv_delta_C = np.linalg.inv(delta_C)
	        return inv_delta_C  # F+3 x F+3

	    def _build_P(self, I_r_width, I_r_height):
	        """Return the normalised (x, y) pixel-centre coordinates of I_r, one row per pixel"""
	        I_r_grid_x = (
	            np.arange(-I_r_width, I_r_width, 2) + 1.0
	        ) / I_r_width  # self.I_r_width
	        I_r_grid_y = (
	            np.arange(-I_r_height, I_r_height, 2) + 1.0
	        ) / I_r_height  # self.I_r_height
	        # meshgrid uses its default "xy" indexing, so the stacked array is
	        # self.I_r_height x self.I_r_width x 2 (rows first).
	        P = np.stack(np.meshgrid(I_r_grid_x, I_r_grid_y), axis=2)
	        return P.reshape([-1, 2])  # n (= self.I_r_width x self.I_r_height) x 2

	    def _build_P_hat(self, F, C, P):
	        """Return P_hat = [1, P, rbf(P, C)], the grid lifted into the TPS basis"""
	        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
	        P_tile = np.tile(
	            np.expand_dims(P, axis=1), (1, F, 1)
	        )  # n x 2 -> n x 1 x 2 -> n x F x 2
	        C_tile = np.expand_dims(C, axis=0)  # 1 x F x 2
	        P_diff = P_tile - C_tile  # n x F x 2
	        rbf_norm = np.linalg.norm(P_diff, ord=2, axis=2, keepdims=False)  # n x F
	        # eps keeps log() finite when a grid point coincides with a fiducial point
	        rbf = np.multiply(np.square(rbf_norm), np.log(rbf_norm + self.eps))  # n x F
	        P_hat = np.concatenate([np.ones((n, 1)), P, rbf], axis=1)
	        return P_hat  # n x F+3

	    def build_P_prime(self, batch_C_prime):
	        """Generate Grid from batch_C_prime [batch_size x F x 2]"""
	        batch_size = batch_C_prime.size(0)
	        batch_inv_delta_C = self.inv_delta_C.repeat(batch_size, 1, 1)
	        batch_P_hat = self.P_hat.repeat(batch_size, 1, 1)
	        # Allocate the padding rows on the input's own device (and dtype) instead
	        # of a module-level global, so the module also works on CPU and under
	        # multi-GPU replication.
	        padding = torch.zeros(
	            batch_size, 3, 2, dtype=batch_C_prime.dtype, device=batch_C_prime.device
	        )
	        batch_C_prime_with_zeros = torch.cat(
	            (batch_C_prime, padding), dim=1
	        )  # batch_size x F+3 x 2
	        batch_T = torch.bmm(
	            batch_inv_delta_C, batch_C_prime_with_zeros
	        )  # batch_size x F+3 x 2
	        batch_P_prime = torch.bmm(batch_P_hat, batch_T)  # batch_size x n x 2
	        return batch_P_prime  # batch_size x n x 2


	# NOTE: everything between the triple quotes below is a bare module-level
	# string literal, i.e. disabled code: PyramidPool and PyramidPoolBlock are
	# NOT defined when this module is imported. Kept for reference only.
	"""
	########################################
	######## Pyramid Pooling Block #########
	########################################
	class PyramidPool(nn.Module):
	def __init__(self, pool_kernel_size, in_channels, out_channels):
	super().__init__()
	self.pool_kernel_size = pool_kernel_size
	self.avg_pool_block = nn.Sequential(
	nn.AvgPool2d((1, self.pool_kernel_size), stride=(1, self.pool_kernel_size)),
	nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding="same", bias=False),
	nn.BatchNorm2d(out_channels),
	nn.ELU(inplace=True),
	)

	for m in self.modules():
	if isinstance(m, nn.Conv2d):
	nn.init.xavier_normal_(m.weight)
	elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
	nn.init.constant_(m.weight, 1)
	nn.init.constant_(m.bias, 0)

	def forward(self, x):
	_, _, in_height, in_width = x.size()
	x = self.avg_pool_block(x)
	x = F.interpolate(x, size=(in_height, in_width), mode="bilinear")
	return x


	class PyramidPoolBlock(nn.Module):
	def __init__(self, pyramid_pool_kernel_sizes=[4, 8, 16, 32], num_channels=512):
	super().__init__()
	pp_out_channels = 256
	self.pyramid_pool_layers = nn.ModuleList([PyramidPool(pool_kernel_size=k, in_channels=num_channels, out_channels=pp_out_channels) for k in pyramid_pool_kernel_sizes])
	self.final_layer = nn.Sequential(
	nn.Conv2d((num_channels + (pp_out_channels * len(self.pyramid_pool_layers))), num_channels, (1, 5), stride=1, padding="same"),
	nn.BatchNorm2d(num_channels),
	nn.ELU(inplace=True),
	nn.Dropout(p=0.1),
	)

	def forward(self, input):
	pp_outputs = []
	for pp_layer in self.pyramid_pool_layers:
	pp_output = pp_layer(input)
	pp_outputs.append(pp_output)
	pp_outputs.append(input)
	x = torch.cat(pp_outputs, dim=1)
	x = self.final_layer(x)
	return x
	"""