import torch
import numpy as np
import torch.nn as nn
from typing import List
from torch import Tensor
import torch.nn.functional as F
# NOTE: model_urls and load_state_dict_from_url are importable from
# torchvision.models.resnet only in older torchvision releases (the pre-0.13
# weights API), so this module assumes such a version is installed
from torchvision.models.resnet import BasicBlock, model_urls, load_state_dict_from_url, conv1x1, conv3x3
# fall back to CPU so the module still runs on machines without a GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CustomResNet(nn.Module):
def __init__(
self,
layers: List[int],
block=BasicBlock,
zero_init_residual=False,
groups=1,
num_classes=1000,
width_per_group=64,
replace_stride_with_dilation=None,
norm_layer=None,
):
super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        # assign unconditionally, otherwise a caller-supplied norm_layer is lost
        self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError(
"replace_stride_with_dilation should be None "
f"or a 3-element tuple, got {replace_stride_with_dilation}"
)
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = self._norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
        # (2, 1) strides here and in layer2/layer4 halve the height but keep the
        # width, preserving horizontal resolution along the text line
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=(2, 1), padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=(2, 1), dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=(2, 2), dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=(2, 1), dilate=replace_stride_with_dilation[2])
        # avgpool and fc are never used by forward(), but keeping them lets the
        # stock torchvision checkpoint load with strict=True
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type]
def _make_layer(
self,
block,
planes,
blocks,
stride=1,
dilate=False,
) -> nn.Sequential:
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
        if dilate:
            # NOTE: this multiply assumes an integer stride; with the tuple
            # strides used above, replace_stride_with_dilation must stay all False
            self.dilation *= stride
            stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(
block(
self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
)
)
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
)
)
return nn.Sequential(*layers)
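    # A worked shape example of the tuple-stride trick (a sketch; the sizes are
    # illustrative, not fixed by the class): with stride=(2, 1) only the height
    # is downsampled, e.g. a [B, 64, 32, 256] map entering layer2 leaves as
    # [B, 128, 16, 256], so a text line keeps its horizontal resolution.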
    def _forward_impl(self, x: Tensor) -> Tensor:
        # unlike torchvision's ResNet, this stops after layer4 and returns the
        # spatial feature map (no avgpool / fc / flatten)
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
return x
def forward(self, x: Tensor) -> Tensor:
return self._forward_impl(x)
def _resnet(layers: List[int], pretrained=True) -> CustomResNet:
    model = CustomResNet(layers)
    if pretrained:
        # the modified strides change feature-map sizes, not weight shapes, so
        # the ImageNet resnet34 checkpoint still loads with strict matching
        model.load_state_dict(load_state_dict_from_url(model_urls["resnet34"]))
    return model
def resnet34(*, pretrained=True) -> CustomResNet:
    """ResNet-34 backbone from `Deep Residual Learning for Image Recognition
    <https://arxiv.org/pdf/1512.03385.pdf>`__, with strides modified for
    text-line images.

    Args:
        pretrained (bool, optional): If True, load the torchvision ImageNet
            weights for resnet34. Default is True.
    """
    return _resnet([3, 4, 6, 3], pretrained=pretrained)
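# Usage sketch for the backbone (hedged: pretrained=False avoids any download,
# and the 64x256 input size is just an example):
#   >>> backbone = resnet34(pretrained=False)
#   >>> backbone(torch.randn(1, 3, 64, 256)).shape
#   torch.Size([1, 512, 2, 64])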
class ResNetFeatureExtractor(nn.Module):
"""
Defines Base ResNet-34 feature extractor
"""
def __init__(self, pretrained=True):
"""
---------
Arguments
---------
pretrained : bool (default=True)
boolean to indicate whether to use a pretrained resnet model or not
"""
super().__init__()
self.output_channels = 512
self.resnet34 = resnet34(pretrained=pretrained)
    def forward(self, x):
        block1 = self.resnet34.conv1(x)
        block1 = self.resnet34.bn1(block1)
        block1 = self.resnet34.relu(block1)  # [64, H/2, W/2]
        block2 = self.resnet34.maxpool(block1)
        block2 = self.resnet34.layer1(block2)  # [64, H/4, W/2]
        block3 = self.resnet34.layer2(block2)  # [128, H/8, W/2]
        block4 = self.resnet34.layer3(block3)  # [256, H/16, W/4]
        resnet_features = self.resnet34.layer4(block4)  # [512, H/32, W/4]
        # [B, 512, H/32, W/4] -- width shrinks only 4x because of the (2, 1) strides
        return resnet_features
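# Minimal shape check for the extractor (assumes a 3-channel line image of an
# arbitrary example size):
#   >>> extractor = ResNetFeatureExtractor(pretrained=False)
#   >>> extractor(torch.randn(2, 3, 64, 512)).shape
#   torch.Size([2, 512, 2, 128])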
#########################################
### STN - Spatial Transformer Network ###
#########################################
class TPS_SpatialTransformerNetwork(nn.Module):
""" Rectification Network of RARE, namely TPS based STN """
def __init__(self, num_fiducial_points, I_size, I_r_size, I_channel_num=1):
""" Based on RARE TPS
input:
batch_I: Batch Input Image [batch_size x I_channel_num x I_height x I_width]
I_size : (height, width) of the input image I
I_r_size : (height, width) of the rectified image I_r
I_channel_num : the number of channels of the input image I
output:
batch_I_r: rectified image [batch_size x I_channel_num x I_r_height x I_r_width]
"""
super(TPS_SpatialTransformerNetwork, self).__init__()
self.num_fiducial_points = num_fiducial_points
self.I_size = I_size
self.I_r_size = I_r_size # = (I_r_height, I_r_width)
self.I_channel_num = I_channel_num
self.LocalizationNetwork = LocalizationNetwork(self.num_fiducial_points, self.I_channel_num)
self.GridGenerator = GridGenerator(self.num_fiducial_points, self.I_r_size)
def forward(self, batch_I):
batch_C_prime = self.LocalizationNetwork(batch_I) # batch_size x K x 2
build_P_prime = self.GridGenerator.build_P_prime(batch_C_prime) # batch_size x n (= I_r_width x I_r_height) x 2
build_P_prime_reshape = build_P_prime.reshape([build_P_prime.size(0), self.I_r_size[0], self.I_r_size[1], 2])
        # a plain string comparison misorders versions (e.g. "1.10.0" < "1.2.0"),
        # so compare parsed major/minor; align_corners was introduced in torch 1.3
        major, minor = (int(v) for v in torch.__version__.split("+")[0].split(".")[:2])
        if (major, minor) >= (1, 3):
            batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, padding_mode="border", align_corners=True)
        else:
            batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, padding_mode="border")
return batch_I_r
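# Hedged usage sketch for the rectifier (20 fiducial points and a 32x128 image
# are illustrative values, not ones this module fixes):
#   >>> tps = TPS_SpatialTransformerNetwork(
#   ...     num_fiducial_points=20, I_size=(32, 128), I_r_size=(32, 128), I_channel_num=1)
#   >>> tps(torch.randn(4, 1, 32, 128)).shape
#   torch.Size([4, 1, 32, 128])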
class LocalizationNetwork(nn.Module):
""" Localization Network of RARE, which predicts C' (K x 2) from I (I_width x I_height) """
def __init__(self, num_fiducial_points, I_channel_num):
super(LocalizationNetwork, self).__init__()
self.num_fiducial_points = num_fiducial_points
self.I_channel_num = I_channel_num
self.conv = nn.Sequential(
nn.Conv2d(in_channels=self.I_channel_num, out_channels=64, kernel_size=3, stride=1, padding=1,
bias=False), nn.BatchNorm2d(64), nn.ReLU(True),
nn.MaxPool2d(2, 2), # batch_size x 64 x I_height/2 x I_width/2
nn.Conv2d(64, 128, 3, 1, 1, bias=False), nn.BatchNorm2d(128), nn.ReLU(True),
nn.MaxPool2d(2, 2), # batch_size x 128 x I_height/4 x I_width/4
nn.Conv2d(128, 256, 3, 1, 1, bias=False), nn.BatchNorm2d(256), nn.ReLU(True),
nn.MaxPool2d(2, 2), # batch_size x 256 x I_height/8 x I_width/8
nn.Conv2d(256, 512, 3, 1, 1, bias=False), nn.BatchNorm2d(512), nn.ReLU(True),
nn.AdaptiveAvgPool2d(1) # batch_size x 512
)
self.localization_fc1 = nn.Sequential(nn.Linear(512, 256), nn.ReLU(True))
self.localization_fc2 = nn.Linear(256, self.num_fiducial_points * 2)
# Init fc2 in LocalizationNetwork
self.localization_fc2.weight.data.fill_(0)
""" see RARE paper Fig. 6 (a) """
ctrl_pts_x = np.linspace(-1.0, 1.0, int(num_fiducial_points / 2))
ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(num_fiducial_points / 2))
ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(num_fiducial_points / 2))
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
self.localization_fc2.bias.data = torch.from_numpy(initial_bias).float().view(-1)
def forward(self, batch_I):
"""
input: batch_I : Batch Input Image [batch_size x I_channel_num x I_height x I_width]
output: batch_C_prime : Predicted coordinates of fiducial points for input batch [batch_size x F x 2]
"""
batch_size = batch_I.size(0)
features = self.conv(batch_I).view(batch_size, -1)
batch_C_prime = self.localization_fc2(self.localization_fc1(features)).view(batch_size, self.num_fiducial_points, 2)
return batch_C_prime
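# Sanity-check sketch: because localization_fc2's weights start at zero, the
# first forward pass returns exactly the initial fiducial layout from the bias
# (values here assume num_fiducial_points=20):
#   >>> loc = LocalizationNetwork(num_fiducial_points=20, I_channel_num=1)
#   >>> loc(torch.randn(2, 1, 32, 100))[0, 0]   # leftmost top point, ~(-1.0, 0.0)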
class GridGenerator(nn.Module):
""" Grid Generator of RARE, which produces P_prime by multipling T with P """
def __init__(self, num_fiducial_points, I_r_size):
""" Generate P_hat and inv_delta_C for later """
super(GridGenerator, self).__init__()
self.eps = 1e-6
self.I_r_height, self.I_r_width = I_r_size
self.num_fiducial_points = num_fiducial_points
self.C = self._build_C(self.num_fiducial_points) # F x 2
self.P = self._build_P(self.I_r_width, self.I_r_height)
## for multi-gpu, you need register buffer
self.register_buffer("inv_delta_C", torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float()) # F+3 x F+3
self.register_buffer("P_hat", torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float()) # n x F+3
## for fine-tuning with different image width, you may use below instead of self.register_buffer
#self.inv_delta_C = torch.tensor(self._build_inv_delta_C(self.num_fiducial_points, self.C)).float().cuda() # F+3 x F+3
#self.P_hat = torch.tensor(self._build_P_hat(self.num_fiducial_points, self.C, self.P)).float().cuda() # n x F+3
def _build_C(self, F):
""" Return coordinates of fiducial points in I_r; C """
ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
ctrl_pts_y_top = -1 * np.ones(int(F / 2))
ctrl_pts_y_bottom = np.ones(int(F / 2))
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
C = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
return C # F x 2
def _build_inv_delta_C(self, F, C):
""" Return inv_delta_C which is needed to calculate T """
hat_C = np.zeros((F, F), dtype=float) # F x F
for i in range(0, F):
for j in range(i, F):
r = np.linalg.norm(C[i] - C[j])
hat_C[i, j] = r
hat_C[j, i] = r
np.fill_diagonal(hat_C, 1)
hat_C = (hat_C ** 2) * np.log(hat_C)
# print(C.shape, hat_C.shape)
delta_C = np.concatenate( # F+3 x F+3
[
np.concatenate([np.ones((F, 1)), C, hat_C], axis=1), # F x F+3
np.concatenate([np.zeros((2, 3)), np.transpose(C)], axis=1), # 2 x F+3
np.concatenate([np.zeros((1, 3)), np.ones((1, F))], axis=1) # 1 x F+3
],
axis=0
)
inv_delta_C = np.linalg.inv(delta_C)
return inv_delta_C # F+3 x F+3
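    # The system just inverted is the standard thin-plate-spline setup: with
    # kernel U(r) = r^2 log r between control points C, delta_C has block form
    #
    #     [ 1   C   U  ]
    #     [ 0   0   C^T]
    #     [ 0   0   1^T]
    #
    # so the TPS parameters are T = inv_delta_C @ [C'; zeros(3, 2)], which is
    # exactly what build_P_prime computes per batch element below.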
def _build_P(self, I_r_width, I_r_height):
I_r_grid_x = (np.arange(-I_r_width, I_r_width, 2) + 1.0) / I_r_width # self.I_r_width
I_r_grid_y = (np.arange(-I_r_height, I_r_height, 2) + 1.0) / I_r_height # self.I_r_height
P = np.stack( # self.I_r_width x self.I_r_height x 2
np.meshgrid(I_r_grid_x, I_r_grid_y),
axis=2
)
return P.reshape([-1, 2]) # n (= self.I_r_width x self.I_r_height) x 2
def _build_P_hat(self, F, C, P):
n = P.shape[0] # n (= self.I_r_width x self.I_r_height)
P_tile = np.tile(np.expand_dims(P, axis=1), (1, F, 1)) # n x 2 -> n x 1 x 2 -> n x F x 2
C_tile = np.expand_dims(C, axis=0) # 1 x F x 2
P_diff = P_tile - C_tile # n x F x 2
rbf_norm = np.linalg.norm(P_diff, ord=2, axis=2, keepdims=False) # n x F
rbf = np.multiply(np.square(rbf_norm), np.log(rbf_norm + self.eps)) # n x F
P_hat = np.concatenate([np.ones((n, 1)), P, rbf], axis=1)
return P_hat # n x F+3
def build_P_prime(self, batch_C_prime):
""" Generate Grid from batch_C_prime [batch_size x F x 2] """
batch_size = batch_C_prime.size(0)
batch_inv_delta_C = self.inv_delta_C.repeat(batch_size, 1, 1)
batch_P_hat = self.P_hat.repeat(batch_size, 1, 1)
        # pad C' with three zero rows for the affine part of the TPS system; use
        # the input's device instead of the module-level global for portability
        batch_C_prime_with_zeros = torch.cat(
            (batch_C_prime, torch.zeros(batch_size, 3, 2, dtype=torch.float32, device=batch_C_prime.device)), dim=1
        )  # batch_size x F+3 x 2
batch_T = torch.bmm(batch_inv_delta_C, batch_C_prime_with_zeros) # batch_size x F+3 x 2
batch_P_prime = torch.bmm(batch_P_hat, batch_T) # batch_size x n x 2
return batch_P_prime # batch_size x n x 2
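# End-to-end grid sketch (illustrative sizes): with I_r_size=(32, 100) and
# F=20 fiducial points, build_P_prime maps [B, 20, 2] predictions to a
# [B, 3200, 2] sampling grid, which the STN reshapes to [B, 32, 100, 2] for
# F.grid_sample:
#   >>> gg = GridGenerator(num_fiducial_points=20, I_r_size=(32, 100))
#   >>> gg.build_P_prime(torch.zeros(1, 20, 2)).shape
#   torch.Size([1, 3200, 2])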
"""
########################################
######## Pyramid Pooling Block #########
########################################
class PyramidPool(nn.Module):
def __init__(self, pool_kernel_size, in_channels, out_channels):
super().__init__()
self.pool_kernel_size = pool_kernel_size
self.avg_pool_block = nn.Sequential(
nn.AvgPool2d((1, self.pool_kernel_size), stride=(1, self.pool_kernel_size)),
nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding="same", bias=False),
nn.BatchNorm2d(out_channels),
nn.ELU(inplace=True),
)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_normal_(m.weight)
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def forward(self, x):
_, _, in_height, in_width = x.size()
x = self.avg_pool_block(x)
x = F.interpolate(x, size=(in_height, in_width), mode="bilinear")
return x
class PyramidPoolBlock(nn.Module):
def __init__(self, pyramid_pool_kernel_sizes=[4, 8, 16, 32], num_channels=512):
super().__init__()
pp_out_channels = 256
self.pyramid_pool_layers = nn.ModuleList([PyramidPool(pool_kernel_size=k, in_channels=num_channels, out_channels=pp_out_channels) for k in pyramid_pool_kernel_sizes])
self.final_layer = nn.Sequential(
nn.Conv2d((num_channels + (pp_out_channels * len(self.pyramid_pool_layers))), num_channels, (1, 5), stride=1, padding="same"),
nn.BatchNorm2d(num_channels),
nn.ELU(inplace=True),
nn.Dropout(p=0.1),
)
def forward(self, input):
pp_outputs = []
for pp_layer in self.pyramid_pool_layers:
pp_output = pp_layer(input)
pp_outputs.append(pp_output)
pp_outputs.append(input)
x = torch.cat(pp_outputs, dim=1)
x = self.final_layer(x)
return x
"""