diff --git a/IndicPhotoOCR/detection/textbpn/__init__.py b/IndicPhotoOCR/detection/textbpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/IndicPhotoOCR/detection/textbpn/cfglib/config.py b/IndicPhotoOCR/detection/textbpn/cfglib/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ae1ea831dd23c535b9d220932a26e5ba823f474b --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/cfglib/config.py @@ -0,0 +1,90 @@ +from easydict import EasyDict +import torch +import os + +config = EasyDict() + + +# Normalize image +config.means = (0.485, 0.456, 0.406) +config.stds = (0.229, 0.224, 0.225) + +config.gpu = "1" + +# Experiment name # +config.exp_name = "Synthtext" + +# dataloader jobs number +config.num_workers = 24 + +# batch_size +config.batch_size = 12 + +# training epoch number +config.max_epoch = 200 + +config.start_epoch = 0 + +# learning rate +config.lr = 1e-4 + +# using GPU +config.cuda = False + +config.output_dir = 'output' + +config.input_size = 640 + +# max polygon per image +# synText, total-text:64; CTW1500: 64; icdar: 64; MLT: 32; TD500: 64. +config.max_annotation = 64 + +# adj num for graph +config.adj_num = 4 + +# control points number +config.num_points = 20 + +# use hard examples (annotated as '#') +config.use_hard = True + +# Load data into memory at one time +config.load_memory = False + +# prediction on 1/scale feature map +config.scale = 1 + +# # clip gradient of loss +config.grad_clip = 25 + +# demo tcl threshold +config.dis_threshold = 0.4 + +config.cls_threshold = 0.8 + +# Contour approximation factor +config.approx_factor = 0.004 + + +def update_config(config, extra_config): + for k, v in vars(extra_config).items(): + config[k] = v + # print(config.gpu) + # config.device = torch.device('cuda') if config.cuda else torch.device('cpu') + config.device = torch.device('cpu') + + +def print_config(config): + print('==========Options============') + for k, v in config.items(): + print('{}: {}'.format(k, v)) + print('=============End=============') + + + +################### MY Settings ################## +config.resume=True + +config.device="cpu" + +# config.test_size = [224, 224] \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/cfglib/option.py b/IndicPhotoOCR/detection/textbpn/cfglib/option.py new file mode 100644 index 0000000000000000000000000000000000000000..6b487d0a37e6ea9eaa4ab9c6d901f2253f38855c --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/cfglib/option.py @@ -0,0 +1,123 @@ +import argparse +import torch +import os +import torch.backends.cudnn as cudnn + +from datetime import datetime + + +def str2bool(v): + return v.lower() in ("yes", "true", "t", "1") + + +def arg2str(args): + args_dict = vars(args) + option_str = datetime.now().strftime('%b%d_%H-%M-%S') + '\n' + + for k, v in sorted(args_dict.items()): + option_str += ('{}: {}\n'.format(str(k), str(v))) + + return option_str + + +class BaseOptions(object): + + def __init__(self): + + self.parser = argparse.ArgumentParser() + + # basic opts + self.parser.add_argument('--exp_name', default="TD500", type=str, + choices=['Synthtext', 'Totaltext', 'Ctw1500','Icdar2015', + "MLT2017", 'TD500', "MLT2019", "ArT", "ALL"], help='Experiment name') + self.parser.add_argument("--gpu", default="1", help="set gpu id", type=str) + self.parser.add_argument('--resume', default=None, type=str, help='Path to target resume checkpoint') + self.parser.add_argument('--num_workers', default=24, type=int, 
help='Number of workers used in dataloading') + self.parser.add_argument('--cuda', default=True, type=str2bool, help='Use cuda to train model') + self.parser.add_argument('--mgpu', action='store_true', help='Use multi-gpu to train model') + self.parser.add_argument('--save_dir', default='./model/', help='Path to save checkpoint models') + self.parser.add_argument('--vis_dir', default='./vis/', help='Path to save visualization images') + self.parser.add_argument('--log_dir', default='./logs/', help='Path to tensorboard log') + self.parser.add_argument('--loss', default='CrossEntropyLoss', type=str, help='Training Loss') + # self.parser.add_argument('--input_channel', default=1, type=int, help='number of input channels' ) + self.parser.add_argument('--pretrain', default=False, type=str2bool, help='Pretrained AutoEncoder model') + self.parser.add_argument('--verbose', '-v', default=True, type=str2bool, help='Whether to output debug info') + self.parser.add_argument('--viz', action='store_true', help='Whether to output debug info') + # self.parser.add_argument('--viz', default=True, type=str2bool, help='Whether to output debug info') + + # train opts + self.parser.add_argument('--max_epoch', default=250, type=int, help='Max epochs') + self.parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, help='initial learning rate') + self.parser.add_argument('--lr_adjust', default='fix', + choices=['fix', 'poly'], type=str, help='Learning Rate Adjust Strategy') + self.parser.add_argument('--stepvalues', default=[], nargs='+', type=int, help='# of iter to change lr') + self.parser.add_argument('--weight_decay', '--wd', default=0., type=float, help='Weight decay for SGD') + self.parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD lr') + self.parser.add_argument('--momentum', default=0.9, type=float, help='momentum') + self.parser.add_argument('--batch_size', default=6, type=int, help='Batch size for training') + self.parser.add_argument('--optim', default='Adam', type=str, choices=['SGD', 'Adam'], help='Optimizer') + self.parser.add_argument('--save_freq', default=5, type=int, help='save weights every # epoch') + self.parser.add_argument('--display_freq', default=10, type=int, help='display training metrics every # iter') + self.parser.add_argument('--viz_freq', default=50, type=int, help='visualize training process every # iter') + self.parser.add_argument('--log_freq', default=10000, type=int, help='log to tensorboard every # iterations') + self.parser.add_argument('--val_freq', default=1000, type=int, help='do validation every # iterations') + + # backbone + self.parser.add_argument('--scale', default=1, type=int, help='prediction on 1/scale feature map') + self.parser.add_argument('--net', default='resnet50', type=str, + choices=['vgg', 'resnet50', 'resnet18', + "deformable_resnet18", "deformable_resnet50"], + help='Network architecture') + # data args + self.parser.add_argument('--load_memory', default=False, type=str2bool, help='Load data into memory') + self.parser.add_argument('--rescale', type=float, default=255.0, help='rescale factor') + self.parser.add_argument('--input_size', default=640, type=int, help='model input size') + self.parser.add_argument('--test_size', default=[640, 960], type=int, nargs='+', help='test size') + + # eval args00 + self.parser.add_argument('--checkepoch', default=1070, type=int, help='Load checkpoint number') + self.parser.add_argument('--start_epoch', default=0, type=int, help='start epoch number') + 
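+        # NOTE: the two thresholds below are inference-time post-processing cut-offs; related defaults (cls_threshold, dis_threshold) also live in cfglib/config.py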
self.parser.add_argument('--cls_threshold', default=0.875, type=float, help='threshold of pse') + self.parser.add_argument('--dis_threshold', default=0.35, type=float, help='filter the socre < score_i') + + # demo args + self.parser.add_argument('--img_root', default=None, type=str, help='Path to deploy images') + + def parse(self, fixed=None): + + if fixed is not None: + args = self.parser.parse_args(fixed) + else: + args = self.parser.parse_args() + + return args + + def initialize(self, fixed=None): + + # Parse options + self.args = self.parse(fixed) + os.environ['CUDA_VISIBLE_DEVICES'] = self.args.gpu + + # Setting default torch Tensor type + if self.args.cuda and torch.cuda.is_available(): + torch.set_default_tensor_type('torch.cuda.FloatTensor') + cudnn.benchmark = True + else: + torch.set_default_tensor_type('torch.FloatTensor') + + # Create weights saving directory + if not os.path.exists(self.args.save_dir): + os.mkdir(self.args.save_dir) + + # Create weights saving directory of target model + model_save_path = os.path.join(self.args.save_dir, self.args.exp_name) + + if not os.path.exists(model_save_path): + os.mkdir(model_save_path) + + return self.args + + def update(self, args, extra_options): + + for k, v in extra_options.items(): + setattr(args, k, v) diff --git a/IndicPhotoOCR/detection/textbpn/models/TextBPN_resnet50_300.pth b/IndicPhotoOCR/detection/textbpn/models/TextBPN_resnet50_300.pth new file mode 100644 index 0000000000000000000000000000000000000000..8426c51dd291d2932737b611c63aa87d5f4098c9 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/models/TextBPN_resnet50_300.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b735b9c93c8758972d3b8cfd3ef8e1c09afa8cd9106f4cb11406300b141b1d78 +size 145703602 diff --git a/IndicPhotoOCR/detection/textbpn/network/Reg_loss.py b/IndicPhotoOCR/detection/textbpn/network/Reg_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..0ca0c0dcd96b44d2a1dead384eacd7c06f69d75b --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/Reg_loss.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- +# @Time : 10/1/21 +# @Author : GXYM +import torch +from torch import nn +import numpy as np +import torch.nn.functional as F + + +class PolyMatchingLoss(nn.Module): + def __init__(self, pnum, device, loss_type="L1"): + super(PolyMatchingLoss, self).__init__() + + self.pnum = pnum + self.device = device + self.loss_type = loss_type + self.smooth_L1 = F.smooth_l1_loss + self.L2_loss = torch.nn.MSELoss(reduce=False, size_average=False) + + batch_size = 1 + pidxall = np.zeros(shape=(batch_size, pnum, pnum), dtype=np.int32) + for b in range(batch_size): + for i in range(pnum): + pidx = (np.arange(pnum) + i) % pnum + pidxall[b, i] = pidx + + pidxall = torch.from_numpy(np.reshape(pidxall, newshape=(batch_size, -1))).to(device) + self.feature_id = pidxall.unsqueeze_(2).long().expand(pidxall.size(0), pidxall.size(1), 2).detach() + print(self.feature_id.shape) + + def match_loss(self, pred, gt): + batch_size = pred.shape[0] + feature_id = self.feature_id.expand(batch_size, self.feature_id.size(1), 2) + + gt_expand = torch.gather(gt, 1, feature_id).view(batch_size, self.pnum, self.pnum, 2) + pred_expand = pred.unsqueeze(1) + + if self.loss_type == "L2": + dis = self.L2_loss(pred_expand, gt_expand) + dis = dis.sum(3).sqrt().mean(2) + elif self.loss_type == "L1": + dis = self.smooth_L1(pred_expand, gt_expand, reduction='none') + dis = dis.sum(3).mean(2) + + min_dis, min_id = torch.min(dis, dim=1, keepdim=True) + + return 
min_dis + + def forward(self, pred_list, gt): + loss = torch.tensor(0.) + for pred in pred_list: + loss += torch.mean(self.match_loss(pred, gt)) + + return loss / torch.tensor(len(pred_list)) + + # los = [] + # for pred in pred_list: + # los.append(self.match_loss(pred, gt)) + # + # los_b = torch.tensor(0.) + # loss_c = torch.tensor(0.) + # for i, _ in enumerate(los): + # los_b += torch.mean(los[i]) + # loss_c += (torch.mean(torch.clamp(los[i] - los[i - 1], min=0.0)) if i > 0 else torch.tensor(0.)) + # loss = los_b / torch.tensor(len(los)) + 0.5*loss_c / torch.tensor(len(los)-1) + # + # return loss + + +class AttentionLoss(nn.Module): + def __init__(self, beta=4, gamma=0.5): + super(AttentionLoss, self).__init__() + + self.beta = beta + self.gamma = gamma + + def forward(self, pred, gt): + num_pos = torch.sum(gt) + num_neg = torch.sum(1 - gt) + alpha = num_neg / (num_pos + num_neg) + edge_beta = torch.pow(self.beta, torch.pow(1 - pred, self.gamma)) + bg_beta = torch.pow(self.beta, torch.pow(pred, self.gamma)) + + loss = 0 + loss = loss - alpha * edge_beta * torch.log(pred) * gt + loss = loss - (1 - alpha) * bg_beta * torch.log(1 - pred) * (1 - gt) + return torch.mean(loss) + + +class GeoCrossEntropyLoss(nn.Module): + def __init__(self): + super(GeoCrossEntropyLoss, self).__init__() + + def forward(self, output, target, poly): + output = torch.nn.functional.softmax(output, dim=1) + output = torch.log(torch.clamp(output, min=1e-4)) + poly = poly.view(poly.size(0), 4, poly.size(1) // 4, 2) + target = target[..., None, None].expand(poly.size(0), poly.size(1), 1, poly.size(3)) + target_poly = torch.gather(poly, 2, target) + sigma = (poly[:, :, 0] - poly[:, :, 1]).pow(2).sum(2, keepdim=True) + kernel = torch.exp(-(poly - target_poly).pow(2).sum(3) / (sigma / 3)) + loss = -(output * kernel.transpose(2, 1)).sum(1).mean() + return loss + + +class AELoss(nn.Module): + def __init__(self): + super(AELoss, self).__init__() + + def forward(self, ae, ind, ind_mask): + """ + ae: [b, 1, h, w] + ind: [b, max_objs, max_parts] + ind_mask: [b, max_objs, max_parts] + obj_mask: [b, max_objs] + """ + # first index + b, _, h, w = ae.shape + b, max_objs, max_parts = ind.shape + obj_mask = torch.sum(ind_mask, dim=2) != 0 + + ae = ae.view(b, h * w, 1) + seed_ind = ind.view(b, max_objs * max_parts, 1) + tag = ae.gather(1, seed_ind).view(b, max_objs, max_parts) + + # compute the mean + tag_mean = tag * ind_mask + tag_mean = tag_mean.sum(2) / (ind_mask.sum(2) + 1e-4) + + # pull ae of the same object to their mean + pull_dist = (tag - tag_mean.unsqueeze(2)).pow(2) * ind_mask + obj_num = obj_mask.sum(dim=1).float() + pull = (pull_dist.sum(dim=(1, 2)) / (obj_num + 1e-4)).sum() + pull /= b + + # push away the mean of different objects + push_dist = torch.abs(tag_mean.unsqueeze(1) - tag_mean.unsqueeze(2)) + push_dist = 1 - push_dist + push_dist = nn.functional.relu(push_dist, inplace=True) + obj_mask = (obj_mask.unsqueeze(1) + obj_mask.unsqueeze(2)) == 2 + push_dist = push_dist * obj_mask.float() + push = ((push_dist.sum(dim=(1, 2)) - obj_num) / (obj_num * (obj_num - 1) + 1e-4)).sum() + push /= b + return pull, push + + +def smooth_l1_loss(inputs, target, sigma=9.0): + try: + diff = torch.abs(inputs - target) + less_one = (diff < 1.0 / sigma).float() + loss = less_one * 0.5 * diff ** 2 * sigma \ + + torch.abs(torch.tensor(1.0) - less_one) * (diff - 0.5 / sigma) + loss = torch.mean(loss) if loss.numel() > 0 else torch.tensor(0.0) + except Exception as e: + print('RPN_REGR_Loss Exception:', e) + loss = torch.tensor(0.0) + + 
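+    # Smooth L1 with transition point 1/sigma: 0.5*sigma*d^2 for |d| < 1/sigma, otherwise |d| - 0.5/sigma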
return loss + + +def _neg_loss(pred, gt): + ''' Modified focal loss. Exactly the same as CornerNet. + Runs faster and costs a little bit more memory + Arguments: + pred (batch x c x h x w) + gt_regr (batch x c x h x w) + ''' + pos_inds = gt.eq(1).float() + neg_inds = gt.lt(1).float() + + neg_weights = torch.pow(1 - gt, 4) + + loss = 0 + + pos_loss = torch.log(pred) * torch.pow(1 - pred, 2) * pos_inds + neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds + + num_pos = pos_inds.float().sum() + pos_loss = pos_loss.sum() + neg_loss = neg_loss.sum() + + if num_pos == 0: + loss = loss - neg_loss + else: + loss = loss - (pos_loss + neg_loss) / num_pos + return loss + + +class FocalLoss(nn.Module): + '''nn.Module warpper for focal loss''' + def __init__(self): + super(FocalLoss, self).__init__() + self.neg_loss = _neg_loss + + def forward(self, out, target): + return self.neg_loss(out, target) \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/network/Seg_loss.py b/IndicPhotoOCR/detection/textbpn/network/Seg_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a03507f650f17d747c7f2eba2f1d31f57e411cb1 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/Seg_loss.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- +# @Time : 10/1/21 +# @Author : GXYM +import torch +from torch import nn +import numpy as np + + +class SegmentLoss(nn.Module): + def __init__(self, Lambda, ratio=3, reduction='mean'): + """Implement PSE Loss. + """ + super(SegmentLoss, self).__init__() + assert reduction in ['mean', 'sum'], " reduction must in ['mean','sum']" + self.Lambda = Lambda + self.ratio = ratio + self.reduction = reduction + + def forward(self, outputs, labels, training_masks, th=0.5): + texts = outputs[:, -1, :, :] + kernels = outputs[:, :-1, :, :] + gt_texts = labels[:, -1, :, :] + gt_kernels = labels[:, :-1, :, :] + + selected_masks = self.ohem_batch(texts, gt_texts, training_masks) + selected_masks = selected_masks.to(outputs.device) + + loss_text = self.dice_loss(texts, gt_texts, selected_masks) + + loss_kernels = [] + # mask0 = torch.sigmoid(texts).data.cpu().numpy() + mask0 = texts.data.cpu().numpy() + mask1 = training_masks.data.cpu().numpy() + selected_masks = ((mask0 > th) & (mask1 > th)).astype('float32') + selected_masks = torch.from_numpy(selected_masks).float() + selected_masks = selected_masks.to(outputs.device) + kernels_num = gt_kernels.size()[1] + for i in range(kernels_num): + kernel_i = kernels[:, i, :, :] + gt_kernel_i = gt_kernels[:, i, :, :] + loss_kernel_i = self.dice_loss(kernel_i, gt_kernel_i, selected_masks) + loss_kernels.append(loss_kernel_i) + loss_kernels = torch.stack(loss_kernels).mean(0) + if self.reduction == 'mean': + loss_text = loss_text.mean() + loss_kernels = loss_kernels.mean() + elif self.reduction == 'sum': + loss_text = loss_text.sum() + loss_kernels = loss_kernels.sum() + + loss = self.Lambda *loss_text + (1-self.Lambda)*loss_kernels + return loss_text, loss_kernels, loss + + def dice_loss(self, input, target, mask): + # input = torch.sigmoid(input) + + input = input.contiguous().view(input.size()[0], -1) + target = target.contiguous().view(target.size()[0], -1) + mask = mask.contiguous().view(mask.size()[0], -1) + + input = input * mask + target = (target.float()) * mask + + a = torch.sum(input * target, 1) + b = torch.sum(input * input, 1) + 0.001 + c = torch.sum(target * target, 1) + 0.001 + d = (2 * a) / (b + c) + return 1 - d + + def ohem_single(self, score, gt_text, training_mask, th=0.5): + 
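+        # Online hard example mining: keep every positive pixel and at most 3x as many of the highest-scoring negative pixels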
pos_num = (int)(np.sum(gt_text > th)) - (int)(np.sum((gt_text > th) & (training_mask <= th))) + + if pos_num == 0: + # selected_mask = gt_text.copy() * 0 # may be not good + selected_mask = training_mask + selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32') + return selected_mask + + neg_num = (int)(np.sum(gt_text <= th)) + neg_num = (int)(min(pos_num * 3, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32') + return selected_mask + + neg_score = score[gt_text <= th] + # Sort negative-sample scores from high to low + neg_score_sorted = np.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + # Select the mask of high-scoring negative samples and all positive samples + selected_mask = ((score >= threshold) | (gt_text > th)) & (training_mask > th) + selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32') + return selected_mask + + def ohem_batch(self, scores, gt_texts, training_masks): + scores = scores.data.cpu().numpy() + gt_texts = gt_texts.data.cpu().numpy() + training_masks = training_masks.data.cpu().numpy() + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append(self.ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[i, :, :])) + + selected_masks = np.concatenate(selected_masks, 0) + selected_masks = torch.from_numpy(selected_masks).float() + + return selected_masks diff --git a/IndicPhotoOCR/detection/textbpn/network/__init__.py b/IndicPhotoOCR/detection/textbpn/network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b9742821a6f164200bc145e7a847382f08778303 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/__init__.py @@ -0,0 +1 @@ +from . 
import * \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/__init__.py b/IndicPhotoOCR/detection/textbpn/network/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d05af9e17f2bb084365379d39f38305dc23f339e --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/__init__.py @@ -0,0 +1 @@ +from .resnet import resnet18, resnet34, resnet50, resnet101, deformable_resnet50, deformable_resnet18 \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/Makefile b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c50242699a4ddb7d97650378ef2a199fde6b3d99 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/Makefile @@ -0,0 +1,6 @@ +#!/bin/bash +rm *.so +python setup.py build_ext --inplace +rm -rf ./build + + diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/Makefile.sh b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/Makefile.sh new file mode 100644 index 0000000000000000000000000000000000000000..c50242699a4ddb7d97650378ef2a199fde6b3d99 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/Makefile.sh @@ -0,0 +1,6 @@ +#!/bin/bash +rm *.so +python setup.py build_ext --inplace +rm -rf ./build + + diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/__init__.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..165e63725354de429a448d866f665cccca991916 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/__init__.py @@ -0,0 +1,13 @@ +from .functions.deform_conv import deform_conv, modulated_deform_conv +from .functions.deform_pool import deform_roi_pooling +from .modules.deform_conv import (DeformConv, ModulatedDeformConv, + DeformConvPack, ModulatedDeformConvPack) +from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, + ModulatedDeformRoIPoolingPack) + +__all__ = [ + 'DeformConv', 'DeformConvPack', 'ModulatedDeformConv', + 'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', + 'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv', + 'deform_roi_pooling' +] diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/__init__.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/deform_conv.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/deform_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..6af75a758b8448ca1d981054525259f536d99d1e --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/deform_conv.py @@ -0,0 +1,181 @@ +import torch +from torch.autograd import Function +from torch.nn.modules.utils import _pair + +from .. 
import deform_conv_cuda + + +class DeformConvFunction(Function): + + @staticmethod + def forward(ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format( + input.dim())) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConvFunction._output_size(input, weight, ctx.padding, + ctx.dilation, ctx.stride)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + deform_conv_cuda.deform_conv_forward_cuda( + input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], + weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + return output + + @staticmethod + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + deform_conv_cuda.deform_conv_backward_input_cuda( + input, offset, grad_output, grad_input, + grad_offset, weight, ctx.bufs_[0], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + deform_conv_cuda.deform_conv_backward_parameters_cuda( + input, offset, grad_output, + grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, + cur_im2col_step) + + return (grad_input, grad_offset, grad_weight, None, None, None, None, + None) + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + 'x'.join(map(str, output_size)))) + return output_size + + +class ModulatedDeformConvFunction(Function): + + @staticmethod + def forward(ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias 
is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if weight.requires_grad or mask.requires_grad or offset.requires_grad \ + or input.requires_grad: + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + deform_conv_cuda.modulated_deform_conv_cuda_forward( + input, weight, bias, ctx._bufs[0], offset, mask, output, + ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + return output + + @staticmethod + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + deform_conv_cuda.modulated_deform_conv_cuda_backward( + input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], + grad_input, grad_weight, grad_bias, grad_offset, grad_mask, + grad_output, weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = (height + 2 * ctx.padding - + (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 + width_out = (width + 2 * ctx.padding - + (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = DeformConvFunction.apply +modulated_deform_conv = ModulatedDeformConvFunction.apply diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/deform_pool.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/deform_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..65ff0efb5737e87ccf49387b2d24abcbeedd6497 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/functions/deform_pool.py @@ -0,0 +1,69 @@ +import torch +from torch.autograd import Function + +from .. 
import deform_pool_cuda + + +class DeformRoIPoolingFunction(Function): + + @staticmethod + def forward(ctx, + data, + rois, + offset, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.out_size = out_size + ctx.out_channels = out_channels + ctx.no_trans = no_trans + ctx.group_size = group_size + ctx.part_size = out_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + assert 0.0 <= ctx.trans_std <= 1.0 + if not data.is_cuda: + raise NotImplementedError + + n = rois.shape[0] + output = data.new_empty(n, out_channels, out_size, out_size) + output_count = data.new_empty(n, out_channels, out_size, out_size) + deform_pool_cuda.deform_psroi_pooling_cuda_forward( + data, rois, offset, output, output_count, ctx.no_trans, + ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, + ctx.part_size, ctx.sample_per_part, ctx.trans_std) + + if data.requires_grad or rois.requires_grad or offset.requires_grad: + ctx.save_for_backward(data, rois, offset) + ctx.output_count = output_count + + return output + + @staticmethod + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + + data, rois, offset = ctx.saved_tensors + output_count = ctx.output_count + grad_input = torch.zeros_like(data) + grad_rois = None + grad_offset = torch.zeros_like(offset) + + deform_pool_cuda.deform_psroi_pooling_cuda_backward( + grad_output, data, rois, offset, output_count, grad_input, + grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, + ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, + ctx.trans_std) + return (grad_input, grad_rois, grad_offset, None, None, None, None, + None, None, None, None) + + +deform_roi_pooling = DeformRoIPoolingFunction.apply diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/__init__.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/deform_conv.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/deform_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..50d15d1513f0ebc145982e04958f76a5f1ca1343 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/deform_conv.py @@ -0,0 +1,157 @@ +import math + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair + +from ..functions.deform_conv import deform_conv, modulated_deform_conv + + +class DeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False): + super(DeformConv, self).__init__() + + assert not bias + assert in_channels % groups == 0, \ + 'in_channels {} cannot be divisible by groups {}'.format( + in_channels, groups) + assert out_channels % groups == 0, \ + 'out_channels {} cannot be divisible by groups {}'.format( + out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + + self.weight = 
nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + + def forward(self, x, offset): + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class DeformConvPack(DeformConv): + + def __init__(self, *args, **kwargs): + super(DeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deformable_groups * 2 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + offset = self.conv_offset(x) + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class ModulatedDeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True): + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, + *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, x, offset, mask): + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) + + +class ModulatedDeformConvPack(ModulatedDeformConv): + + def __init__(self, *args, **kwargs): + super(ModulatedDeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + self.deformable_groups * 3 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, x): + out = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/deform_pool.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/deform_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..5e0196753ee1b427263bc397e0ae842af6a9938b --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/modules/deform_pool.py @@ -0,0 +1,172 @@ +from torch import nn + +from ..functions.deform_pool import deform_roi_pooling + + +class DeformRoIPooling(nn.Module): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DeformRoIPooling, self).__init__() + self.spatial_scale = spatial_scale + self.out_size = out_size + self.out_channels = out_channels + self.no_trans = no_trans + self.group_size = group_size + self.part_size = out_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, data, rois, offset): + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + + +class DeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + deform_fc_channels=1024): + super(DeformRoIPoolingPack, + self).__init__(spatial_scale, out_size, out_channels, no_trans, + group_size, part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size * 2 + seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = 
data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + + +class ModulatedDeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + num_mask_fcs=2, + deform_fc_channels=1024): + super(ModulatedDeformRoIPoolingPack, self).__init__( + spatial_scale, out_size, out_channels, no_trans, group_size, + part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.num_mask_fcs = num_mask_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + offset_fc_seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size * 2 + offset_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + offset_fc_seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*offset_fc_seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + mask_fc_seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_mask_fcs): + if i < self.num_mask_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size + mask_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_mask_fcs - 1: + mask_fc_seq.append(nn.ReLU(inplace=True)) + else: + mask_fc_seq.append(nn.Sigmoid()) + self.mask_fc = nn.Sequential(*mask_fc_seq) + self.mask_fc[-2].weight.data.zero_() + self.mask_fc[-2].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + mask = self.mask_fc(x.view(n, -1)) + mask = mask.view(n, 1, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) * mask diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/setup.py b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..5a9a0ecb742599cbeaa7ccc753418087704e1cfc --- 
/dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/setup.py @@ -0,0 +1,19 @@ +import os +PATH ="{}:{}".format(os.environ['PATH'], "/opt/cuda/bin") +# os.environ['CUDA_VISIBLE_DEVICES'] = "1" +os.environ['PATH'] = PATH +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='deform_conv', + ext_modules=[ + CUDAExtension('deform_conv_cuda', [ + 'src/deform_conv_cuda.cpp', + 'src/deform_conv_cuda_kernel.cu', + ]), + CUDAExtension('deform_pool_cuda', [ + 'src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu' + ]), + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_conv_cuda.cpp b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_conv_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e45155b94442f228760db21536f61948d7f1056e --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_conv_cuda.cpp @@ -0,0 +1,695 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c + +#include + +#include +#include + +void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor data_col); + +void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const int channels, const int height, + const int width, const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const at::Tensor data_mask, + const int 
batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, + at::Tensor weight, int kH, int kW, int dH, int dW, int padH, + int padW, int dilationH, int dilationW, int group, + int deformable_group) { + TORCH_CHECK(weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, + kW); + + TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, + kW, weight.size(2), weight.size(3)); + + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + TORCH_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + TORCH_CHECK(nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + TORCH_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + TORCH_CHECK((gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + at::Tensor output_buffer = + at::zeros({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + 
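+    // im2col with the learned offsets fills the columns buffer; a per-group GEMM (addmm_) then accumulates the result into output_buffer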
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int deformable_group, int im2col_step) { + shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, 
im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, im2col_step, deformable_group, + gradOffset[elt]); + + deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == 
batchSize), "invalid batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || 
kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * 
ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, + "deform forward (CUDA)"); + m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda, + "deform_conv_backward_input (CUDA)"); + m.def("deform_conv_backward_parameters_cuda", + &deform_conv_backward_parameters_cuda, + "deform_conv_backward_parameters (CUDA)"); + m.def("modulated_deform_conv_cuda_forward", + &modulated_deform_conv_cuda_forward, + "modulated deform conv forward (CUDA)"); 
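+  // Each forward binding in this module is paired with its backward counterpart;
+  // a Python-side torch.autograd.Function wrapper (assumed to live elsewhere in the
+  // dcn package, not in this file) is expected to load this extension and dispatch
+  // to these entry points.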
+ m.def("modulated_deform_conv_cuda_backward", + &modulated_deform_conv_cuda_backward, + "modulated deform conv backward (CUDA)"); +} diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_conv_cuda_kernel.cu b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_conv_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..48c6d8825387ce4b248f07f77f5eeb65ab9bcb49 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_conv_cuda_kernel.cu @@ -0,0 +1,866 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) +{ + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +template +__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += 
(argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const scalar_t map_h = i * dilation_h + offset_h; + //const scalar_t map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +void deformable_im2col( + const at::Tensor data_im, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const 
int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *data_col_ = data_col.data(); + + deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, deformable_group, + height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const scalar_t *data_col, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = 
get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +void deformable_col2im( + const at::Tensor data_col, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im) +{ + + // todo: make sure parallel_imgs is passed in correctly + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_im_ = grad_im.data(); + + deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, + const scalar_t *data_im, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, scalar_t *grad_offset) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos 
% width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) +{ + + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; + int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_offset_ = grad_offset.data(); + + deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +template +__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty 
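+    // the sampling point falls outside the feature map, so it received no bilinear
+    // contribution in the forward pass and propagates no gradient here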
+ return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t *data_col_ptr = data_col + 
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + 
h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_im, + const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_offset, scalar_t *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * 
(i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *data_col_ = data_col.data(); + + modulated_deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + // printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor grad_im) +{ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = 
data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_im_ = grad_im.data(); + + modulated_deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, at::Tensor grad_mask) +{ + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_offset_ = grad_offset.data(); + scalar_t *grad_mask_ = grad_mask.data(); + + modulated_deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset_, grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_pool_cuda.cpp b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_pool_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e19cf42aee6149a52d45c54f09dcb9afdc9dbe92 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_pool_cuda.cpp @@ -0,0 +1,87 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c + +// based on +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +#include + +#include +#include + +void DeformablePSROIPoolForward( + const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, + at::Tensor out, at::Tensor top_count, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void DeformablePSROIPoolBackwardAcc( + 
const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, + const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, + at::Tensor trans_grad, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out.size(0), num_bbox); + + DeformablePSROIPoolForward( + input, bbox, trans, out, top_count, batch, channels, height, width, + num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std); +} + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part, const float trans_std) { + TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out_grad.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out_grad.size(0), num_bbox); + + DeformablePSROIPoolBackwardAcc( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, + channels, height, width, num_bbox, channels_trans, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, + "deform psroi pooling forward(CUDA)"); + m.def("deform_psroi_pooling_cuda_backward", + &deform_psroi_pooling_cuda_backward, + "deform psroi pooling backward(CUDA)"); +} diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_pool_cuda_kernel.cu b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_pool_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e49446005679c0d8d7b7bd6fb84250325c37828f --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/assets/dcn/src/deform_pool_cuda_kernel.cu @@ -0,0 +1,364 @@ +/*! 
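+ * \note Deformable PSROI pooling: each output bin averages a sample_per_part x
+ *       sample_per_part grid of bilinearly interpolated samples whose positions are
+ *       shifted by the learned per-part offsets in trans (scaled by trans_std);
+ *       samples that fall outside the feature map are skipped.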
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ scalar_t bilinear_interp( + const scalar_t *data, + const scalar_t x, + const scalar_t y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + scalar_t dist_x = (scalar_t)(x - x1); + scalar_t dist_y = (scalar_t)(y - y1); + scalar_t value11 = data[y1 * width + x1]; + scalar_t value12 = data[y2 * width + x1]; + scalar_t value21 = data[y1 * width + x2]; + scalar_t value22 = data[y2 * width + x2]; + scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const scalar_t *bottom_data, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const scalar_t *bottom_rois, const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + scalar_t *top_data, + scalar_t *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + scalar_t sum = 0; + int count = 0; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const scalar_t *top_diff, + const scalar_t *top_count, + const int num_rois, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, + const scalar_t *bottom_data, + const scalar_t *bottom_rois, + const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + scalar_t diff_val = top_diff[index] / top_count[index]; + const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + scalar_t dist_x = w - x0, dist_y = h - y0; + scalar_t q00 = (1 - dist_x) * (1 - dist_y); + scalar_t q01 = (1 - dist_x) * dist_y; + scalar_t q10 = dist_x * (1 - dist_y); + scalar_t q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + 
+                      part_h) * part_size + part_w, diff_x);
+        atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
+      }
+    }
+  }
+}
+
+void DeformablePSROIPoolForward(const at::Tensor data,
+                                const at::Tensor bbox,
+                                const at::Tensor trans,
+                                at::Tensor out,
+                                at::Tensor top_count,
+                                const int batch,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int num_bbox,
+                                const int channels_trans,
+                                const int no_trans,
+                                const float spatial_scale,
+                                const int output_dim,
+                                const int group_size,
+                                const int pooled_size,
+                                const int part_size,
+                                const int sample_per_part,
+                                const float trans_std)
+{
+  const int pooled_height = pooled_size;
+  const int pooled_width = pooled_size;
+  const int count = num_bbox * output_dim * pooled_height * pooled_width;
+  const int num_classes = no_trans ? 1 : channels_trans / 2;
+  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data.type(), "deformable_psroi_pool_forward", ([&] {
+        const scalar_t *bottom_data = data.data<scalar_t>();
+        const scalar_t *bottom_rois = bbox.data<scalar_t>();
+        const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
+        scalar_t *top_data = out.data<scalar_t>();
+        scalar_t *top_count_data = top_count.data<scalar_t>();
+
+        DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
+            count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
+            bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim,
+            group_size, part_size, num_classes, channels_each_class, top_data, top_count_data);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
+  }
+}
+
+void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
+                                    const at::Tensor data,
+                                    const at::Tensor bbox,
+                                    const at::Tensor trans,
+                                    const at::Tensor top_count,
+                                    at::Tensor in_grad,
+                                    at::Tensor trans_grad,
+                                    const int batch,
+                                    const int channels,
+                                    const int height,
+                                    const int width,
+                                    const int num_bbox,
+                                    const int channels_trans,
+                                    const int no_trans,
+                                    const float spatial_scale,
+                                    const int output_dim,
+                                    const int group_size,
+                                    const int pooled_size,
+                                    const int part_size,
+                                    const int sample_per_part,
+                                    const float trans_std)
+{
+  // LOG(INFO) << "DeformablePSROIPoolBackward";
+  const int num_rois = num_bbox;
+  const int pooled_height = pooled_size;
+  const int pooled_width = pooled_size;
+  const int count = num_bbox * output_dim * pooled_height * pooled_width;
+  const int num_classes = no_trans ? 1 : channels_trans / 2;
+  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] {
+        const scalar_t *top_diff = out_grad.data<scalar_t>();
+        const scalar_t *bottom_data = data.data<scalar_t>();
+        const scalar_t *bottom_rois = bbox.data<scalar_t>();
+        const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
+        scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
+        scalar_t *bottom_trans_diff = no_trans ?
NULL : trans_grad.data(); + const scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolBackwardAccKernel<<>>( + count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/resnet.py b/IndicPhotoOCR/detection/textbpn/network/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..bab4346d6115ace46a085496751291864a576bea --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/resnet.py @@ -0,0 +1,336 @@ +import torch.nn as nn +import math +import torch.utils.model_zoo as model_zoo +BatchNorm2d = nn.BatchNorm2d + +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + + +def constant_init(module, constant, bias=0): + nn.init.constant_(module.weight, constant) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None): + super(BasicBlock, self).__init__() + self.with_dcn = dcn is not None + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = dcn.get('fallback_on_stride', False) + self.with_modulated_dcn = dcn.get('modulated', False) + # self.conv2 = conv3x3(planes, planes) + if not self.with_dcn or fallback_on_stride: + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + padding=1, bias=False) + else: + deformable_groups = dcn.get('deformable_groups', 1) + if not self.with_modulated_dcn: + from network.backbone.assets.dcn import DeformConv + conv_op = DeformConv + offset_channels = 18 + else: + from network.backbone.assets.dcn import ModulatedDeformConv + conv_op = ModulatedDeformConv + offset_channels = 27 + self.conv2_offset = nn.Conv2d( + planes, + deformable_groups * offset_channels, + kernel_size=3, + padding=1) + self.conv2 = conv_op( + planes, + planes, + kernel_size=3, + padding=1, + deformable_groups=deformable_groups, + bias=False) + self.bn2 = BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # out = self.conv2(out) + if not self.with_dcn: + out = self.conv2(out) + elif self.with_modulated_dcn: + offset_mask = self.conv2_offset(out) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, 
-9:, :, :].sigmoid() + out = self.conv2(out, offset, mask) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None): + super(Bottleneck, self).__init__() + self.with_dcn = dcn is not None + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = dcn.get('fallback_on_stride', False) + self.with_modulated_dcn = dcn.get('modulated', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=stride, padding=1, bias=False) + else: + deformable_groups = dcn.get('deformable_groups', 1) + if not self.with_modulated_dcn: + from network.backbone.assets.dcn import DeformConv + conv_op = DeformConv + offset_channels = 18 + else: + from network.backbone.assets.dcn import ModulatedDeformConv + conv_op = ModulatedDeformConv + offset_channels = 27 + self.conv2_offset = nn.Conv2d( + planes, deformable_groups * offset_channels, + kernel_size=3, + padding=1) + self.conv2 = conv_op( + planes, planes, kernel_size=3, padding=1, stride=stride, + deformable_groups=deformable_groups, bias=False) + self.bn2 = BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dcn = dcn + self.with_dcn = dcn is not None + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # out = self.conv2(out) + if not self.with_dcn: + out = self.conv2(out) + elif self.with_modulated_dcn: + offset_mask = self.conv2_offset(out) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, -9:, :, :].sigmoid() + out = self.conv2(out, offset, mask) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000, + dcn=None, stage_with_dcn=(False, False, False, False)): + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer( + block, 128, layers[1], stride=2, dcn=dcn) + self.layer3 = self._make_layer( + block, 256, layers[2], stride=2, dcn=dcn) + self.layer4 = self._make_layer( + block, 512, layers[3], stride=2, dcn=dcn) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + self.smooth = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) or isinstance(m, BasicBlock): + if hasattr(m, 'conv2_offset'): + constant_init(m.conv2_offset, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dcn=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, + stride, downsample, dcn=dcn)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dcn=dcn)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x1 = self.maxpool(x) + + x2 = self.layer1(x1) + x3 = self.layer2(x2) + x4 = self.layer3(x3) + x5 = self.layer4(x4) + + return x1, x2, x3, x4, x5 + + +def resnet18(pretrained=True, **kwargs): + """Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet18']), strict=False) + return model + +def deformable_resnet18(pretrained=True, **kwargs): + """Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [2, 2, 2, 2], + dcn=dict(modulated=True, + deformable_groups=1, + fallback_on_stride=False), + stage_with_dcn=[False, True, True, True], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet18']), strict=False) + return model + + +def resnet34(pretrained=True, **kwargs): + """Constructs a ResNet-34 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet34']), strict=False) + return model + + +def resnet50(pretrained=True, **kwargs): + """Constructs a ResNet-50 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet50']), strict=False) + return model + + +def deformable_resnet50(pretrained=True, **kwargs): + """Constructs a ResNet-50 model with deformable conv. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], + dcn=dict(modulated=True, + deformable_groups=1, + fallback_on_stride=False), + stage_with_dcn=[False, True, True, True], + **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet50']), strict=False) + return model + + +def resnet101(pretrained=True, **kwargs): + """Constructs a ResNet-101 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet101']), strict=False) + return model + + +def resnet152(pretrained=True, **kwargs): + """Constructs a ResNet-152 model. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url( + model_urls['resnet152']), strict=False) + return model diff --git a/IndicPhotoOCR/detection/textbpn/network/backbone/vgg.py b/IndicPhotoOCR/detection/textbpn/network/backbone/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..0932835b7a213614d826dcb832c7adf9d89f07d5 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/backbone/vgg.py @@ -0,0 +1,60 @@ +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import torchvision.models as models + +model_urls = { + 'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth', + 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', + 'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth', + 'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth', + 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', + 'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth', +} + + +class VggNet(nn.Module): + def __init__(self, name="vgg16", pretrain=True): + super().__init__() + if name == "vgg16": + base_net = models.vgg16(pretrained=False) + elif name == "vgg16_bn": + base_net = models.vgg16_bn(pretrained=False) + else: + print(" base model is not support !") + if pretrain: + print("load the {} weight from ./cache".format(name)) + base_net.load_state_dict(model_zoo.load_url(model_urls[name], model_dir="./cache")) + + if name == "vgg16": + self.stage1 = nn.Sequential(*[base_net.features[layer] for layer in range(0, 5)]) + self.stage2 = nn.Sequential(*[base_net.features[layer] for layer in range(5, 10)]) + self.stage3 = nn.Sequential(*[base_net.features[layer] for layer in range(10, 17)]) + self.stage4 = nn.Sequential(*[base_net.features[layer] for layer in range(17, 24)]) + self.stage5 = nn.Sequential(*[base_net.features[layer] for layer in range(24, 31)]) + elif name == "vgg16_bn": + self.stage1 = nn.Sequential(*[base_net.features[layer] for layer in range(0, 7)]) + self.stage2 = nn.Sequential(*[base_net.features[layer] for layer in range(7, 14)]) + self.stage3 = nn.Sequential(*[base_net.features[layer] for layer in range(14, 24)]) + self.stage4 = nn.Sequential(*[base_net.features[layer] for layer in range(24, 34)]) + self.stage5 = nn.Sequential(*[base_net.features[layer] for layer in range(34, 44)]) + + def forward(self, x): + C1 = self.stage1(x) + C2 = self.stage2(C1) + C3 = self.stage3(C2) + C4 = self.stage4(C3) + C5 = self.stage5(C4) + + return C1, C2, C3, C4, C5 + + +if __name__ == '__main__': + import torch + input = torch.randn((4, 3, 512, 512)) + net = VggNet() + C1, C2, C3, C4, C5 = net(input) + print(C1.size()) + print(C2.size()) + print(C3.size()) + print(C4.size()) + print(C5.size()) diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/Adaptive_Deformation.py b/IndicPhotoOCR/detection/textbpn/network/layers/Adaptive_Deformation.py new file mode 100644 index 0000000000000000000000000000000000000000..89557292a4cec6733f658d397fbe213b864e5685 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/Adaptive_Deformation.py @@ -0,0 +1,88 @@ +################################################################### +# File Name: AdaptiveDeformation.py +# Author: S.X.Zhang +################################################################### + +from __future__ import print_function +from __future__ import division +from 
__future__ import absolute_import + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init + + +class MeanAggregator(nn.Module): + def __init__(self): + super(MeanAggregator, self).__init__() + + def forward(self, features, A): + x = torch.bmm(A, features) + return x + + +class GraphConv(nn.Module): + def __init__(self, in_dim, out_dim, agg): + super(GraphConv, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.weight = nn.Parameter(torch.FloatTensor(in_dim * 2, out_dim)) + self.bias = nn.Parameter(torch.FloatTensor(out_dim)) + init.xavier_uniform_(self.weight) + init.constant_(self.bias, 0) + self.agg = agg() + + def forward(self, features, A): + b, n, d = features.shape + assert (d == self.in_dim) + agg_feats = self.agg(features, A) + cat_feats = torch.cat([features, agg_feats], dim=2) + out = torch.einsum('bnd,df->bnf', (cat_feats, self.weight)) + out = F.relu(out + self.bias) + return out + + +class AdaptiveDeformation(nn.Module): + def __init__(self, input, state_dim): + super(AdaptiveDeformation, self).__init__() + self.bn0 = nn.BatchNorm1d(input, affine=False) + self.conv1 = nn.Conv1d(input, state_dim, 1) + self.rnn = nn.LSTM(input, state_dim, 1, bidirectional=True) + self.gconv1 = GraphConv(input, 256, MeanAggregator) + self.gconv2 = GraphConv(256, 1024, MeanAggregator) + self.gconv3 = GraphConv(1024, 512, MeanAggregator) + self.gconv4 = GraphConv(512, state_dim, MeanAggregator) + + self.prediction = nn.Sequential( + nn.Conv1d(4*state_dim, 128, 1), + nn.ReLU(inplace=True), + nn.Dropout(0.1), + nn.Conv1d(128, 64, 1), + nn.ReLU(inplace=True), + nn.Dropout(0.1), + nn.Conv1d(64, 2, 1)) + + def forward(self, x, A): + x = self.bn0(x) + + # # rnn block + yl = x.permute(2, 0, 1) + yl, _ = self.rnn(yl) + yl = yl.permute(1, 2, 0) + + # # gcn block + yg = x.permute(0, 2, 1) + b, n, c = yg.shape + A = A.expand(b, n, n) + yg = self.gconv1(yg, A) + yg = self.gconv2(yg, A) + yg = self.gconv3(yg, A) + yg = self.gconv4(yg, A) + yg = yg.permute(0, 2, 1) + + # res block + x = torch.cat([yl, yg, self.conv1(x)], dim=1) + pred = self.prediction(x) + + return pred diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/CircConv.py b/IndicPhotoOCR/detection/textbpn/network/layers/CircConv.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4d24d097ad1b9ceb5f92503eef4094e235f0e4 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/CircConv.py @@ -0,0 +1,91 @@ +import torch.nn as nn +import torch + + +class CircConv(nn.Module): + def __init__(self, state_dim, out_state_dim=None, n_adj=4): + super(CircConv, self).__init__() + + self.n_adj = n_adj + out_state_dim = state_dim if out_state_dim is None else out_state_dim + self.fc = nn.Conv1d(state_dim, out_state_dim, kernel_size=self.n_adj*2+1) + + def forward(self, input, adj): + input = torch.cat([input[..., -self.n_adj:], input, input[..., :self.n_adj]], dim=2) + return self.fc(input) + + +class DilatedCircConv(nn.Module): + def __init__(self, state_dim, out_state_dim=None, n_adj=4, dilation=1): + super(DilatedCircConv, self).__init__() + + self.n_adj = n_adj + self.dilation = dilation + out_state_dim = state_dim if out_state_dim is None else out_state_dim + self.fc = nn.Conv1d(state_dim, out_state_dim, kernel_size=self.n_adj*2+1, dilation=self.dilation) + + def forward(self, input, adj): + if self.n_adj != 0: + input = torch.cat([input[..., -self.n_adj*self.dilation:], input, input[..., :self.n_adj*self.dilation]], dim=2) + return self.fc(input) + + 
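+
+# Note on the circular padding above (reviewer sketch of what the code does, not part of
+# the upstream logic): CircConv and DilatedCircConv take a (B, C, N) tensor whose last
+# dimension is treated as a closed sequence, presumably the N contour control points of a
+# text boundary. Before the Conv1d, the first and last n_adj (times dilation) points are
+# wrapped around and concatenated, so a kernel of size 2*n_adj+1 sees both neighbours of
+# every point on the closed polygon and the output keeps length N. For example, with the
+# default n_adj=4, dilation=1 and N=20 points, the padded length is 28 and a kernel of
+# size 9 maps it back to 20.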
+_conv_factory = { + 'grid': CircConv, + 'dgrid': DilatedCircConv +} + + +class BasicBlock(nn.Module): + def __init__(self, state_dim, out_state_dim, conv_type, n_adj=4, dilation=1): + super(BasicBlock, self).__init__() + + self.conv = _conv_factory[conv_type](state_dim, out_state_dim, n_adj, dilation) + self.relu = nn.ReLU(inplace=True) + self.norm = nn.BatchNorm1d(out_state_dim) + + def forward(self, x, adj=None): + x = self.conv(x, adj) + x = self.relu(x) + x = self.norm(x) + return x + + +class DeepSnake(nn.Module): + def __init__(self, state_dim, feature_dim, conv_type='dgrid'): + super(DeepSnake, self).__init__() + + self.head = BasicBlock(feature_dim, state_dim, conv_type) + + self.res_layer_num = 7 + dilation = [1, 1, 1, 2, 2, 4, 4] + for i in range(self.res_layer_num): + conv = BasicBlock(state_dim, state_dim, conv_type, n_adj=4, dilation=dilation[i]) + self.__setattr__('res'+str(i), conv) + + fusion_state_dim = 256 + self.fusion = nn.Conv1d(state_dim * (self.res_layer_num + 1), fusion_state_dim, 1) + self.prediction = nn.Sequential( + nn.Conv1d(state_dim * (self.res_layer_num + 1) + fusion_state_dim, 256, 1), + nn.ReLU(inplace=True), + nn.Conv1d(256, 64, 1), + nn.ReLU(inplace=True), + nn.Conv1d(64, 2, 1) + ) + + def forward(self, x, adj): + states = [] + + x = self.head(x, adj) + states.append(x) + for i in range(self.res_layer_num): + x = self.__getattr__('res'+str(i))(x, adj) + x + states.append(x) + + state = torch.cat(states, dim=1) + global_state = torch.max(self.fusion(state), dim=2, keepdim=True)[0] + global_state = global_state.expand(global_state.size(0), global_state.size(1), state.size(2)) + state = torch.cat([global_state, state], dim=1) + x = self.prediction(state) + + return x diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/GCN.py b/IndicPhotoOCR/detection/textbpn/network/layers/GCN.py new file mode 100644 index 0000000000000000000000000000000000000000..f2a5aa045b46e04c7393a1900b0e652de15de41d --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/GCN.py @@ -0,0 +1,77 @@ +################################################################### +# File Name: GCN.py +# Author: S.X.Zhang +################################################################### + +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init + + +class MeanAggregator(nn.Module): + def __init__(self): + super(MeanAggregator, self).__init__() + + def forward(self, features, A): + x = torch.bmm(A, features) + return x + + +class GraphConv(nn.Module): + def __init__(self, in_dim, out_dim, agg): + super(GraphConv, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.weight = nn.Parameter(torch.FloatTensor(in_dim * 2, out_dim)) + self.bias = nn.Parameter(torch.FloatTensor(out_dim)) + init.xavier_uniform_(self.weight) + init.constant_(self.bias, 0) + self.agg = agg() + + def forward(self, features, A): + b, n, d = features.shape + assert (d == self.in_dim) + agg_feats = self.agg(features, A) + cat_feats = torch.cat([features, agg_feats], dim=2) + out = torch.einsum('bnd,df->bnf', (cat_feats, self.weight)) + out = F.relu(out + self.bias) + return out + + +class GCN(nn.Module): + def __init__(self, in_dim, out_dim): + super(GCN, self).__init__() + self.bn0 = nn.BatchNorm1d(in_dim, affine=False) + + self.conv1 = GraphConv(in_dim, 256, MeanAggregator) + self.conv2 = GraphConv(256, 1024, MeanAggregator) + self.conv3 = 
GraphConv(1024, 512, MeanAggregator) + self.conv4 = GraphConv(512, out_dim, MeanAggregator) + + self.prediction = nn.Sequential( + nn.Conv1d(out_dim, 128, 1), + nn.ReLU(inplace=True), + nn.Conv1d(128, 64, 1), + nn.ReLU(inplace=True), + nn.Conv1d(64, 2, 1)) + + def forward(self, x, A): + x = self.bn0(x) + x = x.permute(0, 2, 1) + b, n, c = x.shape + A = A.expand(b, n, n) + + x = self.conv1(x, A) + x = self.conv2(x, A) + x = self.conv3(x, A) + x = self.conv4(x, A) + + x = x.permute(0, 2, 1) + pred = self.prediction(x) + + return pred diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/GraphConv.py b/IndicPhotoOCR/detection/textbpn/network/layers/GraphConv.py new file mode 100644 index 0000000000000000000000000000000000000000..5dd9f87320fbb5dbf14af519aff143b6c83a77e1 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/GraphConv.py @@ -0,0 +1,45 @@ +import math + +import torch +from torch.nn.parameter import Parameter +from torch.nn.modules.module import Module +from torch.nn import init + + +class GraphConvolution(Module): + """ + Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 + """ + + def __init__(self, in_features, out_features, bias=True): + super(GraphConvolution, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = Parameter(torch.FloatTensor(in_features, out_features)) + init.xavier_uniform_(self.weight) + if bias: + self.bias = Parameter(torch.FloatTensor(out_features)) + init.constant_(self.bias, 0) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + stdv = 1. / math.sqrt(self.weight.size(1)) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + + def forward(self, input, adj): + support = torch.mm(input, self.weight) + output = torch.spmm(adj, support) + if self.bias is not None: + return output + self.bias + else: + return output + + def __repr__(self): + return self.__class__.__name__ + ' (' \ + + str(self.in_features) + ' -> ' \ + + str(self.out_features) + ')' diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/RNN.py b/IndicPhotoOCR/detection/textbpn/network/layers/RNN.py new file mode 100644 index 0000000000000000000000000000000000000000..8cafd28a0c920fe2bd799ec43868c73fc4e93c25 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/RNN.py @@ -0,0 +1,35 @@ +################################################################### +# File Name: RNN.py +# Author: S.X.Zhang +################################################################### + +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init + + +class RNN(nn.Module): + def __init__(self, input, state_dim): + super(RNN, self).__init__() + self.bn0 = nn.BatchNorm1d(input, affine=False) + self.rnn = nn.LSTM(input, state_dim, 1, dropout=0.1, bidirectional=True) + self.prediction = nn.Sequential( + nn.Conv1d(state_dim*2, 128, 1), + nn.ReLU(inplace=True), + nn.Conv1d(128, 64, 1), + nn.ReLU(inplace=True), + nn.Conv1d(64, 2, 1)) + + def forward(self, x, adj): + x = self.bn0(x) + x = x.permute(2, 0, 1) + x, _ = self.rnn(x) + x = x.permute(1, 2, 0) + pred = self.prediction(x) + + return pred diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/Transformer.py b/IndicPhotoOCR/detection/textbpn/network/layers/Transformer.py new file mode 100644 index 
0000000000000000000000000000000000000000..9890b316bbea13537c8d3a839328028589225b41 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/Transformer.py @@ -0,0 +1,140 @@ +################################################################### +# File Name: GCN.py +# Author: S.X.Zhang +################################################################### +import torch +from torch import nn, Tensor +import numpy as np +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg + + +class Positional_encoding(nn.Module): + def __init__(self, PE_size, n_position=256): + super(Positional_encoding, self).__init__() + self.PE_size = PE_size + self.n_position = n_position + self.register_buffer('pos_table', self.get_encoding_table(n_position, PE_size)) + + def get_encoding_table(self, n_position, PE_size): + position_table = np.array( + [[pos / np.power(10000, 2. * i / self.PE_size) for i in range(self.PE_size)] for pos in range(n_position)]) + position_table[:, 0::2] = np.sin(position_table[:, 0::2]) + position_table[:, 1::2] = np.cos(position_table[:, 1::2]) + return torch.FloatTensor(position_table).unsqueeze(0) + + def forward(self, inputs): + return inputs + self.pos_table[:, :inputs.size(1), :].clone().detach() + + +class MultiHeadAttention(nn.Module): + def __init__(self, num_heads, embed_dim, dropout=0.1, if_resi=True): + super(MultiHeadAttention, self).__init__() + self.layer_norm = nn.LayerNorm(embed_dim) + self.MultiheadAttention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) + self.Q_proj = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.ReLU()) + self.K_proj = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.ReLU()) + self.V_proj = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.ReLU()) + self.if_resi = if_resi + + def forward(self, inputs): + query = self.layer_norm(inputs) + q = self.Q_proj(query) + k = self.K_proj(query) + v = self.V_proj(query) + attn_output, attn_output_weights = self.MultiheadAttention(q, k, v) + if self.if_resi: + attn_output += inputs + else: + attn_output = attn_output + + return attn_output + + +class FeedForward(nn.Module): + def __init__(self, in_channel, FFN_channel, if_resi=True): + super(FeedForward, self).__init__() + """ + 1024 2048 + """ + output_channel = (FFN_channel, in_channel) + self.fc1 = nn.Sequential(nn.Linear(in_channel, output_channel[0]), nn.ReLU()) + self.fc2 = nn.Linear(output_channel[0], output_channel[1]) + self.layer_norm = nn.LayerNorm(in_channel) + self.if_resi = if_resi + + def forward(self, inputs): + outputs = self.layer_norm(inputs) + outputs = self.fc1(outputs) + outputs = self.fc2(outputs) + if self.if_resi: + outputs += inputs + else: + outputs = outputs + return outputs + + +class TransformerLayer(nn.Module): + def __init__(self, out_dim, in_dim, num_heads, attention_size, + dim_feedforward=1024, drop_rate=0.1, if_resi=True, block_nums=3): + super(TransformerLayer, self).__init__() + self.block_nums = block_nums + self.if_resi = if_resi + self.linear = nn.Linear(in_dim, attention_size) + for i in range(self.block_nums): + self.__setattr__('MHA_self_%d' % i, MultiHeadAttention(num_heads, attention_size, + dropout=drop_rate, if_resi=if_resi)) + self.__setattr__('FFN_%d' % i, FeedForward(out_dim, dim_feedforward, if_resi=if_resi)) + + def forward(self, query): + inputs = self.linear(query) + # outputs = inputs + for i in range(self.block_nums): + outputs = self.__getattr__('MHA_self_%d' % i)(inputs) + outputs = self.__getattr__('FFN_%d' % i)(outputs) + if self.if_resi: + inputs = 
inputs+outputs + else: + inputs = outputs + # outputs = inputs + return inputs + + +class Transformer(nn.Module): + + def __init__(self, in_dim, out_dim, num_heads=8, + dim_feedforward=1024, drop_rate=0.1, if_resi=False, block_nums=3): + super().__init__() + + self.bn0 = nn.BatchNorm1d(in_dim, affine=False) + self.conv1 = nn.Conv1d(in_dim, out_dim, 1, dilation=1) + + # self.pos_embedding = Positional_encoding(in_dim) + self.transformer = TransformerLayer(out_dim, in_dim, num_heads, attention_size=out_dim, + dim_feedforward=dim_feedforward, drop_rate=drop_rate, + if_resi=if_resi, block_nums=block_nums) + + self.prediction = nn.Sequential( + nn.Conv1d(2*out_dim, 128, 1), + nn.ReLU(inplace=True), + nn.Dropout(0.1), + nn.Conv1d(128, 64, 1), + nn.ReLU(inplace=True), + # nn.Dropout(0.1), + nn.Conv1d(64, 2, 1)) + + def forward(self, x, adj): + x = self.bn0(x) + + x1 = x.permute(0, 2, 1) + # x1 = self.pos_embedding(x1) + x1 = self.transformer(x1) + x1 = x1.permute(0, 2, 1) + + x = torch.cat([x1, self.conv1(x)], dim=1) + # x = x1+self.conv1(x) + pred = self.prediction(x) + + return pred + + + diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/Transformer_old.py b/IndicPhotoOCR/detection/textbpn/network/layers/Transformer_old.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b7a7e485d7afb4b11048061391f6eb7ed2c274 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/Transformer_old.py @@ -0,0 +1,171 @@ +################################################################### +# File Name: GCN.py +# Author: S.X.Zhang +################################################################### + +import torch +import torch.nn.functional as F +from torch import nn, Tensor +from torch.autograd import Variable +import numpy as np +from cfglib.config import config as cfg + + +class Positional_encoding(nn.Module): + def __init__(self, PE_size, n_position=200): + super(Positional_encoding, self).__init__() + self.PE_size = PE_size + self.n_position = n_position + self.register_buffer('pos_table', self.get_encoding_table(n_position, PE_size)) + + def get_encoding_table(self, n_position, PE_size): + position_table = np.array( + [[pos / np.power(10000, 2. 
* i / self.PE_size) for i in range(self.PE_size)] for pos in range(n_position)]) + position_table[:, 0::2] = np.sin(position_table[:, 0::2]) + position_table[:, 1::2] = np.cos(position_table[:, 1::2]) + return torch.FloatTensor(position_table).unsqueeze(0) + + def forward(self, inputs): + return inputs + self.pos_table[:, :inputs.size(1), :].clone().detach() + + +class MultiHeadAttention(nn.Module): + def __init__(self, num_heads, embedding_size, attention_size, + drop_rate, future_blind=True, query_mask=False, if_resi=True): + super(MultiHeadAttention, self).__init__() + self.num_heads = num_heads + self.embedding_size = embedding_size + self.attention_size = attention_size + self.drop_rate = drop_rate + self.future_blind = future_blind + + self.Q_proj = nn.Sequential(nn.Linear(self.embedding_size, self.attention_size), nn.ReLU()) + self.K_proj = nn.Sequential(nn.Linear(self.embedding_size, self.attention_size), nn.ReLU()) + self.V_proj = nn.Sequential(nn.Linear(self.embedding_size, self.attention_size), nn.ReLU()) + + self.drop_out = nn.Dropout(p=self.drop_rate) + self.layer_norm = nn.LayerNorm(self.attention_size) + self.if_resi = if_resi + + def forward(self, query, key, value): + q = self.Q_proj(query) + k = self.K_proj(key) + v = self.V_proj(value) + + q_ = torch.cat(torch.chunk(q, self.num_heads, dim=2), dim=0) + k_ = torch.cat(torch.chunk(k, self.num_heads, dim=2), dim=0) + v_ = torch.cat(torch.chunk(v, self.num_heads, dim=2), dim=0) + + outputs = torch.bmm(q_, k_.permute(0, 2, 1)) + outputs = outputs / (k_.size()[-1] ** 0.5) + + # key mask + + # future mask + if self.future_blind: + diag_vals = torch.ones_like(outputs[0, :, :]).to(cfg.device) + tril = torch.tril(diag_vals, diagonal=0) + masks = Variable(torch.unsqueeze(tril, 0).repeat(outputs.size()[0], 1, 1)) # (h*N,T_q,T_k) + padding = Variable(torch.ones_like(masks).to(cfg.device) * (-2 ** 32 + 1)) + condition = masks.eq(0) + outputs = torch.where(condition, padding, outputs) + + outputs = F.softmax(outputs, dim=-1) + # if self.future_blind==True:a + # print(outputs[0]) + outputs = self.drop_out(outputs) + + outputs = torch.bmm(outputs, v_) + outputs = torch.cat(torch.chunk(outputs, self.num_heads, dim=0), dim=2) # N,T_q,C + + if self.if_resi: + # outputs += query + outputs += q + else: + outputs = outputs + outputs = self.layer_norm(outputs) + + return outputs + + +class FeedForward(nn.Module): + def __init__(self, in_channel, FFN_channel, if_resi=True): + super(FeedForward, self).__init__() + """ + 1024 2048 + """ + output_channel = (FFN_channel, in_channel) + self.fc1 = nn.Sequential(nn.Linear(in_channel, output_channel[0]), nn.ReLU()) + self.fc2 = nn.Linear(output_channel[0], output_channel[1]) + self.layer_norm = nn.LayerNorm(in_channel) + self.if_resi = if_resi + + def forward(self, inputs): + outputs = self.fc1(inputs) + outputs = self.fc2(outputs) + if self.if_resi: + outputs += inputs + else: + outputs = outputs + outputs = self.layer_norm(outputs) + return outputs + + +class TransformerLayer(nn.Module): + def __init__(self, out_dim, num_heads, embedding_size, attention_size, + dim_feedforward=1024, drop_rate=0.1, if_resi=True, block_nums=3): + super(TransformerLayer, self).__init__() + self.block_nums = block_nums + self.if_resi = if_resi + for i in range(self.block_nums): + self.__setattr__('MHA_self_%d' % i, MultiHeadAttention(num_heads, embedding_size, attention_size, + drop_rate, future_blind=False, if_resi=if_resi)) + self.__setattr__('FFN_%d' % i, FeedForward(out_dim, dim_feedforward, if_resi=if_resi)) + + def 
forward(self, query): + outputs = None + for i in range(self.block_nums): + outputs = self.__getattr__('MHA_self_%d' % i)(query, query, query) + outputs = self.__getattr__('FFN_%d' % i)(outputs) + return outputs + + +class Transformer(nn.Module): + + def __init__(self, in_dim, out_dim, num_heads=8, + dim_feedforward=1024, drop_rate=0.1, if_resi=False, block_nums=3): + super().__init__() + + self.bn0 = nn.BatchNorm1d(in_dim, affine=False) + self.conv1 = nn.Conv1d(in_dim, out_dim, 1, dilation=1) + + embed_dim = in_dim + # self.pos_embedding = Positional_encoding(embed_dim) + self.transformer = TransformerLayer(out_dim, num_heads, embedding_size=embed_dim, + attention_size=out_dim, dim_feedforward=dim_feedforward, + drop_rate=drop_rate, if_resi=if_resi, block_nums=block_nums) + + self.prediction = nn.Sequential( + nn.Conv1d(out_dim*2, 128, 1), + nn.ReLU(inplace=True), + nn.Dropout(0.1), + nn.Conv1d(128, 64, 1), + nn.ReLU(inplace=True), + # nn.Dropout(0.1), + nn.Conv1d(64, 2, 1)) + + def forward(self, x, adj): + x = self.bn0(x) + + x1 = x.permute(0, 2, 1) + x1 = self.transformer(x1) + x1 = x1.permute(0, 2, 1) + + x = torch.cat([x1, self.conv1(x)], dim=1) + # x = x1+self.conv1(x) + pred = self.prediction(x) + + return pred + + + diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/__init__.py b/IndicPhotoOCR/detection/textbpn/network/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/gcn_utils.py b/IndicPhotoOCR/detection/textbpn/network/layers/gcn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6605ee61ae43097f1e6ef8848adbb3cff100829f --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/gcn_utils.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +__author__ = "S.X.Zhang" +import torch +import numpy as np +import cv2 +import torch.nn as nn +from torch.autograd import Variable + + +def normalize_adj(A, type="AD"): + if type == "DAD": + A = A + np.eye(A.shape[0]) # A=A+I + d = np.sum(A, axis=0) + d_inv = np.power(d, -0.5).flatten() + d_inv[np.isinf(d_inv)] = 0.0 + d_inv = np.diag(d_inv) + G = A.dot(d_inv).transpose().dot(d_inv) # L = D^-1/2 A D^-1/2 + G = torch.from_numpy(G) + elif type == "AD": + A = A + np.eye(A.shape[0]) # A=A+I + A = torch.from_numpy(A) + D = A.sum(1, keepdim=True) + G = A.div(D) # L= A/D + else: + A = A + np.eye(A.shape[0]) # A=A+I + D = A.sum(1, keepdim=True) + D = np.diag(D) + G = torch.from_numpy(D - A) # L = D-A + return G + + +def np_to_variable(x, is_cuda=True, dtype=torch.FloatTensor): + v = Variable(torch.from_numpy(x).type(dtype)) + if is_cuda: + v = v.cuda() + return v + + +def set_trainable(model, requires_grad): + for param in model.parameters(): + param.requires_grad = requires_grad + + +def weights_normal_init(model, dev=0.01): + if isinstance(model, list): + for m in model: + weights_normal_init(m, dev) + else: + for m in model.modules(): + if isinstance(m, nn.Conv2d): + m.weight.data.normal_(0.0, dev) + elif isinstance(m, nn.Linear): + m.weight.data.normal_(0.0, dev) + + +def clip_gradient(model, clip_norm): + """Computes a gradient clipping coefficient based on gradient norm.""" + totalnorm = 0 + for p in model.parameters(): + if p.requires_grad: + modulenorm = p.grad.data.norm() + totalnorm += modulenorm ** 2 + totalnorm = np.sqrt(totalnorm) + + norm = clip_norm / max(totalnorm, clip_norm) + for p in model.parameters(): + if p.requires_grad: + p.grad.mul_(norm) + + +def 
EuclideanDistances(A, B): + BT = B.transpose() + vecProd = np.dot(A,BT) + SqA = A**2 + sumSqA = np.matrix(np.sum(SqA, axis=1)) + sumSqAEx = np.tile(sumSqA.transpose(), (1, vecProd.shape[1])) + + SqB = B**2 + sumSqB = np.sum(SqB, axis=1) + sumSqBEx = np.tile(sumSqB, (vecProd.shape[0], 1)) + SqED = sumSqBEx + sumSqAEx - 2*vecProd + SqED[SqED<0]=0.0 + ED = np.sqrt(SqED) + return ED + + +def get_center_feature(cnn_feature, img_poly, ind, h, w): + batch_size = cnn_feature.size(0) + for i in range(batch_size): + poly = img_poly[ind == i].cpu().numpy() + mask = np.zeros((h, w), dtype=np.uint8) + cv2.fillPoly(mask, poly.astype(np.int32), color=(1,)) + return None + + +def get_node_feature(cnn_feature, img_poly, ind, h, w): + img_poly = img_poly.clone().float() + img_poly[..., 0] = img_poly[..., 0] / (w / 2.) - 1 + img_poly[..., 1] = img_poly[..., 1] / (h / 2.) - 1 + + batch_size = cnn_feature.size(0) + gcn_feature = torch.zeros([img_poly.size(0), cnn_feature.size(1), img_poly.size(1)]).to(img_poly.device) + for i in range(batch_size): + poly = img_poly[ind == i].unsqueeze(0) + gcn_feature[ind == i] = torch.nn.functional.grid_sample(cnn_feature[i:i + 1], poly)[0].permute(1, 0, 2) + return gcn_feature + + +def get_adj_mat(n_adj, n_nodes): + a = np.zeros([n_nodes, n_nodes], dtype=np.float) + + for i in range(n_nodes): + for j in range(-n_adj // 2, n_adj // 2 + 1): + if j != 0: + a[i][(i + j) % n_nodes] = 1 + a[(i + j) % n_nodes][i] = 1 + return a + + +def get_adj_ind(n_adj, n_nodes, device): + ind = torch.tensor([i for i in range(-n_adj // 2, n_adj // 2 + 1) if i != 0]).long() + ind = (torch.arange(n_nodes)[:, None] + ind[None]) % n_nodes + return ind.to(device) + + +def coord_embedding(b, w, h, device): + x_range = torch.linspace(0, 1, w, device=device) + y_range = torch.linspace(0, 1, h, device=device) + y, x = torch.meshgrid(y_range, x_range) + y = y.expand([b, 1, -1, -1]) + x = x.expand([b, 1, -1, -1]) + coord_map = torch.cat([x, y], 1) + + return coord_map + + +def img_poly_to_can_poly(img_poly): + if len(img_poly) == 0: + return torch.zeros_like(img_poly) + x_min = torch.min(img_poly[..., 0], dim=-1)[0] + y_min = torch.min(img_poly[..., 1], dim=-1)[0] + can_poly = img_poly.clone() + can_poly[..., 0] = can_poly[..., 0] - x_min[..., None] + can_poly[..., 1] = can_poly[..., 1] - y_min[..., None] + # x_max = torch.max(img_poly[..., 0], dim=-1)[0] + # y_max = torch.max(img_poly[..., 1], dim=-1)[0] + # h, w = y_max - y_min + 1, x_max - x_min + 1 + # long_side = torch.max(h, w) + # can_poly = can_poly / long_side[..., None, None] + return can_poly diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/model_block.py b/IndicPhotoOCR/detection/textbpn/network/layers/model_block.py new file mode 100644 index 0000000000000000000000000000000000000000..0a6b0bf9ba52f72cfa29c8f1ce389d90b5d04422 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/model_block.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- +__author__ = "S.X.Zhang" +import torch +import torch.nn as nn +import torch.nn.functional as F +from IndicPhotoOCR.detection.textbpn.network.layers.vgg import VggNet +from IndicPhotoOCR.detection.textbpn.network.layers.resnet import ResNet +from IndicPhotoOCR.detection.textbpn.network.layers.resnet_dcn import ResNet_DCN +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg + + +class UpBlok(nn.Module): + + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv1x1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + 
self.conv3x3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.deconv = nn.ConvTranspose2d(out_channels, out_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, upsampled, shortcut): + x = torch.cat([upsampled, shortcut], dim=1) + x = self.conv1x1(x) + x = F.relu(x) + x = self.conv3x3(x) + x = F.relu(x) + x = self.deconv(x) + return x + + +class MergeBlok(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv1x1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + self.conv3x3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, upsampled, shortcut): + x = torch.cat([upsampled, shortcut], dim=1) + x = self.conv1x1(x) + x = F.relu(x) + x = self.conv3x3(x) + return x + + +class FPN(nn.Module): + + def __init__(self, backbone='resnet50', is_training=True): + super().__init__() + self.is_training = is_training + self.backbone_name = backbone + + if backbone in ['vgg_bn', 'vgg']: + self.backbone = VggNet(name=backbone, pretrain=is_training) + self.deconv5 = nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1) + self.merge4 = UpBlok(512 + 256, 128) + self.merge3 = UpBlok(256 + 128, 64) + if cfg.scale == 1: + self.merge2 = UpBlok(128 + 64, 32) # FPN 1/2 + self.merge1 = UpBlok(64 + 32, 32) # FPN 1/1 + elif cfg.scale == 2: + self.merge2 = UpBlok(128 + 64, 32) # FPN 1/2 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/2 + elif cfg.scale == 4: + self.merge2 = MergeBlok(128 + 64, 32) # FPN 1/4 + + elif backbone in ['resnet50']: + self.backbone = ResNet(name=backbone, pretrain=is_training) + self.deconv5 = nn.ConvTranspose2d(2048, 256, kernel_size=4, stride=2, padding=1) + self.merge4 = UpBlok(1024 + 256, 128) + self.merge3 = UpBlok(512 + 128, 64) + if cfg.scale == 1: + self.merge2 = UpBlok(256 + 64, 32) # FPN 1/2 + self.merge1 = UpBlok(64 + 32, 32) # FPN 1/1 + elif cfg.scale == 2: + self.merge2 = UpBlok(256 + 64, 32) # FPN 1/2 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/2 + elif cfg.scale == 4: + self.merge2 = MergeBlok(256 + 64, 32) # FPN 1/4 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/4 + + elif backbone in ['resnet18']: + self.backbone = ResNet(name=backbone, pretrain=is_training) + self.deconv5 = nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1) + self.merge4 = UpBlok(256 + 256, 128) + self.merge3 = UpBlok(128 + 128, 64) + if cfg.scale == 1: + self.merge2 = UpBlok(64 + 64, 32) # FPN 1/2 + self.merge1 = UpBlok(64 + 32, 32) # FPN 1/1 + elif cfg.scale == 2: + self.merge2 = UpBlok(64 + 64, 32) # FPN 1/2 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/2 + elif cfg.scale == 4: + self.merge2 = MergeBlok(64 + 64, 32) # FPN 1/4 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/4 + + elif backbone in ["deformable_resnet18"]: + self.backbone = ResNet_DCN(name=backbone, pretrain=is_training) + self.deconv5 = nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1) + self.merge4 = UpBlok(256 + 256, 128) + self.merge3 = UpBlok(128 + 128, 64) + if cfg.scale == 1: + self.merge2 = UpBlok(64 + 64, 32) # FPN 1/2 + self.merge1 = UpBlok(64 + 32, 32) # FPN 1/1 + elif cfg.scale == 2: + self.merge2 = UpBlok(64 + 64, 32) # FPN 1/2 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/2 + elif cfg.scale == 4: + self.merge2 = MergeBlok(64 + 64, 32) # FPN 1/4 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/4 + + elif backbone in ["deformable_resnet50"]: + self.backbone = ResNet_DCN(name=backbone, pretrain=is_training) + self.deconv5 = 
nn.ConvTranspose2d(2048, 256, kernel_size=4, stride=2, padding=1) + self.merge4 = UpBlok(1024 + 256, 128) + self.merge3 = UpBlok(512 + 128, 64) + if cfg.scale == 1: + self.merge2 = UpBlok(256 + 64, 32) # FPN 1/2 + self.merge1 = UpBlok(64 + 32, 32) # FPN 1/1 + elif cfg.scale == 2: + self.merge2 = UpBlok(256 + 64, 32) # FPN 1/2 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/2 + elif cfg.scale == 4: + self.merge2 = MergeBlok(256 + 64, 32) # FPN 1/4 + self.merge1 = MergeBlok(64 + 32, 32) # FPN 1/4 + else: + print("backbone is not support !") + + def forward(self, x): + C1, C2, C3, C4, C5 = self.backbone(x) + #print(C5.size()) + #print(C4.size()) + #print(C3.size()) + #print(C2.size()) + #print(C1.size()) + up5 = self.deconv5(C5) + up5 = F.relu(up5) + + up4 = self.merge4(C4, up5) + up4 = F.relu(up4) + + up3 = self.merge3(C3, up4) + up3 = F.relu(up3) + + up2 = self.merge2(C2, up3) + up2 = F.relu(up2) + + up1 = self.merge1(C1, up2) + up1 = F.relu(up1) + + return up1, up2, up3, up4, up5 diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/position_encoding.py b/IndicPhotoOCR/detection/textbpn/network/layers/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..73ae39edf24659e226dc6d96c7c5cbf8bef579ca --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/position_encoding.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Various positional encodings for the transformer. +""" +import math +import torch +from torch import nn + +from util.misc import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. 
+ """ + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = torch.cat([ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ('v2', 'sine'): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSine(N_steps, normalize=True) + elif args.position_embedding in ('v3', 'learned'): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/resnet.py b/IndicPhotoOCR/detection/textbpn/network/layers/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b6da7208d9e30bbe6022c3ae5b04e9f0c08f8483 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/resnet.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +from torchvision.models import resnet +import torch.utils.model_zoo as model_zoo +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', + +} + + +class ResNet(nn.Module): + def __init__(self, name="resnet50", pretrain=True): + super().__init__() + + if name == "resnet50": + base_net = resnet.resnet50(pretrained=False) + elif name == "resnet101": + base_net = resnet.resnet101(pretrained=False) + elif name == "resnet18": + base_net = resnet.resnet18(pretrained=False) + elif name == "resnet34": + base_net = resnet.resnet34(pretrained=False) + + else: + print(" base model is not support !") + + if pretrain: + print("load the {} weight from ./cache".format(name)) + base_net.load_state_dict(model_zoo.load_url(model_urls[name], model_dir="./cache", + map_location=torch.device(cfg.device)), strict=False) + # print(base_net) + self.stage1 = nn.Sequential( + base_net.conv1, + base_net.bn1, + base_net.relu, + base_net.maxpool + ) + self.stage2 = base_net.layer1 + self.stage3 = base_net.layer2 + self.stage4 = base_net.layer3 + self.stage5 = base_net.layer4 + self.up2 = nn.ConvTranspose2d(64, 64, kernel_size=4, stride=2, padding=1) + + def forward(self, x): + C1 = self.stage1(x) + C2 = self.stage2(C1) + C3 = self.stage3(C2) + C4 = self.stage4(C3) + C5 = self.stage5(C4) + + if cfg.scale == 2 or cfg.scale == 1: + # up2 --> 1/2 + C1 = self.up2(C1) + + return C1, C2, C3, C4, C5 + + +if __name__ == '__main__': + import torch + input = torch.randn((4, 3, 512, 512)) + net = ResNet() + C1, C2, C3, C4, C5 = net(input) + print(C1.size()) + print(C2.size()) + print(C3.size()) + 
print(C4.size()) + print(C5.size()) diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/resnet_dcn.py b/IndicPhotoOCR/detection/textbpn/network/layers/resnet_dcn.py new file mode 100644 index 0000000000000000000000000000000000000000..918b5b84b8448adbc221c0ae93de3c7db71cfd1c --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/resnet_dcn.py @@ -0,0 +1,59 @@ +import torch +import torch.nn as nn +from IndicPhotoOCR.detection.textbpn.network.backbone.resnet import deformable_resnet18,deformable_resnet50 +import torch.utils.model_zoo as model_zoo +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', + +} + + +class ResNet_DCN(nn.Module): + def __init__(self, name="deformable_resnet18", pretrain=False): + super().__init__() + + if name == "deformable_resnet18": + self.base_net = deformable_resnet18(pretrained=False) + if pretrain: + print("load the {} weight from ./cache".format(name)) + self.base_net.load_state_dict( + model_zoo.load_url(model_urls["resnet18"], model_dir="./cache", + map_location=torch.device(cfg.device)), strict=False) + + elif name == "deformable_resnet50": + self.base_net = deformable_resnet50(pretrained=False) + if pretrain: + print("load the {} weight from ./cache".format(name)) + self.base_net.load_state_dict( + model_zoo.load_url(model_urls["resnet50"], model_dir="./cache", + map_location=torch.device(cfg.device)), strict=False) + else: + print(" base model is not support !") + + # print(base_net) + self.up2 = nn.ConvTranspose2d(64, 64, kernel_size=4, stride=2, padding=1) + + def forward(self, x): + C1, C2, C3, C4, C5 = self.base_net(x) + # up2 --> 1/2 + C1 = self.up2(C1) + + return C1, C2, C3, C4, C5 + + +if __name__ == '__main__': + import torch + input = torch.randn((4, 3, 512, 512)) + net = ResNet_DCN() + C1, C2, C3, C4, C5 = net(input) + print(C1.size()) + print(C2.size()) + print(C3.size()) + print(C4.size()) + print(C5.size()) diff --git a/IndicPhotoOCR/detection/textbpn/network/layers/vgg.py b/IndicPhotoOCR/detection/textbpn/network/layers/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..c1796e65aeba98c320163b9aa1852f03bb25ef99 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/layers/vgg.py @@ -0,0 +1,62 @@ +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import torchvision.models as models +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg + +model_urls = { + 'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth', + 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', + 'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth', + 'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth', + 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', + 'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth', +} + + +class VggNet(nn.Module): + def __init__(self, name="vgg16", pretrain=True): + super().__init__() + if name == "vgg16": + base_net = models.vgg16(pretrained=False) + elif name == "vgg16_bn": + base_net = models.vgg16_bn(pretrained=False) + else: + print(" base model is not 
support !") + if pretrain: + print("load the {} weight from ./cache".format(name)) + base_net.load_state_dict(model_zoo.load_url(model_urls[name], + model_dir="./cache",map_location=torch.device(cfg.device))) + + if name == "vgg16": + self.stage1 = nn.Sequential(*[base_net.features[layer] for layer in range(0, 5)]) + self.stage2 = nn.Sequential(*[base_net.features[layer] for layer in range(5, 10)]) + self.stage3 = nn.Sequential(*[base_net.features[layer] for layer in range(10, 17)]) + self.stage4 = nn.Sequential(*[base_net.features[layer] for layer in range(17, 24)]) + self.stage5 = nn.Sequential(*[base_net.features[layer] for layer in range(24, 31)]) + elif name == "vgg16_bn": + self.stage1 = nn.Sequential(*[base_net.features[layer] for layer in range(0, 7)]) + self.stage2 = nn.Sequential(*[base_net.features[layer] for layer in range(7, 14)]) + self.stage3 = nn.Sequential(*[base_net.features[layer] for layer in range(14, 24)]) + self.stage4 = nn.Sequential(*[base_net.features[layer] for layer in range(24, 34)]) + self.stage5 = nn.Sequential(*[base_net.features[layer] for layer in range(34, 44)]) + + def forward(self, x): + C1 = self.stage1(x) + C2 = self.stage2(C1) + C3 = self.stage3(C2) + C4 = self.stage4(C3) + C5 = self.stage5(C4) + + return C1, C2, C3, C4, C5 + + +if __name__ == '__main__': + import torch + input = torch.randn((4, 3, 512, 512)) + net = VggNet() + C1, C2, C3, C4, C5 = net(input) + print(C1.size()) + print(C2.size()) + print(C3.size()) + print(C4.size()) + print(C5.size()) diff --git a/IndicPhotoOCR/detection/textbpn/network/loss.py b/IndicPhotoOCR/detection/textbpn/network/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..08392b9ba27e4bd9d49b87e8e90687e8e8249896 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/loss.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +# @Time : 10/1/21 +# @Author : GXYM +import torch +import torch.nn as nn +from cfglib.config import config as cfg +from network.Seg_loss import SegmentLoss +from network.Reg_loss import PolyMatchingLoss +import torch.nn.functional as F + + +class TextLoss(nn.Module): + + def __init__(self): + super().__init__() + self.MSE_loss = torch.nn.MSELoss(reduce=False, size_average=False) + self.BCE_loss = torch.nn.BCELoss(reduce=False, size_average=False) + self.PolyMatchingLoss = PolyMatchingLoss(cfg.num_points, cfg.device) + self.KL_loss = torch.nn.KLDivLoss(reduce=False, size_average=False) + + @staticmethod + def single_image_loss(pre_loss, loss_label): + batch_size = pre_loss.shape[0] + sum_loss = torch.mean(pre_loss.view(-1)) * 0 + pre_loss = pre_loss.view(batch_size, -1) + loss_label = loss_label.view(batch_size, -1) + eps = 0.001 + for i in range(batch_size): + average_number = 0 + positive_pixel = len(pre_loss[i][(loss_label[i] >= eps)]) + average_number += positive_pixel + if positive_pixel != 0: + posi_loss = torch.mean(pre_loss[i][(loss_label[i] >= eps)]) + sum_loss += posi_loss + if len(pre_loss[i][(loss_label[i] < eps)]) < 3 * positive_pixel: + nega_loss = torch.mean(pre_loss[i][(loss_label[i] < eps)]) + average_number += len(pre_loss[i][(loss_label[i] < eps)]) + else: + nega_loss = torch.mean(torch.topk(pre_loss[i][(loss_label[i] < eps)], 3 * positive_pixel)[0]) + average_number += 3 * positive_pixel + sum_loss += nega_loss + else: + nega_loss = torch.mean(torch.topk(pre_loss[i], 100)[0]) + average_number += 100 + sum_loss += nega_loss + # sum_loss += loss/average_number + + return sum_loss/batch_size + + def cls_ohem(self, predict, target, train_mask, 
negative_ratio=3.): + pos = (target * train_mask).bool() + neg = ((1 - target) * train_mask).bool() + + n_pos = pos.float().sum() + if n_pos.item() > 0: + loss_pos = self.BCE_loss(predict[pos], target[pos]).sum() + loss_neg = self.BCE_loss(predict[neg], target[neg]) + n_neg = min(int(neg.float().sum().item()), int(negative_ratio * n_pos.float())) + else: + loss_pos = torch.tensor(0.) + loss_neg = self.BCE_loss(predict[neg], target[neg]) + n_neg = 100 + loss_neg, _ = torch.topk(loss_neg, n_neg) + + return (loss_pos + loss_neg.sum()) / (n_pos + n_neg).float() + + @staticmethod + def loss_calc_flux(pred_flux, gt_flux, weight_matrix, mask, train_mask): + + # norm loss + gt_flux = 0.999999 * gt_flux / (gt_flux.norm(p=2, dim=1).unsqueeze(1) + 1e-3) + norm_loss = weight_matrix * torch.mean((pred_flux - gt_flux) ** 2, dim=1)*train_mask + norm_loss = norm_loss.sum(-1).mean() + # norm_loss = norm_loss.sum() + + # angle loss + mask = train_mask * mask + pred_flux = 0.999999 * pred_flux / (pred_flux.norm(p=2, dim=1).unsqueeze(1) + 1e-3) + # angle_loss = weight_matrix * (torch.acos(torch.sum(pred_flux * gt_flux, dim=1))) ** 2 + # angle_loss = angle_loss.sum(-1).mean() + angle_loss = (1 - torch.cosine_similarity(pred_flux, gt_flux, dim=1)) + angle_loss = angle_loss[mask].mean() + + return norm_loss, angle_loss + + @staticmethod + def get_poly_energy(energy_field, img_poly, ind, h, w): + img_poly = img_poly.clone().float() + img_poly[..., 0] = img_poly[..., 0] / (w / 2.) - 1 + img_poly[..., 1] = img_poly[..., 1] / (h / 2.) - 1 + + batch_size = energy_field.size(0) + gcn_feature = torch.zeros([img_poly.size(0), energy_field.size(1), img_poly.size(1)]).to(img_poly.device) + for i in range(batch_size): + poly = img_poly[ind == i].unsqueeze(0) + gcn_feature[ind == i] = torch.nn.functional.grid_sample(energy_field[i:i + 1], poly)[0].permute(1, 0, 2) + return gcn_feature + + def loss_energy_regularization(self, energy_field, img_poly, inds, h, w): + energys = [] + for i, py in enumerate(img_poly): + energy = self.get_poly_energy(energy_field.unsqueeze(1), py, inds, h, w) + energys.append(energy.squeeze(1).sum(-1)) + + regular_loss = torch.tensor(0.) + energy_loss = torch.tensor(0.) 
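+        # The loop below encourages each boundary refinement step to lower the sampled
+        # distance-field energy: regular_loss penalizes any increase over the previous
+        # iteration, and energy_loss penalizes boundaries whose energy stays above 0.01.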
+ for i, e in enumerate(energys[1:]): + regular_loss += torch.clamp(e - energys[i], min=0.0).mean() + energy_loss += torch.where(e <= 0.01, torch.tensor(0.), e).mean() + + return (energy_loss+regular_loss)/len(energys[1:]) + + def forward(self, input_dict, output_dict, eps=None): + """ + calculate boundary proposal network loss + """ + # tr_mask = tr_mask.permute(0, 3, 1, 2).contiguous() + + fy_preds = output_dict["fy_preds"] + py_preds = output_dict["py_preds"] + inds = output_dict["inds"] + + train_mask = input_dict['train_mask'] + tr_mask = input_dict['tr_mask'] > 0 + distance_field = input_dict['distance_field'] + direction_field = input_dict['direction_field'] + weight_matrix = input_dict['weight_matrix'] + gt_tags = input_dict['gt_points'] + + # # scale the prediction map + # fy_preds = F.interpolate(fy_preds, scale_factor=cfg.scale, mode='bilinear') + + if cfg.scale > 1: + train_mask = F.interpolate(train_mask.float().unsqueeze(1), + scale_factor=1/cfg.scale, mode='bilinear').squeeze().bool() + tr_mask = F.interpolate(tr_mask.float().unsqueeze(1), + scale_factor=1/cfg.scale, mode='bilinear').squeeze().bool() + + distance_field = F.interpolate(distance_field.unsqueeze(1), + scale_factor=1/cfg.scale, mode='bilinear').squeeze() + direction_field = F.interpolate(direction_field, + scale_factor=1 / cfg.scale, mode='bilinear') + weight_matrix = F.interpolate(weight_matrix.unsqueeze(1), + scale_factor=1/cfg.scale, mode='bilinear').squeeze() + + # pixel class loss + # cls_loss = self.cls_ohem(fy_preds[:, 0, :, :], tr_mask.float(), train_mask) + cls_loss = self.BCE_loss(fy_preds[:, 0, :, :], tr_mask.float()) + cls_loss = torch.mul(cls_loss, train_mask.float()).mean() + + # distance field loss + dis_loss = self.MSE_loss(fy_preds[:, 1, :, :], distance_field) + dis_loss = torch.mul(dis_loss, train_mask.float()) + dis_loss = self.single_image_loss(dis_loss, distance_field) + + # # direction field loss + norm_loss, angle_loss = self.loss_calc_flux(fy_preds[:, 2:4, :, :], direction_field, + weight_matrix, tr_mask, train_mask) + + # boundary point loss + point_loss = self.PolyMatchingLoss(py_preds[1:], gt_tags[inds]) + + # Minimum energy loss regularization + h, w = distance_field.size(1) * cfg.scale, distance_field.size(2) * cfg.scale + energy_loss = self.loss_energy_regularization(distance_field, py_preds, inds[0], h, w) + + if eps is None: + alpha = 1.0; beta = 3.0; theta=0.5; gama = 0.05 + else: + alpha = 1.0; beta = 3.0; theta=0.5; + gama = 0.1*torch.sigmoid(torch.tensor((eps - cfg.max_epoch)/cfg.max_epoch)) + loss = alpha*cls_loss + beta*dis_loss + theta*(norm_loss + angle_loss) + gama*(point_loss + energy_loss) + + loss_dict = { + 'total_loss': loss, + 'cls_loss': alpha*cls_loss, + 'distance loss': beta*dis_loss, + 'dir_loss': theta*(norm_loss + angle_loss), + 'norm_loss': theta*norm_loss, + 'angle_loss': theta*angle_loss, + 'point_loss': gama*point_loss, + 'energy_loss': gama*energy_loss, + + } + + return loss_dict + diff --git a/IndicPhotoOCR/detection/textbpn/network/loss_org.py b/IndicPhotoOCR/detection/textbpn/network/loss_org.py new file mode 100644 index 0000000000000000000000000000000000000000..ee12ea7acee5984cf89c6921228eaef0be8bb1bb --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/loss_org.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +# @Time : 10/1/21 +# @Author : GXYM +import torch +import torch.nn as nn +from cfglib.config import config as cfg +from network.Seg_loss import SegmentLoss +from network.Reg_loss import PolyMatchingLoss + + +class TextLoss(nn.Module): + + 
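+    # Original TextBPN training loss (kept alongside loss.py): pixel classification with
+    # OHEM, distance-field regression, direction-field (norm + angle) terms and the
+    # boundary-point matching loss, without the energy regularization used in loss.py.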
def __init__(self): + super().__init__() + self.MSE_loss = torch.nn.MSELoss(reduce=False, size_average=False) + self.BCE_loss = torch.nn.BCELoss(reduce=False, size_average=False) + self.PolyMatchingLoss = PolyMatchingLoss(cfg.num_points, cfg.device) + self.KL_loss = torch.nn.KLDivLoss(reduce=False, size_average=False) + + @staticmethod + def single_image_loss(pre_loss, loss_label): + batch_size = pre_loss.shape[0] + sum_loss = torch.mean(pre_loss.view(-1)) * 0 + pre_loss = pre_loss.view(batch_size, -1) + loss_label = loss_label.view(batch_size, -1) + eps = 0.001 + for i in range(batch_size): + average_number = 0 + positive_pixel = len(pre_loss[i][(loss_label[i] >= eps)]) + average_number += positive_pixel + if positive_pixel != 0: + posi_loss = torch.mean(pre_loss[i][(loss_label[i] >= eps)]) + sum_loss += posi_loss + if len(pre_loss[i][(loss_label[i] < eps)]) < 3 * positive_pixel: + nega_loss = torch.mean(pre_loss[i][(loss_label[i] < eps)]) + average_number += len(pre_loss[i][(loss_label[i] < eps)]) + else: + nega_loss = torch.mean(torch.topk(pre_loss[i][(loss_label[i] < eps)], 3 * positive_pixel)[0]) + average_number += 3 * positive_pixel + sum_loss += nega_loss + else: + nega_loss = torch.mean(torch.topk(pre_loss[i], 100)[0]) + average_number += 100 + sum_loss += nega_loss + # sum_loss += loss/average_number + + return sum_loss/batch_size + + def cls_ohem(self, predict, target, train_mask, negative_ratio=3.): + pos = (target * train_mask).bool() + neg = ((1 - target) * train_mask).bool() + + n_pos = pos.float().sum() + + if n_pos.item() > 0: + loss_pos = self.BCE_loss(predict[pos], target[pos]).sum() + loss_neg = self.BCE_loss(predict[neg], target[neg]) + n_neg = min(int(neg.float().sum().item()), int(negative_ratio * n_pos.float())) + else: + loss_pos = torch.tensor(0.) 
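+            # No positive pixels in this batch: skip the positive term and mine the
+            # 100 hardest negative pixels as a fixed fallback.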
+ loss_neg = self.BCE_loss(predict[neg], target[neg]) + n_neg = 100 + loss_neg, _ = torch.topk(loss_neg, n_neg) + + return (loss_pos + loss_neg.sum()) / (n_pos + n_neg).float() + + @staticmethod + def loss_calc_flux(pred_flux, gt_flux, weight_matrix, mask, train_mask): + + # norm loss + gt_flux = 0.999999 * gt_flux / (gt_flux.norm(p=2, dim=1).unsqueeze(1) + 1e-9) + norm_loss = weight_matrix * torch.sum((pred_flux - gt_flux) ** 2, dim=1)*train_mask + norm_loss = norm_loss.sum(-1).mean() + + # angle loss + mask = train_mask * mask + pred_flux = 0.999999 * pred_flux / (pred_flux.norm(p=2, dim=1).unsqueeze(1) + 1e-9) + # angle_loss = weight_matrix * (torch.acos(torch.sum(pred_flux * gt_flux, dim=1))) ** 2 + # angle_loss = angle_loss.sum(-1).mean() + angle_loss = (1 - torch.cosine_similarity(pred_flux, gt_flux, dim=1)) + angle_loss = angle_loss[mask].mean() + + return norm_loss, angle_loss + + def forward(self, input_dict, output_dict, eps=None): + """ + calculate boundary proposal network loss + """ + # tr_mask = tr_mask.permute(0, 3, 1, 2).contiguous() + + fy_preds = output_dict["fy_preds"] + py_preds = output_dict["py_preds"] + inds = output_dict["inds"] + + train_mask = input_dict['train_mask'] + tr_mask = input_dict['tr_mask'] > 0 + distance_field = input_dict['distance_field'] + direction_field = input_dict['direction_field'] + weight_matrix = input_dict['weight_matrix'] + gt_tags = input_dict['gt_points'] + + # pixel class loss + cls_loss = self.cls_ohem(fy_preds[:, 0, :, :], tr_mask.float(), train_mask.bool()) + + # distance field loss + dis_loss = self.MSE_loss(fy_preds[:, 1, :, :], distance_field) + dis_loss = torch.mul(dis_loss, train_mask.float()) + dis_loss = self.single_image_loss(dis_loss, distance_field) + + # direction field loss + norm_loss, angle_loss = self.loss_calc_flux(fy_preds[:, 2:4, :, :], + direction_field, weight_matrix, tr_mask, train_mask) + + # boundary point loss + point_loss = self.PolyMatchingLoss(py_preds, gt_tags[inds]) + + if eps is None: + loss_b = 0.05*point_loss + loss = cls_loss + 3.0*dis_loss + norm_loss + angle_loss + loss_b + else: + loss_b = 0.1*(torch.sigmoid(torch.tensor((eps - cfg.max_epoch)/cfg.max_epoch))) * point_loss + loss = cls_loss + 3.0*dis_loss + norm_loss + angle_loss + loss_b + + loss_dict = { + 'total_loss': loss, + 'cls_loss': cls_loss, + 'distance loss': 3.0*dis_loss, + 'dir_loss': norm_loss + angle_loss, + 'point_loss': loss_b, + 'norm_loss': norm_loss, + 'angle_loss': angle_loss, + + } + + return loss_dict + diff --git a/IndicPhotoOCR/detection/textbpn/network/textnet.py b/IndicPhotoOCR/detection/textbpn/network/textnet.py new file mode 100644 index 0000000000000000000000000000000000000000..dae70d8453fc606354a66295f1896a78b881477f --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/network/textnet.py @@ -0,0 +1,216 @@ +# -*- coding: utf-8 -*- +# @Time : 10/1/21 +# @Author : GXYM +import torch +import torch.nn as nn +from IndicPhotoOCR.detection.textbpn.network.layers.model_block import FPN +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg +import numpy as np +from IndicPhotoOCR.detection.textbpn.network.layers.CircConv import DeepSnake +from IndicPhotoOCR.detection.textbpn.network.layers.GCN import GCN +from IndicPhotoOCR.detection.textbpn.network.layers.RNN import RNN +from IndicPhotoOCR.detection.textbpn.network.layers.Adaptive_Deformation import AdaptiveDeformation +# from IndicPhotoOCR.detection.textbpn.network.layers.Transformer_old import Transformer_old +from 
IndicPhotoOCR.detection.textbpn.network.layers.Transformer import Transformer +import cv2 +from IndicPhotoOCR.detection.textbpn.util.misc import get_sample_point, fill_hole +from IndicPhotoOCR.detection.textbpn.network.layers.gcn_utils import get_node_feature, \ + get_adj_mat, get_adj_ind, coord_embedding, normalize_adj +import torch.nn.functional as F +import time + + +class Evolution(nn.Module): + def __init__(self, node_num, adj_num, is_training=True, device=None, model="snake"): + super(Evolution, self).__init__() + self.node_num = node_num + self.adj_num = adj_num + self.device = device + self.is_training = is_training + self.clip_dis = 16 + + self.iter = 3 + if model == "gcn": + self.adj = get_adj_mat(self.adj_num, self.node_num) + self.adj = normalize_adj(self.adj, type="DAD").float().to(self.device) + for i in range(self.iter): + evolve_gcn = GCN(36, 128) + self.__setattr__('evolve_gcn' + str(i), evolve_gcn) + elif model == "rnn": + self.adj = None + for i in range(self.iter): + evolve_gcn = RNN(36, 128) + self.__setattr__('evolve_gcn' + str(i), evolve_gcn) + elif model == "AD": + self.adj = get_adj_mat(self.adj_num, self.node_num) + self.adj = normalize_adj(self.adj, type="DAD").float().to(self.device) + for i in range(self.iter): + evolve_gcn = AdaptiveDeformation(36, 128) + self.__setattr__('evolve_gcn' + str(i), evolve_gcn) + # elif model == "BT_old": + # self.adj = None + # for i in range(self.iter): + # evolve_gcn = Transformer_old(36, 512, num_heads=8, + # dim_feedforward=2048, drop_rate=0.0, if_resi=True, block_nums=4) + # self.__setattr__('evolve_gcn' + str(i), evolve_gcn) + elif model == "BT": + self.adj = None + for i in range(self.iter): + evolve_gcn = Transformer(36, 128, num_heads=8, + dim_feedforward=1024, drop_rate=0.0, if_resi=True, block_nums=3) + self.__setattr__('evolve_gcn' + str(i), evolve_gcn) + else: + self.adj = get_adj_ind(self.adj_num, self.node_num, self.device) + for i in range(self.iter): + evolve_gcn = DeepSnake(state_dim=128, feature_dim=36, conv_type='dgrid') + self.__setattr__('evolve_gcn' + str(i), evolve_gcn) + + for m in self.modules(): + if isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv2d): + m.weight.data.normal_(0.0, 0.02) + # nn.init.kaiming_normal_(m.weight, mode='fan_in') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + @staticmethod + def get_boundary_proposal(input=None, seg_preds=None, switch="gt"): + + if switch == "gt": + inds = torch.where(input['ignore_tags'] > 0) + # if len(inds[0]) > 320: + # inds = (inds[0][:320], inds[1][:320]) + init_polys = input['proposal_points'][inds] + else: + tr_masks = input['tr_mask'].cpu().numpy() + tcl_masks = seg_preds[:, 0, :, :].detach().cpu().numpy() > cfg.threshold + inds = [] + init_polys = [] + for bid, tcl_mask in enumerate(tcl_masks): + ret, labels = cv2.connectedComponents(tcl_mask.astype(np.uint8), connectivity=8) + for idx in range(1, ret): + text_mask = labels == idx + ist_id = int(np.sum(text_mask*tr_masks[bid])/np.sum(text_mask))-1 + inds.append([bid, ist_id]) + poly = get_sample_point(text_mask, cfg.num_points, cfg.approx_factor) + init_polys.append(poly) + inds = torch.from_numpy(np.array(inds)).permute(1, 0).to(input["img"].device) + init_polys = torch.from_numpy(np.array(init_polys)).to(input["img"].device) + + return init_polys, inds, None + + def get_boundary_proposal_eval(self, input=None, seg_preds=None): + + # if cfg.scale > 1: + # seg_preds = F.interpolate(seg_preds, scale_factor=cfg.scale, mode='bilinear') + cls_preds = seg_preds[:, 0, :, 
:].detach().cpu().numpy() + dis_preds = seg_preds[:, 1, :, ].detach().cpu().numpy() + + inds = [] + init_polys = [] + confidences = [] + for bid, dis_pred in enumerate(dis_preds): + # # dis_mask = (dis_pred / np.max(dis_pred)) > cfg.dis_threshold + dis_mask = dis_pred > cfg.dis_threshold + # dis_mask = fill_hole(dis_mask) + ret, labels = cv2.connectedComponents(dis_mask.astype(np.uint8), connectivity=8, ltype=cv2.CV_16U) + for idx in range(1, ret): + text_mask = labels == idx + confidence = round(cls_preds[bid][text_mask].mean(), 3) + # 50 for MLT2017 and ArT (or DCN is used in backone); else is all 150; + # just can set to 50, which has little effect on the performance + if np.sum(text_mask) < 50/(cfg.scale*cfg.scale) or confidence < cfg.cls_threshold: + continue + confidences.append(confidence) + inds.append([bid, 0]) + + poly = get_sample_point(text_mask, cfg.num_points, + cfg.approx_factor, scales=np.array([cfg.scale, cfg.scale])) + init_polys.append(poly) + + if len(inds) > 0: + inds = torch.from_numpy(np.array(inds)).permute(1, 0).to(input["img"].device, non_blocking=True) + init_polys = torch.from_numpy(np.array(init_polys)).to(input["img"].device, non_blocking=True).float() + else: + init_polys = torch.from_numpy(np.array(init_polys)).to(input["img"].device, non_blocking=True).float() + inds = torch.from_numpy(np.array(inds)).to(input["img"].device, non_blocking=True) + + return init_polys, inds, confidences + + def evolve_poly(self, snake, cnn_feature, i_it_poly, ind): + if len(i_it_poly) == 0: + return torch.zeros_like(i_it_poly) + h, w = cnn_feature.size(2)*cfg.scale, cnn_feature.size(3)*cfg.scale + node_feats = get_node_feature(cnn_feature, i_it_poly, ind, h, w) + i_poly = i_it_poly + torch.clamp(snake(node_feats, self.adj).permute(0, 2, 1), -self.clip_dis, self.clip_dis) + if self.is_training: + i_poly = torch.clamp(i_poly, 0, w-1) + else: + i_poly[:, :, 0] = torch.clamp(i_poly[:, :, 0], 0, w - 1) + i_poly[:, :, 1] = torch.clamp(i_poly[:, :, 1], 0, h - 1) + return i_poly + + def forward(self, embed_feature, input=None, seg_preds=None, switch="gt"): + if self.is_training: + init_polys, inds, confidences = self.get_boundary_proposal(input=input, seg_preds=seg_preds, switch=switch) + # TODO sample fix number + else: + init_polys, inds, confidences = self.get_boundary_proposal_eval(input=input, seg_preds=seg_preds) + if init_polys.shape[0] == 0: + return [init_polys for i in range(self.iter+1)], inds, confidences + + py_preds = [init_polys, ] + for i in range(self.iter): + evolve_gcn = self.__getattr__('evolve_gcn' + str(i)) + init_polys = self.evolve_poly(evolve_gcn, embed_feature, init_polys, inds[0]) + py_preds.append(init_polys) + + return py_preds, inds, confidences + + +class TextNet(nn.Module): + + def __init__(self, backbone='vgg', is_training=True): + super().__init__() + self.is_training = is_training + self.backbone_name = backbone + self.fpn = FPN(self.backbone_name, is_training=(not cfg.resume and is_training)) + + self.seg_head = nn.Sequential( + nn.Conv2d(32, 16, kernel_size=3, padding=2, dilation=2), + nn.PReLU(), + nn.Conv2d(16, 16, kernel_size=3, padding=4, dilation=4), + nn.PReLU(), + nn.Conv2d(16, 4, kernel_size=1, stride=1, padding=0), + ) + self.BPN = Evolution(cfg.num_points, adj_num=4, + is_training=is_training, device=cfg.device, model="BT") + + def load_model(self, model_path): + print('Loading from {}'.format(model_path)) + state_dict = torch.load(model_path, map_location=torch.device(cfg.device)) + self.load_state_dict(state_dict['model'], strict=(not 
self.is_training)) + + def forward(self, input_dict, test_speed=False): + output = {} + b, c, h, w = input_dict["img"].shape + if self.is_training or cfg.exp_name in ['ArT', 'MLT2017', "MLT2019"] or test_speed: + image = input_dict["img"] + else: + image = torch.zeros((b, c, cfg.test_size[1], cfg.test_size[1]), dtype=torch.float32).to(cfg.device) + image[:, :, :h, :w] = input_dict["img"][:, :, :, :] + + up1, _, _, _, _ = self.fpn(image) + up1 = up1[:, :, :h // cfg.scale, :w // cfg.scale] + + preds = self.seg_head(up1) + fy_preds = torch.cat([torch.sigmoid(preds[:, 0:2, :, :]), preds[:, 2:4, :, :]], dim=1) + cnn_feats = torch.cat([up1, fy_preds], dim=1) + + py_preds, inds, confidences = self.BPN(cnn_feats, input=input_dict, seg_preds=fy_preds, switch="gt") + + output["fy_preds"] = fy_preds + output["py_preds"] = py_preds + output["inds"] = inds + output["confidences"] = confidences + + return output diff --git a/IndicPhotoOCR/detection/textbpn/output.png b/IndicPhotoOCR/detection/textbpn/output.png new file mode 100644 index 0000000000000000000000000000000000000000..865e3cd37791f717b44344b2e9971898f4004b30 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b8104e8e5d470e051d4b568214e3591e098f2677d1c0e1a0d6594e2d049636 +size 8695740 diff --git a/IndicPhotoOCR/detection/textbpn/textbpnpp_detector.py b/IndicPhotoOCR/detection/textbpn/textbpnpp_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..f360a23918e941e97ce52a3cf0695ad774f44f57 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/textbpnpp_detector.py @@ -0,0 +1,197 @@ +import torch +import cv2 +import numpy as np +from IndicPhotoOCR.detection.textbpn.network.textnet import TextNet +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg +import warnings +import os +import requests +from tqdm import tqdm + +# Suppress warnings +warnings.filterwarnings("ignore") + +model_info = { + "textbpnpp": { + "path": "models/TextBPN_resnet50_300.pth", + "url" : "https://github.com/Bhashini-IITJ/SceneTextDetection/releases/download/TextBPN%2B%2B/TextBPN_resnet50_300.pth", + }, + "textbpnpp_deformable": { + "path":"models/TextBPN_deformable_resnet50_300.pth", + "url": "https://github.com/Bhashini-IITJ/SceneTextDetection/releases/download/TextBPN%2B%2B/TextBPN_deformable_resnet50_300.pth", + }, + "textbpn_resnet18" : { + "path":"models/TextBPN_resnet18_300.pth", + "url": "https://github.com/Bhashini-IITJ/SceneTextDetection/releases/download/TextBPN%2B%2B/TextBPN_resnet18_300.pth", + + } +} + # Ensure model file exists; download directly if not +def ensure_model(model_name): + model_path = model_info[model_name]["path"] + url = model_info[model_name]["url"] + root_model_dir = "IndicPhotoOCR/detection/textbpn" + model_path = os.path.join(root_model_dir, model_path) + + if not os.path.exists(model_path): + print(f"Model not found locally. 
Downloading {model_name} from {url}...") + + # Start the download with a progress bar + response = requests.get(url, stream=True) + total_size = int(response.headers.get('content-length', 0)) + os.makedirs(f"{root_model_dir}/models", exist_ok=True) + + with open(model_path, "wb") as f, tqdm( + desc=model_name, + total=total_size, + unit='B', + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in response.iter_content(chunk_size=1024): + f.write(data) + bar.update(len(data)) + + print(f"Downloaded model for {model_name}.") + + return model_path + +class TextBPNpp_detector: + def __init__(self, model_name="textbpnpp", backbone="resnet50", device="cpu"): + """ + Initialize the TextBPN model. + :param model_path: Path to the pre-trained model. + :param backbone: Backbone architecture (default: "resnet50"). + :param device: Device to run the model on (default: "cpu"). + """ + self.model_path = ensure_model(model_name) + self.device = torch.device(device) + self.model = TextNet(is_training=False, backbone=backbone) + self.model.load_model(self.model_path) + self.model.eval() + self.model.to(self.device) + + @staticmethod + def to_device(tensor, device): + """ + Move tensor to the specified device. + :param tensor: Tensor to move. + :param device: Target device. + :return: Tensor on the target device. + """ + return tensor.to(device, non_blocking=True) + + @staticmethod + def pad_image(image, stride=32): + """ + Pad the image to make its dimensions divisible by the stride. + :param image: Input image. + :param stride: Stride size. + :return: Padded image and original dimensions. + """ + h, w = image.shape[:2] + new_h = (h + stride - 1) // stride * stride + new_w = (w + stride - 1) // stride * stride + padded_image = cv2.copyMakeBorder( + image, 0, new_h - h, 0, new_w - w, cv2.BORDER_CONSTANT, value=(0, 0, 0) + ) + return padded_image, (h, w) + + @staticmethod + def rescale_result(image, bbox_contours, original_height, original_width): + """ + Rescale the bounding box contours to the original image size. + :param image: Image after resizing. + :param bbox_contours: Bounding box contours. + :param original_height: Original image height. + :param original_width: Original image width. + :return: Original image and rescaled contours. + """ + contours = [] + for cont in bbox_contours: + cont[:, 0] = (cont[:, 0] * original_width / image.shape[1]).astype(int) + cont[:, 1] = (cont[:, 1] * original_height / image.shape[0]).astype(int) + contours.append(cont) + return contours + + def detect(self, image_path): + """ + Perform text detection on the given image. + :param image_path: Path to the input image. + :return: Dictionary with detection results. 
+ """ + image = cv2.imread(image_path) + if image is None: + raise ValueError(f"Failed to read the image at {image_path}") + + padded_image, original_size = self.pad_image(image) + padded_tensor = ( + torch.from_numpy(padded_image).permute(2, 0, 1).float() / 255.0 + ).unsqueeze(0) # Convert to tensor and add batch dimension + + cfg.test_size = [padded_image.shape[0], padded_image.shape[1]] + + input_dict = {"img": self.to_device(padded_tensor, self.device)} + with torch.no_grad(): + output_dict = self.model(input_dict, padded_image.shape) + + contours = output_dict["py_preds"][-1].int().cpu().numpy() + contours = self.rescale_result(image, contours, *original_size) + + bbox_result_dict = {"detections": []} + for contour in contours: + # x_min, y_min = np.min(contour, axis=0) + # x_max, y_max = np.max(contour, axis=0) + # bbox_result_dict["detections"].append([x_min, y_min, x_max, y_max]) + bbox_result_dict["detections"].append(contour.tolist()) + + return bbox_result_dict + + def visualize_detections(self, image_path, bbox_result_dict, output_path="output.png"): + """ + Visualize detections on the image. + :param image_path: Path to the input image. + :param bbox_result_dict: Detection results in the format: + {'detections': [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ...]}. + :param output_path: Path to save the visualized image. If None, the image is only displayed. + """ + # Load the image + image = cv2.imread(image_path) + if image is None: + raise ValueError(f"Failed to read the image at {image_path}") + + # Draw each detection + for bbox in bbox_result_dict.get("detections", []): + points = np.array(bbox, dtype=np.int32) # Convert to numpy array + cv2.polylines(image, [points], isClosed=True, color=(0, 255, 0), thickness=2) + + # Display or save the visualized image + if output_path: + cv2.imwrite(output_path, image) + print(f"Visualization saved to {output_path}") + else: + cv2.imshow("Detections", image) + cv2.waitKey(0) + cv2.destroyAllWindows() + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description='Text detection using EAST model') + parser.add_argument('--image_path', type=str, required=True, help='Path to the input image') + parser.add_argument('--device', type=str, default='cpu', help='Device to run the model on, e.g., "cpu" or "cuda"') + parser.add_argument('--model_name', type=str, required=True, help='Path to the model checkpoint file') + args = parser.parse_args() + + + + # model_path = "/DATA1/ocrteam/anik/git/IndicPhotoOCR/IndicPhotoOCR/detection/textbpn/models/TextBPN_resnet50_300.pth" + # image_path = "/DATA1/ocrteam/anik/splitonBSTD/detection/D/image_542.jpg" + + detector = TextBPNpp_detector(args.model_name, device="cpu") + result = detector.detect(args.image_path) + print(result) + # detector.visualize_detections(image_path, result) + + # python -m IndicPhotoOCR.detection.textbpn.textbpnpp_detector \ + # --image_path /DATA1/ocrteam/anik/splitonBSTD/detection/D/image_542.jpg \ + # --model_name textbpnpp \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/util/__init__.py b/IndicPhotoOCR/detection/textbpn/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4df2e390c73c946d5974e63c65456acc55b7a080 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/__init__.py @@ -0,0 +1,2 @@ +from .visualize import * +from .pbox import * diff --git a/IndicPhotoOCR/detection/textbpn/util/augmentation.py b/IndicPhotoOCR/detection/textbpn/util/augmentation.py new file mode 100644 index 
0000000000000000000000000000000000000000..09fe7f348f10d44903c5e0914b65903d91280b08 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/augmentation.py @@ -0,0 +1,794 @@ +# -*- coding: utf-8 -*- +__author__ = "S.X.Zhang" +import numpy as np +import math +import cv2 +import copy +import numpy.random as random +from shapely.geometry import Polygon +import torchvision.transforms as transforms +import torchvision.transforms.functional as F +from PIL import ImageEnhance, Image + + +###<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<### +###<<<<<<<<< Function >>>>>>>>>>>>### +###>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>### +def crop_first(image, polygons, scale =10): + polygons_new = copy.deepcopy(polygons) + h, w, _ = image.shape + pad_h = h // scale + pad_w = w // scale + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + + text_polys = [] + pos_polys = [] + for polygon in polygons_new: + rect = cv2.minAreaRect(polygon.points.astype(np.int32)) + box = cv2.boxPoints(rect) + box = np.int0(box) + text_polys.append([box[0], box[1], box[2], box[3]]) + if polygon.label != -1: + pos_polys.append([box[0], box[1], box[2], box[3]]) + + polys = np.array(text_polys, dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) # 四舍五入 + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text 保证截取区域不会横穿文字 + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + pp_polys = np.array(pos_polys, dtype=np.int32) + + return h_axis, w_axis, pp_polys + +####<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#### +####<<<<<<<<<<< Class >>>>>>>>>>>>>#### +####>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>#### +class Compose(object): + """Composes several augmentations together. + Args: + transforms (List[Transform]): list of transforms to compose. 
+ Example: + >>> augmentations.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, pts=None): + for t in self.transforms: + img, pts = t(img, pts) + return img, pts + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = np.array(mean) + self.std = np.array(std) + + def __call__(self, image, polygons=None): + image = image.astype(np.float32) + image /= 255.0 + image -= self.mean + image /= self.std + return image, polygons + + +class MinusMean(object): + def __init__(self, mean): + self.mean = np.array(mean) + + def __call__(self, image, polygons=None): + image = image.astype(np.float32) + image -= self.mean + return image, polygons + + +class RandomMirror(object): + # 镜像 + def __init__(self): + pass + + def __call__(self, image, polygons=None): + if polygons is None: + return image, polygons + if random.random()< 0.3: + image = np.ascontiguousarray(image[:, ::-1]) + _, width, _ = image.shape + for polygon in polygons: + polygon.points[:, 0] = width - polygon.points[:, 0] + return image, polygons + + +class AugmentColor(object): + # 颜色增强(添加噪声) + def __init__(self): + self.U = np.array([[-0.56543481, 0.71983482, 0.40240142], + [-0.5989477, -0.02304967, -0.80036049], + [-0.56694071, -0.6935729, 0.44423429]], dtype=np.float32) + self.EV = np.array([1.65513492, 0.48450358, 0.1565086], dtype=np.float32) + self.sigma = 0.1 + self.color_vec = None + + def __call__(self, img, polygons=None): + color_vec = self.color_vec + if self.color_vec is None: + if not self.sigma > 0.0: + color_vec = np.zeros(3, dtype=np.float32) + else: + color_vec = np.random.normal(0.0, self.sigma, 3) + + alpha = color_vec.astype(np.float32) * self.EV + noise = np.dot(self.U, alpha.T) * 255 + return np.clip(img + noise[np.newaxis, np.newaxis, :], 0, 255), polygons + + +class RandomContrast(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." 
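+    # Applied with probability 1/2: scales pixel intensities by a factor drawn uniformly
+    # from [lower, upper], then clips the result back to the valid [0, 255] range.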
+
+    # expects float image
+    def __call__(self, image, polygons=None):
+        if random.randint(2):
+            alpha = random.uniform(self.lower, self.upper)
+            image *= alpha
+        return np.clip(image, 0, 255), polygons
+
+
+class RandomBrightness(object):
+    def __init__(self, delta=32):
+        assert delta >= 0.0
+        assert delta <= 255.0
+        self.delta = delta
+
+    def __call__(self, image, polygons=None):
+        image = image.astype(np.float32)
+        if random.randint(2):
+            delta = random.uniform(-self.delta, self.delta)
+            image += delta
+        return np.clip(image, 0, 255), polygons
+
+
+class RandomErasing(object):
+    def __init__(self, sr=(0.0004, 0.01), scale=(0.5, 3), ratio=0.2, Type="Erasing"):
+        """
+        :param sr: range of the erased area as a fraction of the image area
+        :param scale: range of the aspect ratio of the erased patch
+        :param ratio: probability of keeping the image unchanged
+        :param Type: "Erasing" (random colour patch) or "Cutout" (random grey patch)
+        """
+        self.sr = sr
+        self.scale = scale
+        self.ratio = ratio
+        self.type = Type
+
+    def __call__(self, img, polygons=None):
+
+        if random.random() < self.ratio:
+            return img, polygons
+        area = img.shape[0] * img.shape[1]
+        target_area = random.uniform(*self.sr) * area
+        aspect_ratio = random.uniform(*self.scale)
+        h = int(round(math.sqrt(target_area / aspect_ratio)))
+        w = int(round(math.sqrt(target_area * aspect_ratio)))
+
+        if w < img.shape[1] and h < img.shape[0]:
+            x1 = random.randint(0, img.shape[1] - w)
+            y1 = random.randint(0, img.shape[0] - h)
+            if self.type == "Erasing":
+                color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+                img[y1:y1 + h, x1:x1 + w, :] = color
+            else:
+                Gray_value = random.randint(0, 255)
+                color = (Gray_value, Gray_value, Gray_value)
+                img[y1:y1 + h, x1:x1 + w, :] = color
+
+        return img, polygons
+
+
+class RandomMixUp(object):
+    def __init__(self, mixup_alpha=2):
+        self.mixup_alpha = mixup_alpha
+
+    def __call__(self, img1, img2, label1=[], label2=[]):
+        beta = np.random.beta(self.mixup_alpha, self.mixup_alpha)
+
+        # image = img1 * beta + (1 - beta) * img2
+        image = cv2.addWeighted(img1, beta, img2, (1 - beta), 0)
+
+        if label1 is None or label2 is None:
+            return img1, label1
+        if isinstance(label1, list) and isinstance(label2, list):
+            label = []
+            for id in range(len(label1)):
+                lab = beta * label1[id] + (1 - beta) * label2[id]
+                label.append(lab)
+            return image, label
+        else:
+            print("Error: label is not a list type")
+
+        return img1, label1
+
+
+class Rotate(object):
+    def __init__(self, up=30):
+        self.up = up
+
+    @staticmethod
+    def rotate(center, pt, theta):  # 2-D rotation of points about the given centre
+        xr, yr = center
+        yr = -yr
+        x, y = pt[:, 0], pt[:, 1]
+        y = -y
+
+        theta = theta / 180 * math.pi
+        cos = math.cos(theta)
+        sin = math.sin(theta)
+
+        _x = xr + (x - xr) * cos - (y - yr) * sin
+        _y = yr + (x - xr) * sin + (y - yr) * cos
+
+        return _x, -_y
+
+    def __call__(self, img, polygons=None):
+        if np.random.randint(2):
+            return img, polygons
+        angle = np.random.normal(loc=0.0, scale=0.5) * self.up  # angle drawn from a Gaussian distribution
+        rows, cols = img.shape[0:2]
+        M = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1.0)
+        img = cv2.warpAffine(img, M, (cols, rows), borderValue=[0, 0, 0])
+        center = cols / 2.0, rows / 2.0
+        if polygons is not None:
+            for polygon in polygons:
+                x, y = self.rotate(center, polygon.points, angle)
+                pts = np.vstack([x, y]).T
+                polygon.points = pts
+        return img, polygons
+
+
+class RotatePadding(object):
+    def __init__(self, up=60, colors=True):
+        self.up = up
+        self.colors = colors
+        self.ratio = 0.5
+
+    @staticmethod
+    def rotate(center, pt, theta, movSize=[0, 0], scale=1):  # 2-D rotation of points about the given centre
+        (xr, yr) = center
+        yr = -yr
+        x, y = pt[:, 0], pt[:, 1]
+        y = -y
+
+        theta = theta / 180 * math.pi
+        cos = math.cos(theta)
+        sin = math.sin(theta)
+
+        x = (x - xr) * scale
+        y = (y - yr) *
scale + + _x = xr + x * cos - y * sin + movSize[0] + _y = -(yr + x * sin + y * cos) + movSize[1] + + return _x, _y + + @staticmethod + def shift(size, degree): + angle = degree * math.pi / 180.0 + width = size[0] + height = size[1] + + alpha = math.cos(angle) + beta = math.sin(angle) + new_width = int(width * math.fabs(alpha) + height * math.fabs(beta)) + new_height = int(width * math.fabs(beta) + height * math.fabs(alpha)) + + size = [new_width, new_height] + return size + + def __call__(self, image, polygons=None, scale=1.0): + if np.random.random() <= self.ratio: + return image, polygons + angle = np.random.normal(loc=0.0, scale=0.5) * self.up # angle 按照高斯分布 + rows, cols = image.shape[0:2] + center = (cols / 2.0, rows / 2.0) + newSize = self.shift([cols * scale, rows * scale], angle) + movSize = [int((newSize[0] - cols) / 2), int((newSize[1] - rows) / 2)] + + M = cv2.getRotationMatrix2D(center, angle, scale) + M[0, 2] += int((newSize[0] - cols) / 2) + M[1, 2] += int((newSize[1] - rows) / 2) + + if self.colors: + H, W, _ = image.shape + mask = np.zeros_like(image) + (h_index, w_index) = (np.random.randint(0, H * 7 // 8), np.random.randint(0, W * 7 // 8)) + img_cut = image[h_index:(h_index + H // 9), w_index:(w_index + W // 9)] + img_cut = cv2.resize(img_cut, (newSize[0], newSize[1])) + mask = cv2.warpAffine(mask, M, (newSize[0], newSize[1]), borderValue=[1, 1, 1]) + image = cv2.warpAffine(image, M, (newSize[0], newSize[1]), borderValue=[0,0,0]) + image=image+img_cut*mask + else: + color = [0, 0, 0] + image = cv2.warpAffine(image, M, (newSize[0], newSize[1]), borderValue=color) + + if polygons is not None: + for polygon in polygons: + x, y = self.rotate(center, polygon.points, angle,movSize,scale) + pts = np.vstack([x, y]).T + polygon.points = pts + return image, polygons + + +class SquarePadding(object): + + def __call__(self, image, polygons=None): + + H, W, _ = image.shape + + if H == W: + return image, polygons + + padding_size = max(H, W) + (h_index, w_index) = (np.random.randint(0, H*7//8),np.random.randint(0, W*7//8)) + img_cut = image[h_index:(h_index+H//9),w_index:(w_index+W//9)] + expand_image = cv2.resize(img_cut,(padding_size, padding_size)) + #expand_image = np.zeros((padding_size, padding_size, 3), dtype=image.dtype) + #expand_image=img_cut[:,:,:] + if H > W: + y0, x0 = 0, (H - W) // 2 + else: + y0, x0 = (W - H) // 2, 0 + if polygons is not None: + for polygon in polygons: + polygon.points += np.array([x0, y0]) + expand_image[y0:y0+H, x0:x0+W] = image + image = expand_image + + return image, polygons + + +class RandomImgCropPatch(object): + def __init__(self, up=30, beta=0.3): + self.up = up + self.beta=0.3 + self.scale = 10 + + @staticmethod + def get_contour_min_area_box(contour): + rect = cv2.minAreaRect(contour) + box = cv2.boxPoints(rect) + box = np.int0(box) + return box + + def CropWH(self, image, cut_w, cut_h, polygons=None): + h_axis, w_axis, polys = crop_first(image, polygons, scale=self.scale) + h, w, _ = image.shape + pad_h = h // self.scale + pad_w = w // self.scale + # TODO try Flip + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = xmin + cut_w + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = ymin + cut_h + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ + & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) + selected_polys = np.where(np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + + cropped = image[ymin:ymax + 1, xmin:xmax + 
1, :] + polygons_new = [] + for idx in selected_polys: + polygon = polygons[idx] + polygon.points -= np.array([xmin, ymin]) + polygons_new.append(polygon) + image = cropped + polygon = polygons_new + + return image, polygon + + def __call__(self, images, polygons_list=None): + I_x, I_y = 1024,1024 + + w = int(round(I_x * random.beta(self.beta, self.beta))) + h = int(round(I_y * random.beta(self.beta, self.beta))) + w_ = [w, I_x - w, w, I_x - w] + h_ = [h, h, I_y - h, I_y - h] + new_img = np.zeros((I_x, I_y, 3), dtype=images[0].dtype) + imgs=[] + new_polygons=[] + for i, im in enumerate(images): + img, polygons = self.CropWH(im, w_[i], h_[i], polygons=polygons_list[i]) + imgs.append(img) + new_polygons.append(polygons) + new_img[0:w, 0:h, :] = imgs[0] + new_img[w:I_x, 0:h, :] = imgs[1] + new_img[0:w, h:I_y, :] = imgs[2] + new_img[w:I_x, h:I_y, :] = imgs[3] + for polygon in new_polygons[1]: + polygon.points += np.array([w, 0]) + for polygon in new_polygons[2]: + polygon.points += np.array([0, h]) + for polygon in new_polygons[3]: + polygon.points += np.array([w, h]) + + polygons=new_polygons[0]+new_polygons[1]+new_polygons[2]+new_polygons[3] + + return new_img, polygons + + +class RandomCropFlip(object): + + def __init__(self, min_crop_side_ratio=0.01): + self.scale = 10 + self.ratio = 0.2 + self.epsilon = 10.0 + self.min_crop_side_ratio = min_crop_side_ratio + + def __call__(self, image, polygons=None): + + if polygons is None: + return image, polygons + + if np.random.random() <= self.ratio: + return image, polygons + + # 计算 有效的Crop区域, 方便选取有效的种子点 + h_axis, w_axis, pp_polys = crop_first(image, polygons, scale =self.scale) + if len(h_axis) == 0 or len(w_axis) == 0: + return image, polygons + + # TODO try crop + attempt = 0 + h, w, _ = image.shape + area = h * w + pad_h = h // self.scale + pad_w = w // self.scale + while attempt < 10: + attempt += 1 + polygons_new = [] + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if (xmax - xmin) * (ymax - ymin) < area * self.min_crop_side_ratio: + # area too small + continue + + pts = np.stack([[xmin, xmax, xmax, xmin], [ymin, ymin, ymax, ymax]]).T.astype(np.int32) + pp = Polygon(pts).buffer(0) + Fail_flag = False + for polygon in polygons: + ppi = Polygon(polygon.points).buffer(0) + ppiou = float(ppi.intersection(pp).area) + if np.abs(ppiou - float(ppi.area)) > self.epsilon and np.abs(ppiou) > self.epsilon: + Fail_flag = True + break + if np.abs(ppiou - float(ppi.area)) < self.epsilon: + polygons_new.append(polygon) + + if Fail_flag: + continue + else: + break + + if len(polygons_new) == 0: + cropped = image[ymin:ymax, xmin:xmax, :] + select_type = random.randint(3) + if select_type == 0: + img = np.ascontiguousarray(cropped[:, ::-1]) + elif select_type == 1: + img = np.ascontiguousarray(cropped[::-1, :]) + else: + img = np.ascontiguousarray(cropped[::-1, ::-1]) + image[ymin:ymax, xmin:xmax, :] = img + return image, polygons + + else: + cropped = image[ymin:ymax, xmin:xmax, :] + height, width, _ = cropped.shape + select_type = random.randint(3) + if select_type == 0: + img = np.ascontiguousarray(cropped[:, ::-1]) + for polygon in polygons_new: + polygon.points[:, 0] = width - polygon.points[:, 0] + 2 * xmin + elif select_type == 1: + img = np.ascontiguousarray(cropped[::-1, :]) + 
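+                # select_type == 1 is a vertical flip of the crop: mirror the polygon
+                # y-coordinates about the centre of the crop window [ymin, ymax].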
+                for polygon in polygons_new:
+                    polygon.points[:, 1] = height - polygon.points[:, 1] + 2 * ymin
+            else:
+                img = np.ascontiguousarray(cropped[::-1, ::-1])
+                for polygon in polygons_new:
+                    polygon.points[:, 0] = width - polygon.points[:, 0] + 2 * xmin
+                    polygon.points[:, 1] = height - polygon.points[:, 1] + 2 * ymin
+            image[ymin:ymax, xmin:xmax, :] = img
+
+        return image, polygons
+
+
+class RandomResizedCrop(object):
+    def __init__(self, min_crop_side_ratio=0.1):
+        self.scale = 10
+        self.epsilon = 1e-2
+        self.min_crop_side_ratio = min_crop_side_ratio
+
+    def __call__(self, image, polygons):
+
+        if polygons is None:
+            return image, polygons
+
+        # compute the valid crop region so that valid seed points can be selected
+        h_axis, w_axis, pp_polys = crop_first(image, polygons, scale=self.scale)
+        if len(h_axis) == 0 or len(w_axis) == 0:
+            return image, polygons
+
+        # TODO try crop
+        attempt = 0
+        h, w, _ = image.shape
+        area = h * w
+        pad_h = h // self.scale
+        pad_w = w // self.scale
+        while attempt < 10:
+            attempt += 1
+            xx = np.random.choice(w_axis, size=2)
+            xmin = np.min(xx) - pad_w
+            xmax = np.max(xx) - pad_w
+            xmin = np.clip(xmin, 0, w - 1)
+            xmax = np.clip(xmax, 0, w - 1)
+            yy = np.random.choice(h_axis, size=2)
+            ymin = np.min(yy) - pad_h
+            ymax = np.max(yy) - pad_h
+            ymin = np.clip(ymin, 0, h - 1)
+            ymax = np.clip(ymax, 0, h - 1)
+            if (xmax - xmin) * (ymax - ymin) < area * self.min_crop_side_ratio:
+                # area too small
+                continue
+            if pp_polys.shape[0] != 0:
+                poly_axis_in_area = (pp_polys[:, :, 0] >= xmin) & (pp_polys[:, :, 0] <= xmax) \
+                                    & (pp_polys[:, :, 1] >= ymin) & (pp_polys[:, :, 1] <= ymax)
+                selected_polys = np.where(np.sum(poly_axis_in_area, axis=1) == 4)[0]
+            else:
+                selected_polys = []
+
+            if len(selected_polys) == 0:
+                continue
+            else:
+                pts = np.stack([[xmin, xmax, xmax, xmin], [ymin, ymin, ymax, ymax]]).T.astype(np.int32)
+                pp = Polygon(pts).buffer(0)
+                polygons_new = []
+                Fail_flag = False
+                for polygon in copy.deepcopy(polygons):
+                    ppi = Polygon(polygon.points).buffer(0)
+                    ppiou = float(ppi.intersection(pp).area)
+                    if np.abs(ppiou - float(ppi.area)) > self.epsilon and np.abs(ppiou) > self.epsilon:
+                        Fail_flag = True
+                        break
+                    elif np.abs(ppiou - float(ppi.area)) < self.epsilon:
+                        # polygon.points -= np.array([xmin, ymin])
+                        polygons_new.append(polygon)
+
+                if Fail_flag:
+                    continue
+                else:
+                    cropped = image[ymin:ymax + 1, xmin:xmax + 1, :]
+                    for polygon in polygons_new:
+                        polygon.points -= np.array([xmin, ymin])
+
+                    return cropped, polygons_new
+
+        return image, polygons
+
+
+class RandomResizeScale(object):
+    def __init__(self, size=512, ratio=(3./4, 5./2)):
+        self.size = size
+        self.ratio = ratio
+
+    def __call__(self, image, polygons=None):
+
+        aspect_ratio = np.random.uniform(self.ratio[0], self.ratio[1])
+        h, w, _ = image.shape
+        scales = self.size * 1.0 / max(h, w)
+        aspect_ratio = scales * aspect_ratio
+        aspect_ratio = int(w * aspect_ratio) * 1.0 / w
+        image = cv2.resize(image, (int(w * aspect_ratio), int(h * aspect_ratio)))
+        scales = np.array([aspect_ratio, aspect_ratio])
+        if polygons is not None:
+            for polygon in polygons:
+                polygon.points = polygon.points * scales
+
+        return image, polygons
+
+
+class Resize(object):
+    def __init__(self, size=1024):
+        self.size = size
+        self.SP = SquarePadding()
+
+    def __call__(self, image, polygons=None):
+        h, w, _ = image.shape
+        image = cv2.resize(image, (self.size,
img_size_min = min(h, w) + img_size_max = max(h, w) + + if img_size_min < self.size[0]: + im_scale = float(self.size[0]) / float(img_size_min) # expand min to size[0] + if np.ceil(im_scale * img_size_max) > self.size[1]: # expand max can't > size[1] + im_scale = float(self.size[1]) / float(img_size_max) + elif img_size_max > self.size[1]: + im_scale = float(self.size[1]) / float(img_size_max) + else: + im_scale = 1.0 + + new_h = int(int(h * im_scale/32)*32) + new_w = int(int(w * im_scale/32)*32) + # if new_h*new_w > 1600*1920: + # im_scale = 1600 / float(img_size_max) + # new_h = int(int(h * im_scale/32)*32) + # new_w = int(int(w * im_scale/32)*32) + image = cv2.resize(image, (new_w, new_h)) + scales = np.array([new_w / w, new_h / h]) + if polygons is not None: + for polygon in polygons: + polygon.points = polygon.points * scales + + return image, polygons + + +class ResizeLimitSquare(object): + def __init__(self, size=512, ratio=0.6): + self.size = size + self.ratio = ratio + self.SP = SquarePadding() + + def __call__(self, image, polygons=None): + if np.random.random() <= self.ratio: + image, polygons = self.SP(image, polygons) + h, w, _ = image.shape + image = cv2.resize(image, (self.size,self.size)) + scales = np.array([self.size*1.0/ w, self.size*1.0 / h]) + + if polygons is not None: + for polygon in polygons: + polygon.points = polygon.points * scales + + return image, polygons + + +class RandomResizePadding(object): + def __init__(self, size=512, random_scale=np.array([0.75, 1.0, 1.25,1.5,2.0]),stride=32, ratio=0.6667): + self.random_scale = random_scale + self.size = size + self.ratio=ratio + self.stride=stride + self.SP=SquarePadding() + + ###########Random size for different eproches ######################## + rd_scale = np.random.choice(self.random_scale) + step_num = round(np.random.normal(loc=0.0, scale=0.35) * 8) # step 按照高斯分布 + self.input_size = np.clip(int(self.size * rd_scale + step_num * self.stride), + (int(self.size * self.random_scale[0] - self.stride)), + int(self.size * self.random_scale[-1] + self.stride)) + ############################ end ######################## + + def __call__(self, image, polygons=None): + + if np.random.random() <= self.ratio: + image, polygons = self.SP(image, polygons) + h, w, _ = image.shape + image = cv2.resize(image, (self.input_size,self.input_size)) + scales = np.array([self.input_size*1.0/ w, self.input_size*1.0 / h]) + + if polygons is not None: + for polygon in polygons: + polygon.points = polygon.points * scales + + return image, polygons + +transform_type_dict = dict( + brightness=ImageEnhance.Brightness, contrast=ImageEnhance.Contrast, + sharpness=ImageEnhance.Sharpness, color=ImageEnhance.Color +) + + +class RandomDistortion(object): + def __init__(self, transform_dict, prob=0.5): + self.transforms = [(transform_type_dict[k], transform_dict[k]) for k in transform_dict] + self.prob = prob + + def __call__(self, img, target): + if random.random() > self.prob: + return img, target + out = Image.fromarray(img) + rand_num = np.random.uniform(0, 1, len(self.transforms)) + + for i, (transformer, alpha) in enumerate(self.transforms): + r = alpha * (rand_num[i] * 2.0 - 1.0) + 1 # r in [1-alpha, 1+alpha) + out = transformer(out).enhance(r) + + return np.array(out), target + + +class Augmentation(object): + def __init__(self, size, mean, std): + self.size = size + self.mean = mean + self.std = std + self._transform_dict = {'brightness': 0.5, 'contrast': 0.5, 'sharpness': 0.8386, 'color': 0.5} + self.augmentation = Compose([ + 
RandomCropFlip(), + RandomResizeScale(size=self.size, ratio=(3. / 8, 5. / 2)), + RandomResizedCrop(), + RotatePadding(up=60, colors=True), # pretrain on Syn is "up=30", else is "up=60" + ResizeLimitSquare(size=self.size), + RandomMirror(), + RandomDistortion(self._transform_dict), + Normalize(mean=self.mean, std=self.std), + ]) + + def __call__(self, image, polygons=None): + return self.augmentation(image, polygons) + + +class BaseTransform(object): + def __init__(self, size, mean, std): + self.size = size + self.mean = mean + self.std = std + self.augmentation = Compose([ + # Resize(size=640), + ResizeSquare(size=self.size), + Normalize(mean, std) + ]) + + def __call__(self, image, polygons=None): + return self.augmentation(image, polygons) + + +class BaseTransformNresize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + self.augmentation = Compose([ + Normalize(mean, std) + ]) + + def __call__(self, image, polygons=None): + return self.augmentation(image, polygons) diff --git a/IndicPhotoOCR/detection/textbpn/util/canvas.py b/IndicPhotoOCR/detection/textbpn/util/canvas.py new file mode 100644 index 0000000000000000000000000000000000000000..555da8fc3e47755371c41f8e0b160a901f25d68c --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/canvas.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +__author__ = '古溪' + +import numpy as np +import random +import matplotlib.pyplot as plt + + +def heatmap(im_gray): + cmap = plt.get_cmap('jet') + rgba_img = cmap(255 - im_gray) + Hmap = np.delete(rgba_img, 3, 2) + # print(Hmap.shape, Hmap.max(), Hmap.min()) + # cv2.imshow("heat_img", Hmap) + # cv2.waitKey(0) + return Hmap + + +def loss_ploy(loss_list, steps, period, name=""): + fig1, ax1 = plt.subplots(figsize=(16, 9)) + ax1.plot(range(steps // period), loss_list) + ax1.set_title("Average loss vs step*{}".format(period)) + ax1.set_xlabel("step*{}".format(period)) + ax1.set_ylabel("Current loss") + plt.savefig('{}@loss_vs_step*{}.png'.format(name,period)) + plt.clf() + + +def plt_ploys(ploys, period, name=""): + fig1, ax1 = plt.subplots(figsize=(16, 9)) + cnames = ['aliceblue','antiquewhite','aqua','aquamarine','azure', + 'blanchedalmond','blue','blueviolet','brown','burlywood', + 'coral','cornflowerblue','cornsilk','crimson','cyan', + 'darkblue','deeppink','deepskyblue','dodgerblue','forestgreen', + 'gold','goldenrod','green','greenyellow','honeydew','hotpink', + 'lawngreen','lightblue','lightgreen','lightpink','lightsalmon', + 'lightseagreen','lightsteelblue','lightyellow','lime','limegreen', + 'mediumseagreen','mediumspringgreen','midnightblue','orange','orangered', + 'pink','red','royalblue','seagreen','skyblue','springgreen','steelblue', + 'tan','teal','thistle','yellow','yellowgreen'] + + color = random.sample(cnames, len(ploys.keys())) + for ii, key in enumerate(ploys.keys()): + ax1.plot(range(1, len(ploys[key])+1), ploys[key],color=color[ii], label=key) + ax1.set_title("Loss Carve line") + ax1.set_xlabel("step*{}".format(period)) + ax1.set_ylabel("Current loss") + plt.legend(ploys.keys()) + plt.savefig('{}@loss_vs_step*{}.png'.format(name, period)) + plt.clf() + +if __name__ == '__main__': + # TODO ADD CODE + pass \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/util/detection.py b/IndicPhotoOCR/detection/textbpn/util/detection.py new file mode 100644 index 0000000000000000000000000000000000000000..0f2dd4adb8ce8aa2d507e43c31386d9abd8e70c8 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/detection.py @@ -0,0 +1,48 
@@ +# c++ version pse based on opencv 3+ +from pse import decode as pse_decode +from cfglib.config import config as cfg + + +class TextDetector(object): + + def __init__(self, model): + # evaluation mode + self.model = model + model.eval() + # parameter + self.scale = cfg.scale + self.threshold = cfg.threshold + + def detect(self, image, img_show): + # get model output + preds = self.model.forward(image) + preds, boxes, contours = pse_decode(preds[0], self.scale, self.threshold) + + output = { + 'image': image, + 'tr': preds, + 'bbox': boxes + } + return contours, output + + + + + + + + + + + + + + + + + + + + + + diff --git a/IndicPhotoOCR/detection/textbpn/util/eval.py b/IndicPhotoOCR/detection/textbpn/util/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..7b36cabb7b4f17276ca9f1b5d740354fab4d827c --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/eval.py @@ -0,0 +1,228 @@ +import os +import cv2 +import numpy as np +import subprocess +from cfglib.config import config as cfg +from util.misc import mkdirs + + +def osmkdir(out_dir): + import shutil + if os.path.exists(out_dir): + shutil.rmtree(out_dir) + os.makedirs(out_dir) + + +def analysize_result(source_dir, fid_path, outpt_dir, name): + + bad_txt = open("{}/eval.txt".format(outpt_dir), 'w') + all_eval = open("{}/{}/{}_eval.txt".format(cfg.output_dir, "Analysis", name), 'a+') + sel_list = list() + with open(fid_path) as f: + lines = f.read().split("\n") + for line in lines: + line_items = line.split(" ") + id = line_items[0] + precision = float(line_items[2].split('=')[-1]) + recall = float(line_items[4].split('=')[-1]) + if id != "ALL" and (precision < 0.5 or recall < 0.5): + img_path = os.path.join(source_dir, line_items[0].replace(".txt", ".jpg")) + if os.path.exists(img_path): + os.system('cp {} {}'.format(img_path, outpt_dir)) + sel_list.append((int(id.replace(".txt", "").replace("img", "").replace("_", "")), line)) + if id == "ALL": + all_eval.write("{} {} {}\n".format( + outpt_dir.split('/')[-1], + "{}/{}".format(cfg.dis_threshold, cfg.cls_threshold), + line)) + sel_list = sorted(sel_list, key=lambda its: its[0]) + bad_txt.write('\n'.join([its[1] for its in sel_list])) + all_eval.close() + bad_txt.close() + + +def deal_eval_total_text(debug=False): + # compute DetEval + eval_dir = os.path.join(cfg.output_dir, "Analysis", "output_eval") + if not os.path.exists(eval_dir): + os.makedirs(eval_dir) + + print('Computing DetEval in {}/{}'.format(cfg.output_dir, cfg.exp_name)) + subprocess.call( + ['python', 'dataset/total_text/Evaluation_Protocol/Python_scripts/Deteval.py', cfg.exp_name, '--tr', '0.7', + '--tp', '0.6']) + subprocess.call( + ['python', 'dataset/total_text/Evaluation_Protocol/Python_scripts/Deteval.py', cfg.exp_name, '--tr', '0.8', + '--tp', '0.4']) + + if debug: + source_dir = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name)) + outpt_dir_base = os.path.join(cfg.output_dir, "Analysis", "eval_view", "total_text") + if not os.path.exists(outpt_dir_base): + mkdirs(outpt_dir_base) + + outpt_dir1 = os.path.join(outpt_dir_base, "{}_{}_{}_{}_{}" + .format(cfg.test_size[0], cfg.test_size[1], cfg.checkepoch, 0.7, 0.6)) + osmkdir(outpt_dir1) + fid_path1 = '{}/Eval_TotalText_{}_{}.txt'.format(eval_dir, 0.7, 0.6) + + analysize_result(source_dir, fid_path1, outpt_dir1, "totalText") + + outpt_dir2 = os.path.join(outpt_dir_base, "{}_{}_{}_{}_{}" + .format(cfg.test_size[0], cfg.test_size[1], cfg.checkepoch, 0.8, 0.4)) + osmkdir(outpt_dir2) + fid_path2 = 
'{}/Eval_TotalText_{}_{}.txt'.format(eval_dir, 0.8, 0.4) + + analysize_result(source_dir, fid_path2, outpt_dir2, "totalText") + + print('End.') + + +def deal_eval_ctw1500(debug=False): + # compute DetEval + eval_dir = os.path.join(cfg.output_dir, "Analysis", "output_eval") + if not os.path.exists(eval_dir): + os.makedirs(eval_dir) + + print('Computing DetEval in {}/{}'.format(cfg.output_dir, cfg.exp_name)) + subprocess.call(['python', 'dataset/ctw1500/Evaluation_Protocol/ctw1500_eval.py', cfg.exp_name]) + + if debug: + source_dir = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name)) + outpt_dir_base = os.path.join(cfg.output_dir, "Analysis", "eval_view", "ctw1500") + if not os.path.exists(outpt_dir_base): + mkdirs(outpt_dir_base) + + outpt_dir = os.path.join(outpt_dir_base, "{}_{}_{}".format(cfg.test_size[0], cfg.test_size[1], cfg.checkepoch)) + osmkdir(outpt_dir) + fid_path1 = '{}/Eval_ctw1500_{}.txt'.format(eval_dir, 0.5) + + analysize_result(source_dir, fid_path1, outpt_dir, "ctw1500") + + print('End.') + + +def deal_eval_icdar15(debug=False): + # compute DetEval + eval_dir = os.path.join(cfg.output_dir, "Analysis", "output_eval") + if not os.path.exists(eval_dir): + os.makedirs(eval_dir) + + input_dir = 'output/{}'.format(cfg.exp_name) + father_path = os.path.abspath(input_dir) + print(father_path) + print('Computing DetEval in {}/{}'.format(cfg.output_dir, cfg.exp_name)) + subprocess.call(['sh', 'dataset/icdar15/eval.sh', father_path]) + + if debug: + source_dir = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name)) + outpt_dir_base = os.path.join(cfg.output_dir, "Analysis", "eval_view", "icdar15") + if not os.path.exists(outpt_dir_base): + mkdirs(outpt_dir_base) + + outpt_dir = os.path.join(outpt_dir_base, "{}_{}_{}".format(cfg.test_size[0], cfg.test_size[1], cfg.checkepoch)) + osmkdir(outpt_dir) + fid_path1 = '{}/Eval_icdar15.txt'.format(eval_dir) + + analysize_result(source_dir, fid_path1, outpt_dir, "icdar15") + + print('End.') + + pass + + +def deal_eval_TD500(debug=False): + # compute DetEval + eval_dir = os.path.join(cfg.output_dir, "Analysis", "output_eval") + if not os.path.exists(eval_dir): + os.makedirs(eval_dir) + + input_dir = 'output/{}'.format(cfg.exp_name) + father_path = os.path.abspath(input_dir) + print(father_path) + print('Computing DetEval in {}/{}'.format(cfg.output_dir, cfg.exp_name)) + subprocess.call(['sh', 'dataset/TD500/eval.sh', father_path]) + + if debug: + source_dir = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name)) + outpt_dir_base = os.path.join(cfg.output_dir, "Analysis", "eval_view", "TD500") + if not os.path.exists(outpt_dir_base): + mkdirs(outpt_dir_base) + + outpt_dir = os.path.join(outpt_dir_base, "{}_{}_{}".format(cfg.test_size[0], cfg.test_size[1], cfg.checkepoch)) + osmkdir(outpt_dir) + fid_path1 = '{}/Eval_TD500.txt'.format(eval_dir) + + analysize_result(source_dir, fid_path1, outpt_dir, "TD500") + + print('End.') + + +def data_transfer_ICDAR(contours): + cnts = list() + for cont in contours: + rect = cv2.minAreaRect(cont) + if min(rect[1][0], rect[1][1]) <= 5: + continue + points = cv2.boxPoints(rect) + points = np.int0(points) + # print(points.shape) + # points = np.reshape(points, (4, 2)) + cnts.append(points) + return cnts + + +def data_transfer_TD500(contours, res_file, img=None): + with open(res_file, 'w') as f: + for cont in contours: + rect = cv2.minAreaRect(cont) + if min(rect[1][0], rect[1][1]) <= 5: + continue + points = cv2.boxPoints(rect) + box = np.int0(points) + cv2.drawContours(img, [box], 0, (0, 255, 0), 
3) + + cx, cy = rect[0] + w_, h_ = rect[1] + angle = rect[2] + mid_ = 0 + if angle > 45: + angle = 90 - angle + mid_ = w_; + w_ = h_; + h_ = mid_ + elif angle < -45: + angle = 90 + angle + mid_ = w_; + w_ = h_; + h_ = mid_ + angle = angle / 180 * 3.141592653589 + + x_min = int(cx - w_ / 2) + x_max = int(cx + w_ / 2) + y_min = int(cy - h_ / 2) + y_max = int(cy + h_ / 2) + f.write('{},{},{},{},{}\r\n'.format(x_min, y_min, x_max, y_max, angle)) + + return img + + +def data_transfer_MLT2017(contours, res_file): + with open(res_file, 'w') as f: + for cont in contours: + rect = cv2.minAreaRect(cont) + if min(rect[1][0], rect[1][1]) <= 5: + continue + ploy_area = cv2.contourArea(cont) + rect_area = rect[1][0]*rect[1][1] + solidity = ploy_area/rect_area + width = rect[1][0] - np.clip(rect[1][0] * (1-np.sqrt(solidity)), 0, 6) + height = rect[1][1] - np.clip(rect[1][1] * (1-np.sqrt(solidity)), 0, 4) + points = cv2.boxPoints((rect[0], (width, height), rect[2])) + points = np.int0(points) + p = np.reshape(points, -1) + f.write('{},{},{},{},{},{},{},{},{}\r\n' + .format(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 1)) + + + diff --git a/IndicPhotoOCR/detection/textbpn/util/graph.py b/IndicPhotoOCR/detection/textbpn/util/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..c0175ad72d59bf414a1b902bc8b6c3bcf918e4d5 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/graph.py @@ -0,0 +1,309 @@ +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import + +import numpy as np +import time +from util.misc import norm2 + +class Data(object): + def __init__(self, name): + self.__name = name + self.__links = set() + + @property + def name(self): + return self.__name + + @property + def links(self): + return set(self.__links) + + def add_link(self, other, score): + self.__links.add(other) + other.__links.add(self) + + +def connected_components(nodes, score_dict, th): + ''' + conventional connected components searching + ''' + result = [] + nodes = set(nodes) + while nodes: + n = nodes.pop() + group = {n} + queue = [n] + while queue: + n = queue.pop(0) + if th is not None: + neighbors = {l for l in n.links if score_dict[tuple(sorted([n.name, l.name]))] >= th} + else: + neighbors = n.links + neighbors.difference_update(group) + nodes.difference_update(neighbors) + group.update(neighbors) + queue.extend(neighbors) + result.append(group) + return result + + +def connected_components_constraint(nodes, max_sz, score_dict=None, th=None): + ''' + only use edges whose scores are above `th` + if a component is larger than `max_sz`, all the nodes in this component are added into `remain` and returned for next iteration. + ''' + result = [] + remain = set() + nodes = set(nodes) + while nodes: + n = nodes.pop() + group = {n} + queue = [n] + valid = True + while queue: + n = queue.pop(0) + if th is not None: + neighbors = {l for l in n.links if score_dict[tuple(sorted([n.name, l.name]))] >= th} + else: + neighbors = n.links + neighbors.difference_update(group) + nodes.difference_update(neighbors) + group.update(neighbors) + queue.extend(neighbors) + if len(group) > max_sz or len(remain.intersection(neighbors)) > 0: + # if this group is larger than `max_sz`, add the nodes into `remain` + valid = False + remain.update(group) + break + if valid: # if this group is smaller than or equal to `max_sz`, finalize it. 
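For reference, a minimal sketch of the `Data` node API and `connected_components` search defined in `util/graph.py` above. It assumes the textbpn package layout resolves so that `util.graph` and the dependencies it pulls in through `util.misc` (OpenCV, Shapely, the cfglib config) import cleanly; the node names, edge scores, and threshold are illustrative only.

```python
# Illustrative only: one strong edge (0-1) and one weak edge (1-2).
from util.graph import Data, connected_components

a, b, c = Data(0), Data(1), Data(2)
a.add_link(b, 0.9)
b.add_link(c, 0.2)

# score lookup keyed by the sorted name pair, as the search functions expect
scores = {(0, 1): 0.9, (1, 2): 0.2}

groups = connected_components([a, b, c], scores, th=0.5)
print([sorted(n.name for n in g) for g in groups])   # e.g. [[0, 1], [2]]
```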
+ result.append(group) + return result, remain + + +def graph_propagation_naive(edges, score, th, bboxs=None, dis_thresh=50, pool='avg'): + + edges = np.sort(edges, axis=1) + + score_dict = {} # score lookup table + if pool is None: + for i, e in enumerate(edges): + score_dict[e[0], e[1]] = score[i] + elif pool == 'avg': + for i, e in enumerate(edges): + if bboxs is not None: + box1 = bboxs[e[0]][:8].reshape(4, 2) + box2 = bboxs[e[1]][:8].reshape(4, 2) + c1 = np.mean(box1, 0); c2 = np.mean(box2, 0) + dst = norm2(c1 - c2) + if dst > dis_thresh: + score[i] = 0 + if (e[0], e[1]) in score_dict: + score_dict[e[0], e[1]] = 0.5 * (score_dict[e[0], e[1]] + score[i]) + else: + score_dict[e[0], e[1]] = score[i] + + elif pool == 'max': + for i, e in enumerate(edges): + if (e[0], e[1]) in score_dict: + score_dict[e[0], e[1]] = max(score_dict[e[0], e[1]], score[i]) + else: + score_dict[e[0], e[1]] = score[i] + else: + raise ValueError('Pooling operation not supported') + + nodes = np.sort(np.unique(edges.flatten())) + mapping = -1 * np.ones((nodes.max()+1), dtype=np.int) + mapping[nodes] = np.arange(nodes.shape[0]) + link_idx = mapping[edges] + vertex = [Data(n) for n in nodes] + for l, s in zip(link_idx, score): + vertex[l[0]].add_link(vertex[l[1]], s) + + # first iteration + comps = connected_components(vertex, score_dict,th) + + return comps + + +def graph_search(edges, scores, edges_num, th=None): + # graph search + scores = scores.reshape((-1, edges_num)) + select_index = np.argsort(scores, axis=1)[:, -2:] + edges = np.sort(edges, axis=1).reshape((-1, edges_num, 2)) + + score_dict = {} + for i, ips in enumerate(select_index): + edg = edges[i] + si = scores[i] + for j, idx in enumerate(ips): + e = edg[idx, :] + if (e[0], e[1]) in score_dict: + score_dict[e[0], e[1]] = 0.5 * (score_dict[e[0], e[1]] + si[j]) + else: + score_dict[e[0], e[1]] = si[j] + + nodes = np.sort(np.unique(edges.flatten())) + vertex = [Data(n) for n in nodes] + for (key, value) in score_dict.items(): + vertex[key[0]].add_link(vertex[key[1]], value) + + comps = connected_components(vertex, score_dict, th) + + return comps + + +def graph_propagation(edges, score, max_sz, step=0.1, beg_th=0.5, pool=None): + + edges = np.sort(edges, axis=1) + th = score.min() + # th = beg_th + # construct graph + score_dict = {} # score lookup table + if pool is None: + for i,e in enumerate(edges): + score_dict[e[0], e[1]] = score[i] + elif pool == 'avg': + for i,e in enumerate(edges): + if (e[0], e[1]) in score_dict: + score_dict[e[0], e[1]] = 0.5*(score_dict[e[0], e[1]] + score[i]) + else: + score_dict[e[0], e[1]] = score[i] + + elif pool == 'max': + for i,e in enumerate(edges): + if (e[0],e[1]) in score_dict: + score_dict[e[0], e[1]] = max(score_dict[e[0], e[1]] , score[i]) + else: + score_dict[e[0], e[1]] = score[i] + else: + raise ValueError('Pooling operation not supported') + + nodes = np.sort(np.unique(edges.flatten())) + mapping = -1 * np.ones((nodes.max()+1), dtype=np.int) + mapping[nodes] = np.arange(nodes.shape[0]) + link_idx = mapping[edges] + vertex = [Data(n) for n in nodes] + for l, s in zip(link_idx, score): + vertex[l[0]].add_link(vertex[l[1]], s) + + # first iteration + comps, remain = connected_components_constraint(vertex, max_sz) + + # iteration + components = comps[:] + while remain: + th = th + (1 - th) * step + comps, remain = connected_components_constraint(remain, max_sz, score_dict, th) + components.extend(comps) + return components + + +def graph_propagation_soft(edges, score, max_sz, step=0.1, **kwargs): + + edges = 
np.sort(edges, axis=1) + th = score.min() + + # construct graph + score_dict = {} # score lookup table + for i,e in enumerate(edges): + score_dict[e[0], e[1]] = score[i] + + nodes = np.sort(np.unique(edges.flatten())) + mapping = -1 * np.ones((nodes.max()+1), dtype=np.int) + mapping[nodes] = np.arange(nodes.shape[0]) + link_idx = mapping[edges] + vertex = [Data(n) for n in nodes] + for l, s in zip(link_idx, score): + vertex[l[0]].add_link(vertex[l[1]], s) + + # first iteration + comps, remain = connected_components_constraint(vertex, max_sz) + first_vertex_idx = np.array([mapping[n.name] for c in comps for n in c]) + fusion_vertex_idx = np.setdiff1d(np.arange(nodes.shape[0]), first_vertex_idx, assume_unique=True) + # iteration + components = comps[:] + while remain: + th = th + (1 - th) * step + comps, remain = connected_components_constraint(remain, max_sz, score_dict, th) + components.extend(comps) + label_dict = {} + for i,c in enumerate(components): + for n in c: + label_dict[n.name] = i + print('Propagation ...') + prop_vertex = [vertex[idx] for idx in fusion_vertex_idx] + label, label_fusion = diffusion(prop_vertex, label_dict, score_dict, **kwargs) + return label, label_fusion + + +def diffusion(vertex, label, score_dict, max_depth=5, weight_decay=0.6, normalize=True): + class BFSNode(): + def __init__(self, node, depth, value): + self.node = node + self.depth = depth + self.value = value + + label_fusion = {} + for name in label.keys(): + label_fusion[name] = {label[name]: 1.0} + prog = 0 + prog_step = len(vertex) // 20 + start = time.time() + for root in vertex: + if prog % prog_step == 0: + print("progress: {} / {}, elapsed time: {}".format(prog, len(vertex), time.time() - start)) + prog += 1 + #queue = {[root, 0, 1.0]} + queue = {BFSNode(root, 0, 1.0)} + visited = [root.name] + root_label = label[root.name] + while queue: + curr = queue.pop() + if curr.depth >= max_depth: # pruning + continue + neighbors = curr.node.links + tmp_value = [] + tmp_neighbor = [] + for n in neighbors: + if n.name not in visited: + sub_value = score_dict[tuple(sorted([curr.node.name, n.name]))] * weight_decay * curr.value + tmp_value.append(sub_value) + tmp_neighbor.append(n) + if root_label not in label_fusion[n.name].keys(): + label_fusion[n.name][root_label] = sub_value + else: + label_fusion[n.name][root_label] += sub_value + visited.append(n.name) + #queue.add([n, curr.depth+1, sub_value]) + sortidx = np.argsort(tmp_value)[::-1] + for si in sortidx: + queue.add(BFSNode(tmp_neighbor[si], curr.depth+1, tmp_value[si])) + if normalize: + for name in label_fusion.keys(): + summ = sum(label_fusion[name].values()) + for k in label_fusion[name].keys(): + label_fusion[name][k] /= summ + return label, label_fusion + + +def clusters2labels(clusters, n_nodes): + labels = (-1)* np.ones((n_nodes,)) + for ci, c in enumerate(clusters): + for xid in c: + labels[xid.name] = ci + assert np.sum(labels < 0) < 1 + return labels + + +def single_remove(bbox, pred): + single_idcs = np.zeros_like(pred) + pred_unique = np.unique(pred) + for u in pred_unique: + idcs = pred == u + if np.sum(idcs) == 1: + single_idcs[np.where(idcs)[0][0]] = 1 + remain_idcs = [i for i in range(len(pred)) if not single_idcs[i]] + remain_idcs = np.asarray(remain_idcs) + return bbox[remain_idcs, :], pred[remain_idcs] + diff --git a/IndicPhotoOCR/detection/textbpn/util/io.py b/IndicPhotoOCR/detection/textbpn/util/io.py new file mode 100644 index 0000000000000000000000000000000000000000..fdcdadc7b802e622d6079fc679d87e134541e0bf --- /dev/null +++ 
b/IndicPhotoOCR/detection/textbpn/util/io.py @@ -0,0 +1,233 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan + +Tool functions for file system operation and I/O. +In the style of linux shell commands +''' +import os +import pickle as pkl +import subprocess +import logging +from . import strs, io + + +def mkdir(path): + """ + If the target directory does not exists, it and its parent directories will created. + """ + path = get_absolute_path(path) + if not exists(path): + os.makedirs(path) + return path + +def make_parent_dir(path): + """make the parent directories for a file.""" + parent_dir = get_dir(path) + mkdir(parent_dir) + + +def pwd(): + return os.getcwd() + +def dump(path, obj): + path = get_absolute_path(path) + parent_path = get_dir(path) + mkdir(parent_path) + with open(path, 'w') as f: + logging.info('dumping file:' + path); + pkl.dump(obj, f) + +def load(path): + path = get_absolute_path(path) + with open(path, 'r') as f: + data = pkl.load(f) + return data + +def join_path(a, *p): + return os.path.join(a, *p) + +def is_dir(path): + path = get_absolute_path(path) + return os.path.isdir(path) + +is_directory = is_dir + +def is_path(path): + path = get_absolute_path(path) + return os.path.ispath(path) + +def get_dir(path): + ''' + return the directory it belongs to. + if path is a directory itself, itself will be return + ''' + path = get_absolute_path(path) + if is_dir(path): + return path; + return os.path.split(path)[0] + +def get_parent_dir(path): + current_dir = get_dir(path) + return get_absolute_path(join_path(current_dir, '..')) + +def get_filename(path): + return os.path.split(path)[1] + +def get_absolute_path(p): + if p.startswith('~'): + p = os.path.expanduser(p) + return os.path.abspath(p) + +def cd(p): + p = get_absolute_path(p) + os.chdir(p) + +def ls(path = '.', suffix = None): + """ + list files in a directory. + return file names in a list + """ + path = get_absolute_path(path) + files = os.listdir(path) + + if suffix is None: + return files + + filtered = [] + for f in files: + if string.ends_with(f, suffix, ignore_case = True): + filtered.append(f) + + return filtered + +def find_files(pattern): + import glob + return glob.glob(pattern) + +def read_lines(p): + """return the text in a file in lines as a list """ + p = get_absolute_path(p) + f = open(p,'r') + return f.readlines() + +def write_lines(p, lines, append_break = False): + p = get_absolute_path(p) + make_parent_dir(p) + with open(p, 'w') as f: + for line in lines: + if append_break: + f.write(line + '\n') + else: + f.write(line) + +def cat(p): + """return the text in a file as a whole""" + cmd = 'cat ' + p + return subprocess.getoutput(cmd) + +def exists(path): + path = get_absolute_path(path) + return os.path.exists(path) + +def not_exists(path): + return not exists(path) + +def load_mat(path): + import scipy.io as sio # type: ignore + path = get_absolute_path(path) + return sio.loadmat(path) + +def dump_mat(path, dict_obj, append = True): + import scipy.io as sio # type: ignore + path = get_absolute_path(path) + make_parent_dir(path) + sio.savemat(file_name = path, mdict = dict_obj, appendmat = append) + +def dir_mat(path): + ''' + list the variables in mat file. + return a list: [(name, shape, dtype), ...] 
+ ''' + import scipy.io as sio # type: ignore + path = get_absolute_path(path) + return sio.whosmat(path) + +SIZE_UNIT_K = 1024 +SIZE_UNIT_M = SIZE_UNIT_K ** 2 +SIZE_UNIT_G = SIZE_UNIT_K ** 3 +def get_file_size(path, unit = SIZE_UNIT_K): + size = os.path.getsize(get_absolute_path(path)) + return size * 1.0 / unit + + +def create_h5(path): + import h5py # type: ignore + path = get_absolute_path(path) + make_parent_dir(path) + return h5py.File(path, 'w'); + +def open_h5(path, mode = 'r'): + import h5py + path = get_absolute_path(path) + return h5py.File(path, mode); + +def read_h5(h5, key): + return h5[key][:] +def read_h5_attrs(h5, key, attrs): + return h5[key].attrs[attrs] + +def copy(src, dest): + io.make_parent_dir(dest) + import shutil + shutil.copy(get_absolute_path(src), get_absolute_path(dest)) + +cp = copy + +def remove(p): + import os + os.remove(get_absolute_path(p)) +rm = remove + +def search(pattern, path, file_only = True): + """ + Search files whose name matches the give pattern. The search scope + is the directory and sub-directories of 'path'. + """ + path = get_absolute_path(path) + pattern_here = io.join_path(path, pattern) + targets = [] + + # find matchings in current directory + candidates = find_files(pattern_here) + for can in candidates: + if io.is_dir(can) and file_only: + continue + else: + targets.append(can) + + # find matching in sub-dirs + files = ls(path) + for f in files: + fpath = io.join_path(path, f) + if is_dir(fpath): + targets_in_sub_dir = search(pattern, fpath, file_only) + targets.extend(targets_in_sub_dir) + return targets + +def dump_json(path, data): + import ujson as json + path = get_absolute_path(path) + make_parent_dir(path) + + with open(path, 'w') as f: + json.dump(data, f) + return path + +def load_json(path): + import ujson as json + path = get_absolute_path(path) + with open(path, 'r') as f: + return json.load(f) diff --git a/IndicPhotoOCR/detection/textbpn/util/logging.py b/IndicPhotoOCR/detection/textbpn/util/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..e928a4ac25de0531d0542e2de5b2eaf686a45261 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/logging.py @@ -0,0 +1,129 @@ +from __future__ import absolute_import +import os +import sys +import numpy as np +import tensorflow as tf +import scipy.misc +try: + from StringIO import StringIO # Python 2.7 +except ImportError: + from io import BytesIO # Python 3.x + +from .osutils import mkdir_if_missing + +from config import get_args +global_args = get_args(sys.argv[1:]) + +if global_args.run_on_remote: + import moxing as mox + mox.file.shift("os", "mox") + +class Logger(object): + def __init__(self, fpath=None): + self.console = sys.stdout + self.file = None + if fpath is not None: + if global_args.run_on_remote: + dir_name = os.path.dirname(fpath) + if not mox.file.exists(dir_name): + mox.file.make_dirs(dir_name) + print('=> making dir ', dir_name) + self.file = mox.file.File(fpath, 'w') + # self.file = open(fpath, 'w') + else: + mkdir_if_missing(os.path.dirname(fpath)) + self.file = open(fpath, 'w') + + def __del__(self): + self.close() + + def __enter__(self): + pass + + def __exit__(self, *args): + self.close() + + def write(self, msg): + self.console.write(msg) + if self.file is not None: + self.file.write(msg) + + def flush(self): + self.console.flush() + if self.file is not None: + self.file.flush() + os.fsync(self.file.fileno()) + + def close(self): + self.console.close() + if self.file is not None: + self.file.close() + + +class TFLogger(object): 
+ def __init__(self, log_dir=None): + """Create a summary writer logging to log_dir.""" + if log_dir is not None: + mkdir_if_missing(log_dir) + self.writer = tf.summary.FileWriter(log_dir) + + def scalar_summary(self, tag, value, step): + """Log a scalar variable.""" + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) + self.writer.add_summary(summary, step) + self.writer.flush() + + def image_summary(self, tag, images, step): + """Log a list of images.""" + + img_summaries = [] + for i, img in enumerate(images): + # Write the image to a string + try: + s = StringIO() + except: + s = BytesIO() + scipy.misc.toimage(img).save(s, format="png") + + # Create an Image object + img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), + height=img.shape[0], + width=img.shape[1]) + # Create a Summary value + img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) + + # Create and write Summary + summary = tf.Summary(value=img_summaries) + self.writer.add_summary(summary, step) + self.writer.flush() + + def histo_summary(self, tag, values, step, bins=1000): + """Log a histogram of the tensor of values.""" + + # Create a histogram using numpy + counts, bin_edges = np.histogram(values, bins=bins) + + # Fill the fields of the histogram proto + hist = tf.HistogramProto() + hist.min = float(np.min(values)) + hist.max = float(np.max(values)) + hist.num = int(np.prod(values.shape)) + hist.sum = float(np.sum(values)) + hist.sum_squares = float(np.sum(values**2)) + + # Drop the start of the first bin + bin_edges = bin_edges[1:] + + # Add bin edges and counts + for edge in bin_edges: + hist.bucket_limit.append(edge) + for c in counts: + hist.bucket.append(c) + + # Create and write Summary + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) + self.writer.add_summary(summary, step) + self.writer.flush() + + def close(self): + self.writer.close() \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/util/meters.py b/IndicPhotoOCR/detection/textbpn/util/meters.py new file mode 100644 index 0000000000000000000000000000000000000000..7b98c6fd2d3260be44b1bd2fdda28d6e75979952 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/meters.py @@ -0,0 +1,23 @@ +from __future__ import absolute_import + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/util/misc.py b/IndicPhotoOCR/detection/textbpn/util/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..4231e2b2bbb6f6214082f0a4ee8333588b268e06 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/misc.py @@ -0,0 +1,408 @@ +import numpy as np +import errno +import os +import cv2 +import math +from shapely.geometry import Polygon +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg +from scipy import ndimage as ndimg + +def to_device(*tensors): + if len(tensors) < 2: + return tensors[0].to(cfg.device, non_blocking=True) + return (t.to(cfg.device, non_blocking=True) for t in tensors) + + +def mkdirs(newdir): + """ + make directory with parent path + :param newdir: target path + """ + try: + if not os.path.exists(newdir): + 
os.makedirs(newdir) + except OSError as err: + # Reraise the error unless it's about an already existing directory + if err.errno != errno.EEXIST or not os.path.isdir(newdir): + raise + + +def rescale_result(image, bbox_contours, H, W): + ori_H, ori_W = image.shape[:2] + image = cv2.resize(image, (W, H)) + contours = list() + for cont in bbox_contours: + # if cv2.contourArea(cont) < 300: + # continue + cont[:, 0] = (cont[:, 0] * W / ori_W).astype(int) + cont[:, 1] = (cont[:, 1] * H / ori_H).astype(int) + contours.append(cont) + return image, contours + + +def fill_hole(input_mask): + h, w = input_mask.shape + canvas = np.zeros((h + 2, w + 2), np.uint8) + canvas[1:h + 1, 1:w + 1] = input_mask.copy() + + mask = np.zeros((h + 4, w + 4), np.uint8) + + cv2.floodFill(canvas, mask, (0, 0), 1) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + + return (~canvas | input_mask.astype(np.uint8)) + + +def regularize_sin_cos(sin, cos): + # regularization + scale = np.sqrt(1.0 / (sin ** 2 + cos ** 2)) + return sin * scale, cos * scale + + +def gaussian2D(shape, sigma=1): + m, n = [(ss - 1.) / 2. for ss in shape] + y, x = np.ogrid[-m:m + 1, -n:n + 1] + + h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def draw_gaussian(heatmap, center, radius, k=1, delte=6): + diameter = 2 * radius + 1 + gaussian = gaussian2D((diameter, diameter), sigma=diameter / delte) + + x, y = center + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right] + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + + +def gaussian_radius(det_size, min_overlap=0.7): + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + return min(r1, r2, r3) + + +def point_dist_to_line(line, p3): + # 计算点到直线的距离 + # line = (p1, p2) + # compute the distance from p3 to p1-p2 #cross(x,y)矩阵的叉积,norm()求范数 + # np.linalg.norm(np.cross(p2 - p1, p1 - p3)) * 1.0 / np.linalg.norm(p2 - p1) + # compute the distance from p3 to p1-p2 + p1, p2 = line + d = p2 - p1 + + def l2(p): + return math.sqrt(p[0] * p[0]+ p[1]*p[1]) + + if l2(d) > 0: + distance = abs(d[1] * p3[0] - d[0] * p3[1] + p2[0] * p1[1] - p2[1] * p1[0]) / l2(d) + else: + distance = math.sqrt((p3[0]-p2[0])**2 + (p3[1]-p2[1])**2) + + return distance + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def norm2(x, axis=None): + if axis: + return np.sqrt(np.sum(x ** 2, axis=axis)) + return np.sqrt(np.sum(x ** 2)) + + +def cos(p1, p2): + return (p1 * p2).sum() / (norm2(p1) * norm2(p2)) + + +def vector_sin(v): + assert len(v) == 2 + # sin = y / (sqrt(x^2 + 
y^2)) + l = np.sqrt(v[0] ** 2 + v[1] ** 2) + 1e-5 + return v[1] / l + + +def vector_cos(v): + assert len(v) == 2 + # cos = x / (sqrt(x^2 + y^2)) + l = np.sqrt(v[0] ** 2 + v[1] ** 2) + 1e-5 + return v[0] / l + + +def find_bottom(pts): + + if len(pts) > 4: + e = np.concatenate([pts, pts[:3]]) + candidate = [] + for i in range(1, len(pts) + 1): + v_prev = e[i] - e[i - 1] + v_next = e[i + 2] - e[i + 1] + if cos(v_prev, v_next) < -0.875: + candidate.append((i % len(pts), (i + 1) % len(pts), norm2(e[i] - e[i + 1]))) + + if len(candidate) != 2 or candidate[0][0] == candidate[1][1] or candidate[0][1] == candidate[1][0]: + # if candidate number < 2, or two bottom are joined, select 2 farthest edge + mid_list = [] + dist_list = [] + if len(candidate) > 2: + + bottom_idx = np.argsort([angle for s1, s2, angle in candidate])[0:2] + bottoms = [candidate[bottom_idx[0]][:2], candidate[bottom_idx[1]][0:2]] + long_edge1, long_edge2 = find_long_edges(pts, bottoms) + edge_length1 = [norm2(pts[e1] - pts[e2]) for e1, e2 in long_edge1] + edge_length2 = [norm2(pts[e1] - pts[e2]) for e1, e2 in long_edge2] + l1 = sum(edge_length1) + l2 = sum(edge_length2) + len1 = len(edge_length1) + len2 = len(edge_length2) + + if l1 > 2*l2 or l2 > 2*l1 or len1 == 0 or len2 == 0: + for i in range(len(pts)): + mid_point = (e[i] + e[(i + 1) % len(pts)]) / 2 + mid_list.append((i, (i + 1) % len(pts), mid_point)) + + for i in range(len(pts)): + for j in range(len(pts)): + s1, e1, mid1 = mid_list[i] + s2, e2, mid2 = mid_list[j] + dist = norm2(mid1 - mid2) + dist_list.append((s1, e1, s2, e2, dist)) + bottom_idx = np.argsort([dist for s1, e1, s2, e2, dist in dist_list])[-1] + bottoms = [dist_list[bottom_idx][:2], dist_list[bottom_idx][2:4]] + else: + mid_list = [] + for i in range(len(pts)): + mid_point = (e[i] + e[(i + 1) % len(pts)]) / 2 + mid_list.append((i, (i + 1) % len(pts), mid_point)) + + dist_list = [] + for i in range(len(pts)): + for j in range(len(pts)): + s1, e1, mid1 = mid_list[i] + s2, e2, mid2 = mid_list[j] + dist = norm2(mid1 - mid2) + dist_list.append((s1, e1, s2, e2, dist)) + bottom_idx = np.argsort([dist for s1, e1, s2, e2, dist in dist_list])[-2:] + bottoms = [dist_list[bottom_idx[0]][:2], dist_list[bottom_idx[1]][:2]] + else: + bottoms = [candidate[0][:2], candidate[1][:2]] + else: + d1 = norm2(pts[1] - pts[0]) + norm2(pts[2] - pts[3]) + d2 = norm2(pts[2] - pts[1]) + norm2(pts[0] - pts[3]) + bottoms = [(0, 1), (2, 3)] if d1 < d2 else [(1, 2), (3, 0)] + # bottoms = [(0, 1), (2, 3)] if 2 * d1 < d2 and d1 > 32 else [(1, 2), (3, 0)] + assert len(bottoms) == 2, 'fewer than 2 bottoms' + return bottoms + + +def split_long_edges(points, bottoms): + """ + Find two long edge sequence of and polygon + """ + b1_start, b1_end = bottoms[0] + b2_start, b2_end = bottoms[1] + n_pts = len(points) + + i = b1_end + 1 + long_edge_1 = [] + while i % n_pts != b2_end: + long_edge_1.append((i - 1, i)) + i = (i + 1) % n_pts + + i = b2_end + 1 + long_edge_2 = [] + while i % n_pts != b1_end: + long_edge_2.append((i - 1, i)) + i = (i + 1) % n_pts + return long_edge_1, long_edge_2 + + +def find_long_edges(points, bottoms): + b1_start, b1_end = bottoms[0] + b2_start, b2_end = bottoms[1] + n_pts = len(points) + i = (b1_end + 1) % n_pts + long_edge_1 = [] + + while i % n_pts != b2_end: + start = (i - 1) % n_pts + end = i % n_pts + long_edge_1.append((start, end)) + i = (i + 1) % n_pts + + i = (b2_end + 1) % n_pts + long_edge_2 = [] + while i % n_pts != b1_end: + start = (i - 1) % n_pts + end = i % n_pts + long_edge_2.append((start, end)) + i = (i + 
1) % n_pts + return long_edge_1, long_edge_2 + + +def split_edge_seqence(points, n_parts): + pts_num = points.shape[0] + long_edge = [(i, (i + 1) % pts_num) for i in range(pts_num)] + edge_length = [norm2(points[e1] - points[e2]) for e1, e2 in long_edge] + point_cumsum = np.cumsum([0] + edge_length) + total_length = sum(edge_length) + length_per_part = total_length / n_parts + + cur_node = 0 # first point + splited_result = [] + + for i in range(1, n_parts): + cur_end = i * length_per_part + + while cur_end > point_cumsum[cur_node + 1]: + cur_node += 1 + + e1, e2 = long_edge[cur_node] + e1, e2 = points[e1], points[e2] + + # start_point = points[long_edge[cur_node]] + end_shift = cur_end - point_cumsum[cur_node] + ratio = end_shift / edge_length[cur_node] + new_point = e1 + ratio * (e2 - e1) + # print(cur_end, point_cumsum[cur_node], end_shift, edge_length[cur_node], '=', new_point) + splited_result.append(new_point) + + # add first and last point + p_first = points[long_edge[0][0]] + p_last = points[long_edge[-1][1]] + splited_result = [p_first] + splited_result + [p_last] + return np.stack(splited_result) + + +def split_edge_seqence_with_cell_division(points, n_parts): + points_seq = list(points) + pts_num = len(points_seq) + + if pts_num <= n_parts: + long_edge = [(i, (i + 1) % pts_num) for i in range(pts_num)] + edge_length = [int(norm2(points[e1] - points[e2])) for e1, e2 in long_edge] + while pts_num < n_parts: + e = np.argmax(np.array(edge_length)) + new_pts = (points_seq[e] + points_seq[(e+1) % pts_num])*0.5 + points_seq.insert(e+1, new_pts) + d = int(0.5 * (edge_length[e]-1)) + edge_length[e] = d + edge_length.insert(e+1, d) + pts_num = len(points_seq) + else: + pass + + return np.stack(points_seq).astype(int) + + +def split_edge_seqence_by_step(points, long_edge1, long_edge2, step=16.0): + + edge_length1 = [norm2(points[e1] - points[e2]) for e1, e2 in long_edge1] + edge_length2 = [norm2(points[e1] - points[e2]) for e1, e2 in long_edge2] + # 取长边 计算bbox个数 + total_length = (sum(edge_length1)+sum(edge_length2))/2 + n_parts = math.ceil(float(total_length) / step) + try: + inner1 = split_edge_seqence(points, long_edge1, n_parts=n_parts) + inner2 = split_edge_seqence(points, long_edge2, n_parts=n_parts) + except: + print(edge_length1) + print(edge_length2) + + return inner1, inner2 + + +def disjoint_find(x, F): + if F[x] == x: + return x + F[x] = disjoint_find(F[x], F) + return F[x] + + +def disjoint_merge(x, y, F): + x = disjoint_find(x, F) + y = disjoint_find(y, F) + if x == y: + return False + F[y] = x + return True + + +def merge_polygons(polygons, merge_map): + + def merge_two_polygon(p1, p2): + p2 = Polygon(p2) + merged = p1.union(p2) + return merged + + merge_map = [disjoint_find(x, merge_map) for x in range(len(merge_map))] + merge_map = np.array(merge_map) + final_polygons = [] + + for i in np.unique(merge_map): + merge_idx = np.where(merge_map == i)[0] + if len(merge_idx) > 0: + merged = Polygon(polygons[merge_idx[0]]) + for j in range(1, len(merge_idx)): + merged = merge_two_polygon(merged, polygons[merge_idx[j]]) + x, y = merged.exterior.coords.xy + final_polygons.append(np.stack([x, y], axis=1).astype(int)) + + return final_polygons + + +def get_sample_point(text_mask, num_points, approx_factor, scales=None): + # get sample point in contours + contours, _ = cv2.findContours(text_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + epsilon = approx_factor * cv2.arcLength(contours[0], True) + approx = cv2.approxPolyDP(contours[0], epsilon, True).reshape((-1, 
2)) + # approx = contours[0].reshape((-1, 2)) + if scales is None: + ctrl_points = split_edge_seqence(approx, num_points) + else: + ctrl_points = split_edge_seqence(approx*scales, num_points) + ctrl_points = np.array(ctrl_points[:num_points, :]).astype(np.int32) + + return ctrl_points + + diff --git a/IndicPhotoOCR/detection/textbpn/util/pbox.py b/IndicPhotoOCR/detection/textbpn/util/pbox.py new file mode 100644 index 0000000000000000000000000000000000000000..b43e1cbcef07f70086857e89fadb52bea957f036 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/pbox.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +__author__ = '古溪' + +import numpy as np +from typing import List + + +def functools_reduce(a): + # 使用functools內建模块 + import functools + import operator + return functools.reduce(operator.concat, a) + + +def minConnectPath(list_all: List[list]): + list_nodo = list_all.copy() + res = [] + ept = [0, 0] + + def norm2(a, b): + """计算两点之间的距离""" + return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5 + + dict00 = {} # 格式 {距离,(起点坐标,终点坐标)} + dict11 = {} # 格式 {距离,(起点坐标,终点坐标)} + # 放入一个初始值 + ept[0] = list_nodo[0] # left end point + ept[1] = list_nodo[0] # right end point + list_nodo.remove(list_nodo[0]) + while list_nodo: + for i in list_nodo: # i 待处理的 + length0 = norm2(i, ept[0]) # 端点0终点距离 + dict00[length0] = [i, ept[0]] + length1 = norm2(ept[1], i) # 端点0终点距离 + dict11[length1] = [ept[1], i] + key0 = min(dict00.keys()) + key1 = min(dict11.keys()) + + if key0 <= key1: + ss = dict00[key0][0] + ee = dict00[key0][1] + res.insert(0, [list_all.index(ss), list_all.index(ee)]) + list_nodo.remove(ss) + ept[0] = ss + else: + ss = dict11[key1][0] + ee = dict11[key1][1] + res.append([list_all.index(ss), list_all.index(ee)]) + list_nodo.remove(ee) + ept[1] = ee + + dict00 = {} + dict11 = {} + + path = functools_reduce(res) + path = sorted(set(path), key=path.index) # 去重 + + return res, path + + +def bbox_transfor_inv(radius_map, sin_map, cos_map, score_map, wclip=(2, 8), expend=1.0): + xy_text = np.argwhere(score_map > 0) + # sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 0])] + origin = xy_text + radius = radius_map[xy_text[:, 0], xy_text[:, 1], :] + sin = sin_map[xy_text[:, 0], xy_text[:, 1]] + cos = cos_map[xy_text[:, 0], xy_text[:, 1]] + dtx = radius[:, 0] * cos * expend + dty = radius[:, 0] * sin * expend + ddx = radius[:, 1] * cos * expend + ddy = radius[:, 1] * sin * expend + topp = origin + np.stack([dty, dtx], axis=-1) + botp = origin - np.stack([ddy, ddx], axis=-1) + width = (radius[:, 0] + radius[:, 1]) // 3 + width = np.clip(width, wclip[0], wclip[1]) + + top1 = topp - np.stack([width * cos, -width * sin], axis=-1) + top2 = topp + np.stack([width * cos, -width * sin], axis=-1) + bot1 = botp - np.stack([width * cos, -width * sin], axis=-1) + bot2 = botp + np.stack([width * cos, -width * sin], axis=-1) + + bbox = np.stack([top1, top2, bot2, bot1], axis=1)[:, :, ::-1] + bboxs = np.zeros((bbox.shape[0], 9), dtype=np.float32) + bboxs[:, :8] = bbox.reshape((-1, 8)) + bboxs[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]] + + return bboxs + + diff --git a/IndicPhotoOCR/detection/textbpn/util/serialization.py b/IndicPhotoOCR/detection/textbpn/util/serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..8231f0139619b4fb9ef74382de46a3ac0663f59b --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/serialization.py @@ -0,0 +1,89 @@ +from __future__ import print_function, absolute_import +import json +import os +import sys +# 
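As a quick illustration of the two-argument `split_edge_seqence` resampler above (the variant used by `get_sample_point`), here is a hypothetical call on a square contour, assuming the repository root is on PYTHONPATH so `util.misc` and its imports load; the contour is made up.

```python
# Illustrative only: resample a 100x100 square outline into 20 equal arc-length steps.
import numpy as np
from util.misc import split_edge_seqence

square = np.array([[0, 0], [100, 0], [100, 100], [0, 100]], dtype=np.float32)
pts = split_edge_seqence(square, n_parts=20)
print(pts.shape)   # (21, 2): 19 equally spaced interior points, plus the start vertex at both ends
```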
import moxing as mox +import os.path as osp +import shutil + +import torch +from torch.nn import Parameter + +from .osutils import mkdir_if_missing + +from config import get_args +global_args = get_args(sys.argv[1:]) + +if global_args.run_on_remote: + import moxing as mox + + +def read_json(fpath): + with open(fpath, 'r') as f: + obj = json.load(f) + return obj + + +def write_json(obj, fpath): + mkdir_if_missing(osp.dirname(fpath)) + with open(fpath, 'w') as f: + json.dump(obj, f, indent=4, separators=(',', ': ')) + + +def save_checkpoint(state, is_best, fpath='checkpoint.pth.tar'): + print('=> saving checkpoint ', fpath) + if global_args.run_on_remote: + dir_name = osp.dirname(fpath) + if not mox.file.exists(dir_name): + mox.file.make_dirs(dir_name) + print('=> makding dir ', dir_name) + local_path = "local_checkpoint.pth.tar" + torch.save(state, local_path) + mox.file.copy(local_path, fpath) + if is_best: + mox.file.copy(local_path, osp.join(dir_name, 'model_best.pth.tar')) + else: + mkdir_if_missing(osp.dirname(fpath)) + torch.save(state, fpath) + if is_best: + shutil.copy(fpath, osp.join(osp.dirname(fpath), 'model_best.pth.tar')) + + +def load_checkpoint(fpath): + if global_args.run_on_remote: + mox.file.shift('os', 'mox') + checkpoint = torch.load(fpath) + print("=> Loaded checkpoint '{}'".format(fpath)) + return checkpoint + else: + load_path = fpath + + if osp.isfile(load_path): + checkpoint = torch.load(load_path) + print("=> Loaded checkpoint '{}'".format(load_path)) + return checkpoint + else: + raise ValueError("=> No checkpoint found at '{}'".format(load_path)) + + +def copy_state_dict(state_dict, model, strip=None): + tgt_state = model.state_dict() + copied_names = set() + for name, param in state_dict.items(): + if strip is not None and name.startswith(strip): + name = name[len(strip):] + if name not in tgt_state: + continue + if isinstance(param, Parameter): + param = param.data + if param.size() != tgt_state[name].size(): + print('mismatch:', name, param.size(), tgt_state[name].size()) + continue + tgt_state[name].copy_(param) + copied_names.add(name) + + missing = set(tgt_state.keys()) - copied_names + if len(missing) > 0: + print("missing keys in state_dict:", missing) + + return model \ No newline at end of file diff --git a/IndicPhotoOCR/detection/textbpn/util/shedule.py b/IndicPhotoOCR/detection/textbpn/util/shedule.py new file mode 100644 index 0000000000000000000000000000000000000000..338083533d0deb2970a36c60c5b0efb80d7064c9 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/shedule.py @@ -0,0 +1,28 @@ +from torch.optim.lr_scheduler import _LRScheduler + +class FixLR(_LRScheduler): + """Sets the learning rate of each parameter group to the initial lr + decayed by gamma every step_size epochs. When last_epoch=-1, sets + initial lr as lr. + + Args: + optimizer (Optimizer): Wrapped optimizer. + step_size (int): Period of learning rate decay. + gamma (float): Multiplicative factor of learning rate decay. + Default: 0.1. + last_epoch (int): The index of last epoch. Default: -1. + + Example: + >>> # Fixed leraning rate + >>> scheduler = FixLR(optimizer, step_size=30, gamma=0.1) + >>> for epoch in range(100): + >>> scheduler.step() + >>> train(...) + >>> validate(...) 
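A small sketch of the `minConnectPath` helper from `util/pbox.py` above, which chains scattered 2-D points into a nearest-neighbour path and returns the visiting order as indices into the input list; the points below are made up and assume `util.pbox` is importable.

```python
# Illustrative only: four collinear points given out of order.
from util.pbox import minConnectPath

pts = [[0, 0], [10, 0], [5, 0], [20, 0]]
edges, order = minConnectPath(pts)
print(order)   # [3, 1, 2, 0], i.e. the chain 20 -> 10 -> 5 -> 0
```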
+ """ + + def __init__(self, optimizer, last_epoch=-1): + super().__init__(optimizer, last_epoch) + + def get_lr(self): + return self.base_lrs diff --git a/IndicPhotoOCR/detection/textbpn/util/strs.py b/IndicPhotoOCR/detection/textbpn/util/strs.py new file mode 100644 index 0000000000000000000000000000000000000000..5009be2a0cb7f56ce603535c79485c39349d2fbc --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/strs.py @@ -0,0 +1,128 @@ +# encoding = utf-8 +def int_array_to_str(arr): + """turn an int array to a str""" + return "".join(map(chr, arr)) + + +def join(arr, splitter=','): + temp = [] + for e in arr: + temp.append(e) + temp.append(splitter) + temp.pop() + return "".join(temp) + + +def is_str(s): + return type(s) == str + + +def to_lowercase(s): + return str.lower(s) + + +def to_uppercase(s): + return str.upper(s) + + +def ends_with(s, suffix, ignore_case = False): + """ + suffix: str, list, or tuple + """ + if is_str(suffix): + suffix = [suffix] + suffix = list(suffix) + if ignore_case: + for idx, suf in enumerate(suffix): + suffix[idx] = to_lowercase(suf) + s = to_lowercase(s) + suffix = tuple(suffix) + return s.endswith(suffix) + + +def starts_with(s, prefix, ignore_case = False): + """ + prefix: str, list, or tuple + """ + if is_str(prefix): + prefix = [prefix] + prefix = list(prefix) + if ignore_case: + for idx, pre in enumerate(prefix): + prefix[idx] = to_lowercase(pre) + s = to_lowercase(s) + prefix = tuple(prefix) + return s.startswith(prefix) + + +def contains(s, target, ignore_case = False): + if ignore_case: + s = to_lowercase(s) + target = to_lowercase(target) + return s.find(target) >= 0 + + +def index_of(s, target): + return s.find(target) + + +def replace_all(s, old, new, reg = False): + if reg: + import re + targets = re.findall(old, s) + for t in targets: + s = s.replace(t, new) + else: + s = s.replace(old, new) + return s + + +def remove_all(s, sub): + return replace_all(s, sub, '') + + +def split(s, splitter, reg = False): + if not reg: + return s.split(splitter) + import re + return re.split(splitter, s) + + +def remove_invisible(s): + s = replace_all(s, ' ', '') + s = replace_all(s, '\n', '') + s = replace_all(s, '\t', '') + s = replace_all(s, '\r', '') + s = replace_all(s, '\xef\xbb\xbf', '') + return s + + +def find_all(s, pattern): + import re + return re.findall(pattern, s) + + +def is_none_or_empty(s): + if s is None: + return True + return len(s)==0; + + +def to_json(obj): + import ujson + return ujson.dumps(obj) + + +def to_list(obj): + items=obj.replace("(", '').replace(")","") + items=items.split(",") + lst=[float(i) for i in items] + + return lst + + +def to_tuple(obj): + items=obj.replace("(", '').replace(")","") + items=items.split(",") + tpl=tuple([float(i) for i in items]) + return tpl diff --git a/IndicPhotoOCR/detection/textbpn/util/summary.py b/IndicPhotoOCR/detection/textbpn/util/summary.py new file mode 100644 index 0000000000000000000000000000000000000000..b6e318549da42a0fbca488ea57b72e03e7e5b402 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/summary.py @@ -0,0 +1,26 @@ +from tensorboardX import SummaryWriter +from util.misc import mkdirs + + +class LogSummary(object): + + def __init__(self, log_path): + + mkdirs(log_path) + self.writer = SummaryWriter(log_path) + + def write_scalars(self, scalar_dict, n_iter, tag=None): + + for name, scalar in scalar_dict.items(): + if tag is not None: + name = '/'.join([tag, name]) + self.writer.add_scalar(name, scalar, n_iter) + + def write_hist_parameters(self, net, n_iter): + for name, param 
in net.named_parameters(): + self.writer.add_histogram(name, param.clone().cpu().numpy(), n_iter) + + + + + diff --git a/IndicPhotoOCR/detection/textbpn/util/vis_flux.py b/IndicPhotoOCR/detection/textbpn/util/vis_flux.py new file mode 100644 index 0000000000000000000000000000000000000000..4a80e5dab97e8fdb1f810af4bb86f69486e04475 --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/vis_flux.py @@ -0,0 +1,108 @@ +import sys +import scipy.io as sio +import math +import numpy as np +import cv2 +import matplotlib +matplotlib.use('agg') +import pylab as plt +from matplotlib import cm +import os + +def label2color(label): + + label = label.astype(np.uint16) + + height, width = label.shape + color3u = np.zeros((height, width, 3), dtype=np.uint8) + unique_labels = np.unique(label) + + if unique_labels[-1] >= 2**24: + raise RuntimeError('Error: label overflow!') + + for i in range(len(unique_labels)): + + binary = '{:024b}'.format(unique_labels[i]) + # r g b 3*8 24 + r = int(binary[::3][::-1], 2) + g = int(binary[1::3][::-1], 2) + b = int(binary[2::3][::-1], 2) + + color3u[label == unique_labels[i]] = np.array([r, g, b]) + + return color3u + + +def vis_direction_field(gt_flux): + + norm_gt = np.sqrt(gt_flux[1, :, :] ** 2 + gt_flux[0, :, :] ** 2) + angle_gt = 180 / math.pi * np.arctan2(gt_flux[1, :, :], gt_flux[0, :, :]) + + fig = plt.figure(figsize=(10, 6)) + + ax1 = fig.add_subplot(121) + ax1.set_title('Norm_gt') + ax1.set_autoscale_on(True) + im1 = ax1.imshow(norm_gt, cmap=cm.jet) + plt.colorbar(im1, shrink=0.5) + + ax2 = fig.add_subplot(122) + ax2.set_title('Angle_gt') + ax2.set_autoscale_on(True) + im2 = ax2.imshow(angle_gt, cmap=cm.jet) + plt.colorbar(im2, shrink=0.5) + + plt.savefig('1.png') + plt.close(fig) + + +def vis_flux(vis_image, pred_flux, gt_flux, gt_mask, image_name, save_dir): + + vis_image = vis_image.data.cpu().numpy()[0, ...] + pred_flux = pred_flux.data.cpu().numpy()[0, ...] + gt_flux = gt_flux.data.cpu().numpy()[0, ...] + gt_mask = gt_mask.data.cpu().numpy()[0, ...] 
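A quick sketch of the `vis_direction_field` debug helper defined above, assuming `util.vis_flux` and its matplotlib dependencies import cleanly; the flux field is synthetic, and the output filename `1.png` is hard-coded by the helper itself.

```python
# Illustrative only: a synthetic 2xHxW direction field whose angle varies with x.
import numpy as np
from util.vis_flux import vis_direction_field

h, w = 64, 64
theta = np.linspace(0, 2 * np.pi, w, dtype=np.float32)
flux = np.zeros((2, h, w), dtype=np.float32)
flux[0] = np.cos(theta)[None, :]     # x component, broadcast over rows
flux[1] = np.sin(theta)[None, :]     # y component
vis_direction_field(flux)            # writes '1.png' with the norm and angle panels
```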
+ + image_name = image_name[0] + + norm_pred = np.sqrt(pred_flux[1,:,:]**2 + pred_flux[0,:,:]**2) + angle_pred = 180/math.pi*np.arctan2(pred_flux[1,:,:], pred_flux[0,:,:]) + + norm_gt = np.sqrt(gt_flux[1,:,:]**2 + gt_flux[0,:,:]**2) + angle_gt = 180/math.pi*np.arctan2(gt_flux[1,:,:], gt_flux[0,:,:]) + + fig = plt.figure(figsize=(10,6)) + + ax0 = fig.add_subplot(231) + ax0.imshow(vis_image[:,:,::-1]) + + ax1 = fig.add_subplot(232) + ax1.set_title('Norm_gt') + ax1.set_autoscale_on(True) + im1 = ax1.imshow(norm_gt, cmap=cm.jet) + plt.colorbar(im1,shrink=0.5) + + ax2 = fig.add_subplot(233) + ax2.set_title('Angle_gt') + ax2.set_autoscale_on(True) + im2 = ax2.imshow(angle_gt, cmap=cm.jet) + plt.colorbar(im2, shrink=0.5) + + ax5 = fig.add_subplot(234) + color_mask = label2color(gt_mask) + ax5.imshow(color_mask) + + ax4 = fig.add_subplot(235) + ax4.set_title('Norm_pred') + ax4.set_autoscale_on(True) + im4 = ax4.imshow(norm_pred, cmap=cm.jet) + plt.colorbar(im4,shrink=0.5) + + ax5 = fig.add_subplot(236) + ax5.set_title('Angle_pred') + ax5.set_autoscale_on(True) + im5 = ax5.imshow(angle_pred, cmap=cm.jet) + plt.colorbar(im5, shrink=0.5) + + plt.savefig(save_dir + image_name + '.png') + plt.close(fig) diff --git a/IndicPhotoOCR/detection/textbpn/util/visualize.py b/IndicPhotoOCR/detection/textbpn/util/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..cf6d441171f73842b97fc46ce926b2cea318ebde --- /dev/null +++ b/IndicPhotoOCR/detection/textbpn/util/visualize.py @@ -0,0 +1,245 @@ +import torch +import numpy as np +import cv2 +import os +import math +from IndicPhotoOCR.detection.textbpn.cfglib.config import config as cfg +from IndicPhotoOCR.detection.textbpn.util import canvas as cav +import matplotlib +matplotlib.use('agg') +import pylab as plt +from matplotlib import cm +import torch.nn.functional as F + + +def visualize_network_output(output_dict, input_dict, mode='train'): + vis_dir = os.path.join(cfg.vis_dir, cfg.exp_name + '_' + mode) + if not os.path.exists(vis_dir): + os.mkdir(vis_dir) + + fy_preds = F.interpolate(output_dict["fy_preds"], scale_factor=cfg.scale, mode='bilinear') + fy_preds = fy_preds.data.cpu().numpy() + + py_preds = output_dict["py_preds"][1:] + init_polys = output_dict["py_preds"][0] + inds = output_dict["inds"] + + image = input_dict['img'] + tr_mask = input_dict['tr_mask'].data.cpu().numpy() > 0 + distance_field = input_dict['distance_field'].data.cpu().numpy() + direction_field = input_dict['direction_field'] + weight_matrix = input_dict['weight_matrix'] + gt_tags = input_dict['gt_points'].cpu().numpy() + ignore_tags = input_dict['ignore_tags'].cpu().numpy() + + b, c, _, _ = fy_preds.shape + for i in range(b): + + fig = plt.figure(figsize=(12, 9)) + + mask_pred = fy_preds[i, 0, :, :] + distance_pred = fy_preds[i, 1, :, :] + norm_pred = np.sqrt(fy_preds[i, 2, :, :] ** 2 + fy_preds[i, 3, :, :] ** 2) + angle_pred = 180 / math.pi * np.arctan2(fy_preds[i, 2, :, :], fy_preds[i, 3, :, :] + 0.00001) + + ax1 = fig.add_subplot(341) + ax1.set_title('mask_pred') + # ax1.set_autoscale_on(True) + im1 = ax1.imshow(mask_pred, cmap=cm.jet) + # plt.colorbar(im1, shrink=0.5) + + ax2 = fig.add_subplot(342) + ax2.set_title('distance_pred') + # ax2.set_autoscale_on(True) + im2 = ax2.imshow(distance_pred, cmap=cm.jet) + # plt.colorbar(im2, shrink=0.5) + + ax3 = fig.add_subplot(343) + ax3.set_title('norm_pred') + # ax3.set_autoscale_on(True) + im3 = ax3.imshow(norm_pred, cmap=cm.jet) + # plt.colorbar(im3, shrink=0.5) + + ax4 = fig.add_subplot(344) + 
ax4.set_title('angle_pred') + # ax4.set_autoscale_on(True) + im4 = ax4.imshow(angle_pred, cmap=cm.jet) + # plt.colorbar(im4, shrink=0.5) + + mask_gt = tr_mask[i] + distance_gt = distance_field[i] + # gt_flux = 0.999999 * direction_field[i] / (direction_field[i].norm(p=2, dim=0) + 1e-9) + gt_flux = direction_field[i].cpu().numpy() + norm_gt = np.sqrt(gt_flux[0, :, :] ** 2 + gt_flux[1, :, :] ** 2) + angle_gt = 180 / math.pi * np.arctan2(gt_flux[0, :, :], gt_flux[1, :, :]+0.00001) + + ax11 = fig.add_subplot(345) + # ax11.set_title('mask_gt') + # ax11.set_autoscale_on(True) + im11 = ax11.imshow(mask_gt, cmap=cm.jet) + # plt.colorbar(im11, shrink=0.5) + + ax22 = fig.add_subplot(346) + # ax22.set_title('distance_gt') + # ax22.set_autoscale_on(True) + im22 = ax22.imshow(distance_gt, cmap=cm.jet) + # plt.colorbar(im22, shrink=0.5) + + ax33 = fig.add_subplot(347) + # ax33.set_title('norm_gt') + # ax33.set_autoscale_on(True) + im33 = ax33.imshow(norm_gt, cmap=cm.jet) + # plt.colorbar(im33, shrink=0.5) + + ax44 = fig.add_subplot(348) + # ax44.set_title('angle_gt') + # ax44.set_autoscale_on(True) + im44 = ax44.imshow(angle_gt, cmap=cm.jet) + # plt.colorbar(im44, shrink=0.5) + + img_show = image[i].permute(1, 2, 0).cpu().numpy() + img_show = ((img_show * cfg.stds + cfg.means) * 255).astype(np.uint8) + img_show = np.ascontiguousarray(img_show[:, :, ::-1]) + shows = [] + gt = gt_tags[i] + gt_idx = np.where(ignore_tags[i] > 0) + gt_py = gt[gt_idx[0], :, :] + index = torch.where(inds[0] == i)[0] + init_py = init_polys[index].detach().cpu().numpy() + + image_show = img_show.copy() + cv2.drawContours(image_show, init_py.astype(np.int32), -1, (255, 255, 0), 2) + cv2.drawContours(image_show, gt_py.astype(np.int32), -1, (0, 255, 0), 2) + shows.append(image_show) + for py in py_preds: + contours = py[index].detach().cpu().numpy() + image_show = img_show.copy() + cv2.drawContours(image_show, init_py.astype(np.int32), -1, (255, 255, 0), 2) + cv2.drawContours(image_show, gt_py.astype(np.int32), -1, (0, 255, 0), 2) + cv2.drawContours(image_show, contours.astype(np.int32), -1, (0, 0, 255), 2) + shows.append(image_show) + + for idx, im_show in enumerate(shows): + axb = fig.add_subplot(3, 4, 9+idx) + # axb.set_title('boundary_{}'.format(idx)) + # axb.set_autoscale_on(True) + im11 = axb.imshow(im_show, cmap=cm.jet) + # plt.colorbar(im11, shrink=0.5) + + path = os.path.join(vis_dir, '{}.png'.format(i)) + plt.savefig(path) + plt.close(fig) + + +def visualize_gt(image, contours, label_tag): + + image_show = image.copy() + image_show = np.ascontiguousarray(image_show[:, :, ::-1]) + + image_show = cv2.polylines(image_show, + [contours[i] for i, tag in enumerate(label_tag) if tag >0], True, (0, 0, 255), 3) + image_show = cv2.polylines(image_show, + [contours[i] for i, tag in enumerate(label_tag) if tag <0], True, (0, 255, 0), 3) + + show_gt = cv2.resize(image_show, (320, 320)) + + return show_gt + + +def visualize_detection(image, output_dict, meta=None): + image_show = image.copy() + image_show = np.ascontiguousarray(image_show[:, :, ::-1]) + + cls_preds = F.interpolate(output_dict["fy_preds"], scale_factor=cfg.scale, mode='bilinear') + cls_preds = cls_preds[0].data.cpu().numpy() + + py_preds = output_dict["py_preds"][1:] + init_polys = output_dict["py_preds"][0] + shows = [] + + init_py = init_polys.data.cpu().numpy() + path = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name), + meta['image_id'][0].split(".")[0] + "_init.png") + + im_show0 = image_show.copy() + for i, bpts in enumerate(init_py.astype(np.int32)): + 
+        cv2.drawContours(im_show0, [bpts.astype(np.int32)], -1, (255, 255, 0), 2)
+        for j, pp in enumerate(bpts):
+            if j == 0:
+                cv2.circle(im_show0, (int(pp[0]), int(pp[1])), 3, (255, 0, 255), -1)
+            elif j == 1:
+                cv2.circle(im_show0, (int(pp[0]), int(pp[1])), 3, (0, 255, 255), -1)
+            else:
+                cv2.circle(im_show0, (int(pp[0]), int(pp[1])), 3, (0, 0, 255), -1)
+
+    cv2.imwrite(path, im_show0)
+
+    for idx, py in enumerate(py_preds):
+        im_show = im_show0.copy()
+        contours = py.data.cpu().numpy()
+        cv2.drawContours(im_show, contours.astype(np.int32), -1, (0, 0, 255), 2)
+        for ppts in contours:
+            for j, pp in enumerate(ppts):
+                if j == 0:
+                    cv2.circle(im_show, (int(pp[0]), int(pp[1])), 3, (255, 0, 255), -1)
+                elif j == 1:
+                    cv2.circle(im_show, (int(pp[0]), int(pp[1])), 3, (0, 255, 255), -1)
+                else:
+                    cv2.circle(im_show, (int(pp[0]), int(pp[1])), 3, (0, 255, 0), -1)
+        path = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name),
+                            meta['image_id'][0].split(".")[0] + "_{}iter.png".format(idx))
+        cv2.imwrite(path, im_show)
+        shows.append(im_show)
+
+    # init_py = init_polys.data.cpu().numpy()
+    # im_show_score = image_show.copy()
+    # for in_py in init_py:
+    #     mask = np.zeros_like(cls_preds[0], dtype=np.uint8)
+    #     cv2.drawContours(mask, [in_py.astype(np.int32)], -1, (1,), -1)
+    #     score = cls_preds[0][mask > 0].mean()
+    #     if score > 0.9:
+    #         cv2.drawContours(im_show_score, [in_py.astype(np.int32)], -1, (0, 255, 0), 2)
+    #     else:
+    #         cv2.drawContours(im_show_score, [in_py.astype(np.int32)], -1, (255, 0, 255), 2)
+    #     cv2.putText(im_show_score, "{:.2f}".format(score),
+    #                 (int(np.mean(in_py[:, 0])), int(np.mean(in_py[:, 1]))), 1, 1, (0, 255, 255), 2)
+    #     print(score)
+
+    # path = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name),
+    #                     meta['image_id'][0].split(".")[0] + "init.png")
+    # cv2.imwrite(path, im_show_score)
+
+    show_img = np.concatenate(shows, axis=1)
+    show_boundary = cv2.resize(show_img, (320 * len(py_preds), 320))
+
+    # fig = plt.figure(figsize=(5, 4))
+    # ax1 = fig.add_subplot(111)
+    # # ax1.set_title('distance_field')
+    # ax1.set_autoscale_on(True)
+    # im1 = ax1.imshow(cls_preds[0], cmap=cm.jet)
+    # plt.colorbar(im1, shrink=0.75)
+    # plt.axis("off")
+    # path = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name),
+    #                     meta['image_id'][0].split(".")[0] + "_cls.png")
+    # plt.savefig(path, dpi=300)
+    # plt.close(fig)
+    #
+    # fig = plt.figure(figsize=(5, 4))
+    # ax1 = fig.add_subplot(111)
+    # # ax1.set_title('distance_field')
+    # ax1.set_autoscale_on(True)
+    # im1 = ax1.imshow(np.array(cls_preds[1] / np.max(cls_preds[1])), cmap=cm.jet)
+    # plt.colorbar(im1, shrink=0.75)
+    # plt.axis("off")
+    # path = os.path.join(cfg.vis_dir, '{}_test'.format(cfg.exp_name),
+    #                     meta['image_id'][0].split(".")[0] + "_dis.png")
+    # plt.savefig(path, dpi=300)
+    # plt.close(fig)
+
+    cls_pred = cav.heatmap(np.array(cls_preds[0] * 255, dtype=np.uint8))
+    dis_pred = cav.heatmap(np.array(cls_preds[1] * 255, dtype=np.uint8))
+
+    heat_map = np.concatenate([cls_pred*255, dis_pred*255], axis=1)
+    heat_map = cv2.resize(heat_map, (320 * 2, 320))
+
+    return show_boundary, heat_map
\ No newline at end of file
diff --git a/IndicPhotoOCR/ocr.py b/IndicPhotoOCR/ocr.py
index 7da65446ef69feb6d16c678b127dce727a648d77..939fab2064a674a65bec0f297d16b2351fe89184 100644
--- a/IndicPhotoOCR/ocr.py
+++ b/IndicPhotoOCR/ocr.py
@@ -6,10 +6,11 @@
 import cv2
 import numpy as np
 
-from IndicPhotoOCR.detection.east_detector import EASTdetector
+# from IndicPhotoOCR.detection.east_detector import EASTdetector
 from IndicPhotoOCR.script_identification.CLIP_identifier import CLIPidentifier
 from IndicPhotoOCR.recognition.parseq_recogniser import PARseqrecogniser
 import IndicPhotoOCR.detection.east_config as cfg
+from IndicPhotoOCR.detection.textbpn.textbpnpp_detector import TextBPNpp_detector
 
 
 class OCR:
@@ -18,18 +19,22 @@ class OCR:
         self.device = device
         self.verbose = verbose
         # self.image_path = image_path
-        self.detector = EASTdetector()
+        # self.detector = EASTdetector()
+        self.detector = TextBPNpp_detector(device=self.device)
         self.recogniser = PARseqrecogniser()
         self.identifier = CLIPidentifier()
 
-    def detect(self, image_path, detect_model_checkpoint=cfg.checkpoint):
-        """Run the detection model to get bounding boxes of text areas."""
+    # def detect(self, image_path, detect_model_checkpoint=cfg.checkpoint):
+    #     """Run the detection model to get bounding boxes of text areas."""
 
-        if self.verbose:
-            print("Running text detection...")
-        detections = self.detector.detect(image_path, detect_model_checkpoint, self.device)
-        # print(detections)
-        return detections['detections']
+    #     if self.verbose:
+    #         print("Running text detection...")
+    #     detections = self.detector.detect(image_path, detect_model_checkpoint, self.device)
+    #     # print(detections)
+    #     return detections['detections']
+    def detect(self, image_path):
+        self.detections = self.detector.detect(image_path)
+        return self.detections['detections']
 
     def visualize_detection(self, image_path, detections, save_path=None, show=False):
         # Default save path if none is provided
@@ -140,7 +145,7 @@ if __name__ == '__main__':
     sample_image_path = 'test_images/image_141.jpg'
     cropped_image_path = 'test_images/cropped_image/image_141_0.jpg'
 
-    ocr = OCR(device="cpu", verbose=False)
+    ocr = OCR(device="cuda", verbose=False)
 
     # detections = ocr.detect(sample_image_path)
     # print(detections)
diff --git a/app.py b/app.py
index dbb816d90fea6571f920562bdc2722720927ba6c..ccf119f62d51b84e4b822ff64517e88b2e18ad52 100644
--- a/app.py
+++ b/app.py
@@ -67,7 +67,7 @@ interface_html = """
 
 # Links to GitHub and Dataset repositories with GitHub icon
 links_html = """