hantech committed on
Commit 0667c13
1 Parent(s): 1cfd79c

Upload 28 files

train_old.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eed45bcd25593dca2576c20721a57a449a6b557faf40336bcd7690b2a82eb2e1
+size 89572737
vgg-seq2seq.yaml ADDED
@@ -0,0 +1,90 @@
+project: vietocr_new
+name: Train
+
+device: cuda:0
+
+# change to the list of chars in your dataset, or use the default Vietnamese chars
+vocab: 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~ '
+
+seq_modeling: seq2seq
+transformer:
+  encoder_hidden: 256
+  decoder_hidden: 256
+  img_channel: 256
+  decoder_embedded: 256
+  dropout: 0.1
+
+optimizer:
+  max_lr: 0.001
+  pct_start: 0.1
+
+trainer:
+  batch_size: 128
+  print_every: 100
+  valid_every: 500
+  test_every: 500
+  iters: 10000
+  # where to save the model for prediction
+  export: weights/train_model.pth
+  checkpoint: ./checkpoint/checkpoint_model.pth
+  log: ./train.log
+  # null to disable accuracy computation, or set to the number of samples to enable validation while training
+  metrics: 49228
+  test_metrics: 28918
+  pretrained: false
+
+dataset:
+  # path to images
+  data_root: /mnt/disk3/CGGANv2
+  # paths to annotations
+  train_annotation: datasets/labels/train.txt
+  valid_annotation: datasets/labels/valid.txt
+  test_annotation: datasets/labels/test.txt
+  # paths to lmdb datasets
+  train_lmdb: datasets/lmdb/train
+  valid_lmdb: datasets/lmdb/valid
+  test_lmdb: datasets/lmdb/test
+
+  # resize images to a height of 32; a larger height will increase accuracy
+  image_height: 32
+  image_min_width: 32
+  image_max_width: 512
+
+dataloader:
+  num_workers: 12
+  pin_memory: true
+
+aug:
+  image_aug: false
+  masked_language_model: false
+
+predictor:
+  # enable or disable beam search during prediction; beam search is slower
+  beamsearch: false
+
+quiet: false
+
+# for training
+pretrain: https://vocr.vn/data/vietocr/vgg_seq2seq.pth
+
+# url or local path (for prediction)
+weights: https://vocr.vn/data/vietocr/vgg_seq2seq.pth
+
+backbone: vgg19_bn
+cnn:
+  # pooling stride size
+  ss:
+    - [2, 2]
+    - [2, 2]
+    - [2, 1]
+    - [2, 1]
+    - [1, 1]
+  # pooling kernel size
+  ks:
+    - [2, 2]
+    - [2, 2]
+    - [2, 1]
+    - [2, 1]
+    - [1, 1]
+  # dim of the output feature map
+  hidden: 256
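For reference, a config in this shape is plain YAML and can be read directly with PyYAML; the snippet below is a minimal sketch (not part of this commit) that loads the file shown above and prints a few of its fields.

import yaml

# Load the training config above; key names follow vgg-seq2seq.yaml.
with open('vgg-seq2seq.yaml') as f:
    config = yaml.safe_load(f)

print(config['seq_modeling'])                    # 'seq2seq'
print(config['transformer']['encoder_hidden'])   # 256
print(config['backbone'], config['cnn']['ss'])   # 'vgg19_bn' [[2, 2], [2, 2], [2, 1], [2, 1], [1, 1]]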
vietocr/__init__.py ADDED
File without changes
vietocr/model/__init__.py ADDED
File without changes
vietocr/model/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (174 Bytes).
 
vietocr/model/__pycache__/beam.cpython-311.pyc ADDED
Binary file (6.02 kB).
 
vietocr/model/__pycache__/trainer.cpython-311.pyc ADDED
Binary file (22.2 kB).
 
vietocr/model/__pycache__/transformerocr.cpython-311.pyc ADDED
Binary file (2.44 kB).
 
vietocr/model/__pycache__/vocab.cpython-311.pyc ADDED
Binary file (3.27 kB).
 
vietocr/model/backbone/__init__.py ADDED
File without changes
vietocr/model/backbone/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes).
 
vietocr/model/backbone/__pycache__/cnn.cpython-311.pyc ADDED
Binary file (2.18 kB).
 
vietocr/model/backbone/__pycache__/resnet.cpython-311.pyc ADDED
Binary file (9.41 kB).
 
vietocr/model/backbone/__pycache__/vgg.cpython-311.pyc ADDED
Binary file (3.06 kB).
 
vietocr/model/backbone/cnn.py ADDED
@@ -0,0 +1,28 @@
+import torch
+from torch import nn
+
+import vietocr.model.backbone.vgg as vgg
+from vietocr.model.backbone.resnet import Resnet50
+
+class CNN(nn.Module):
+    def __init__(self, backbone, **kwargs):
+        super(CNN, self).__init__()
+
+        if backbone == 'vgg11_bn':
+            self.model = vgg.vgg11_bn(**kwargs)
+        elif backbone == 'vgg19_bn':
+            self.model = vgg.vgg19_bn(**kwargs)
+        elif backbone == 'resnet50':
+            self.model = Resnet50(**kwargs)
+
+    def forward(self, x):
+        return self.model(x)
+
+    def freeze(self):
+        for name, param in self.model.features.named_parameters():
+            if name != 'last_conv_1x1':
+                param.requires_grad = False
+
+    def unfreeze(self):
+        for param in self.model.features.parameters():
+            param.requires_grad = True
vietocr/model/backbone/resnet.py ADDED
@@ -0,0 +1,140 @@
+import torch
+from torch import nn
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = self._conv3x3(inplanes, planes)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = self._conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def _conv3x3(self, in_planes, out_planes, stride=1):
+        "3x3 convolution with padding"
+        return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                         padding=1, bias=False)
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+class ResNet(nn.Module):
+
+    def __init__(self, input_channel, output_channel, block, layers):
+        super(ResNet, self).__init__()
+
+        self.output_channel_block = [int(output_channel / 4), int(output_channel / 2), output_channel, output_channel]
+
+        self.inplanes = int(output_channel / 8)
+        self.conv0_1 = nn.Conv2d(input_channel, int(output_channel / 16),
+                                 kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn0_1 = nn.BatchNorm2d(int(output_channel / 16))
+        self.conv0_2 = nn.Conv2d(int(output_channel / 16), self.inplanes,
+                                 kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn0_2 = nn.BatchNorm2d(self.inplanes)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
+        self.layer1 = self._make_layer(block, self.output_channel_block[0], layers[0])
+        self.conv1 = nn.Conv2d(self.output_channel_block[0], self.output_channel_block[
+            0], kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(self.output_channel_block[0])
+
+        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
+        self.layer2 = self._make_layer(block, self.output_channel_block[1], layers[1], stride=1)
+        self.conv2 = nn.Conv2d(self.output_channel_block[1], self.output_channel_block[
+            1], kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(self.output_channel_block[1])
+
+        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=(2, 1), padding=(0, 1))
+        self.layer3 = self._make_layer(block, self.output_channel_block[2], layers[2], stride=1)
+        self.conv3 = nn.Conv2d(self.output_channel_block[2], self.output_channel_block[
+            2], kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(self.output_channel_block[2])
+
+        self.layer4 = self._make_layer(block, self.output_channel_block[3], layers[3], stride=1)
+        self.conv4_1 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[
+            3], kernel_size=2, stride=(2, 1), padding=(0, 1), bias=False)
+        self.bn4_1 = nn.BatchNorm2d(self.output_channel_block[3])
+        self.conv4_2 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[
+            3], kernel_size=2, stride=1, padding=0, bias=False)
+        self.bn4_2 = nn.BatchNorm2d(self.output_channel_block[3])
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv0_1(x)
+        x = self.bn0_1(x)
+        x = self.relu(x)
+        x = self.conv0_2(x)
+        x = self.bn0_2(x)
+        x = self.relu(x)
+
+        x = self.maxpool1(x)
+        x = self.layer1(x)
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        x = self.maxpool2(x)
+        x = self.layer2(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+
+        x = self.maxpool3(x)
+        x = self.layer3(x)
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu(x)
+
+        x = self.layer4(x)
+        x = self.conv4_1(x)
+        x = self.bn4_1(x)
+        x = self.relu(x)
+        x = self.conv4_2(x)
+        x = self.bn4_2(x)
+        conv = self.relu(x)
+
+        conv = conv.transpose(-1, -2)
+        conv = conv.flatten(2)
+        conv = conv.permute(-1, 0, 1)
+
+        return conv
+
+def Resnet50(ss, hidden):
+    return ResNet(3, hidden, BasicBlock, [1, 2, 5, 3])
+
vietocr/model/backbone/vgg.py ADDED
@@ -0,0 +1,50 @@
+import torch
+from torch import nn
+from torchvision import models
+from einops import rearrange
+from torchvision.models._utils import IntermediateLayerGetter
+
+
+class Vgg(nn.Module):
+    def __init__(self, name, ss, ks, hidden, pretrained=True, dropout=0.5):
+        super(Vgg, self).__init__()
+
+        if name == 'vgg11_bn':
+            cnn = models.vgg11_bn(weights='DEFAULT')
+        elif name == 'vgg19_bn':
+            cnn = models.vgg19_bn(weights='DEFAULT')
+
+        pool_idx = 0
+
+        for i, layer in enumerate(cnn.features):
+            if isinstance(layer, torch.nn.MaxPool2d):
+                cnn.features[i] = torch.nn.AvgPool2d(kernel_size=ks[pool_idx], stride=ss[pool_idx], padding=0)
+                pool_idx += 1
+
+        self.features = cnn.features
+        self.dropout = nn.Dropout(dropout)
+        self.last_conv_1x1 = nn.Conv2d(512, hidden, 1)
+
+    def forward(self, x):
+        """
+        Shape:
+            - x: (N, C, H, W)
+            - output: (W, N, C)
+        """
+
+        conv = self.features(x)
+        conv = self.dropout(conv)
+        conv = self.last_conv_1x1(conv)
+
+        # conv = rearrange(conv, 'b d h w -> b d (w h)')
+        conv = conv.transpose(-1, -2)
+        conv = conv.flatten(2)
+        conv = conv.permute(-1, 0, 1)
+        return conv
+
+def vgg11_bn(ss, ks, hidden, pretrained=True, dropout=0.5):
+    return Vgg('vgg11_bn', ss, ks, hidden, pretrained, dropout)
+
+def vgg19_bn(ss, ks, hidden, pretrained=True, dropout=0.5):
+    return Vgg('vgg19_bn', ss, ks, hidden, pretrained, dropout)
+
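As a quick shape check (a sketch, not part of this commit), the backbone below is built with the ss/ks/hidden values from the cnn: block of vgg-seq2seq.yaml; with those pooling strides a 32x128 crop should come out as a 64-step feature sequence. CNN.freeze()/unfreeze() from cnn.py toggle requires_grad on exactly these VGG feature layers.

import torch
from vietocr.model.backbone.cnn import CNN

# Pooling strides/kernels copied from the cnn: block of vgg-seq2seq.yaml.
ss = [[2, 2], [2, 2], [2, 1], [2, 1], [1, 1]]
ks = [[2, 2], [2, 2], [2, 1], [2, 1], [1, 1]]
backbone = CNN('vgg19_bn', ss=ss, ks=ks, hidden=256)  # downloads torchvision VGG weights

img = torch.rand(1, 3, 32, 128)   # (N, C, H, W) text-line crop
features = backbone(img)
print(features.shape)             # expected: torch.Size([64, 1, 256]) = (seq_len, N, hidden)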
vietocr/model/seqmodel/__init__.py ADDED
File without changes
vietocr/model/seqmodel/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes).
 
vietocr/model/seqmodel/__pycache__/convseq2seq.cpython-311.pyc ADDED
Binary file (10.7 kB).
 
vietocr/model/seqmodel/__pycache__/seq2seq.cpython-311.pyc ADDED
Binary file (9.79 kB).
 
vietocr/model/seqmodel/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (10.2 kB).
 
vietocr/model/seqmodel/convseq2seq.py ADDED
@@ -0,0 +1,324 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+
+class Encoder(nn.Module):
+    def __init__(self,
+                 emb_dim,
+                 hid_dim,
+                 n_layers,
+                 kernel_size,
+                 dropout,
+                 device,
+                 max_length = 512):
+        super().__init__()
+
+        assert kernel_size % 2 == 1, "Kernel size must be odd!"
+
+        self.device = device
+
+        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
+
+        # self.tok_embedding = nn.Embedding(input_dim, emb_dim)
+        self.pos_embedding = nn.Embedding(max_length, emb_dim)
+
+        self.emb2hid = nn.Linear(emb_dim, hid_dim)
+        self.hid2emb = nn.Linear(hid_dim, emb_dim)
+
+        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
+                                              out_channels = 2 * hid_dim,
+                                              kernel_size = kernel_size,
+                                              padding = (kernel_size - 1) // 2)
+                                    for _ in range(n_layers)])
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, src):
+
+        #src = [batch size, src len]
+
+        src = src.transpose(0, 1)
+
+        batch_size = src.shape[0]
+        src_len = src.shape[1]
+        device = src.device
+
+        #create position tensor
+        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device)
+
+        #pos = [0, 1, 2, 3, ..., src len - 1]
+
+        #pos = [batch size, src len]
+
+        #embed tokens and positions
+
+        # tok_embedded = self.tok_embedding(src)
+        tok_embedded = src
+
+        pos_embedded = self.pos_embedding(pos)
+
+        #tok_embedded = pos_embedded = [batch size, src len, emb dim]
+
+        #combine embeddings by elementwise summing
+        embedded = self.dropout(tok_embedded + pos_embedded)
+
+        #embedded = [batch size, src len, emb dim]
+
+        #pass embedded through linear layer to convert from emb dim to hid dim
+        conv_input = self.emb2hid(embedded)
+
+        #conv_input = [batch size, src len, hid dim]
+
+        #permute for convolutional layer
+        conv_input = conv_input.permute(0, 2, 1)
+
+        #conv_input = [batch size, hid dim, src len]
+
+        #begin convolutional blocks...
+
+        for i, conv in enumerate(self.convs):
+
+            #pass through convolutional layer
+            conved = conv(self.dropout(conv_input))
+
+            #conved = [batch size, 2 * hid dim, src len]
+
+            #pass through GLU activation function
+            conved = F.glu(conved, dim = 1)
+
+            #conved = [batch size, hid dim, src len]
+
+            #apply residual connection
+            conved = (conved + conv_input) * self.scale
+
+            #conved = [batch size, hid dim, src len]
+
+            #set conv_input to conved for next loop iteration
+            conv_input = conved
+
+        #...end convolutional blocks
+
+        #permute and convert back to emb dim
+        conved = self.hid2emb(conved.permute(0, 2, 1))
+
+        #conved = [batch size, src len, emb dim]
+
+        #elementwise sum output (conved) and input (embedded) to be used for attention
+        combined = (conved + embedded) * self.scale
+
+        #combined = [batch size, src len, emb dim]
+
+        return conved, combined
+
+class Decoder(nn.Module):
+    def __init__(self,
+                 output_dim,
+                 emb_dim,
+                 hid_dim,
+                 n_layers,
+                 kernel_size,
+                 dropout,
+                 trg_pad_idx,
+                 device,
+                 max_length = 512):
+        super().__init__()
+
+        self.kernel_size = kernel_size
+        self.trg_pad_idx = trg_pad_idx
+        self.device = device
+
+        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
+
+        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
+        self.pos_embedding = nn.Embedding(max_length, emb_dim)
+
+        self.emb2hid = nn.Linear(emb_dim, hid_dim)
+        self.hid2emb = nn.Linear(hid_dim, emb_dim)
+
+        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
+        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
+
+        self.fc_out = nn.Linear(emb_dim, output_dim)
+
+        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
+                                              out_channels = 2 * hid_dim,
+                                              kernel_size = kernel_size)
+                                    for _ in range(n_layers)])
+
+        self.dropout = nn.Dropout(dropout)
+
+    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
+
+        #embedded = [batch size, trg len, emb dim]
+        #conved = [batch size, hid dim, trg len]
+        #encoder_conved = encoder_combined = [batch size, src len, emb dim]
+
+        #permute and convert back to emb dim
+        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
+
+        #conved_emb = [batch size, trg len, emb dim]
+
+        combined = (conved_emb + embedded) * self.scale
+
+        #combined = [batch size, trg len, emb dim]
+
+        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
+
+        #energy = [batch size, trg len, src len]
+
+        attention = F.softmax(energy, dim=2)
+
+        #attention = [batch size, trg len, src len]
+
+        attended_encoding = torch.matmul(attention, encoder_combined)
+
+        #attended_encoding = [batch size, trg len, emd dim]
+
+        #convert from emb dim -> hid dim
+        attended_encoding = self.attn_emb2hid(attended_encoding)
+
+        #attended_encoding = [batch size, trg len, hid dim]
+
+        #apply residual connection
+        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
+
+        #attended_combined = [batch size, hid dim, trg len]
+
+        return attention, attended_combined
+
+    def forward(self, trg, encoder_conved, encoder_combined):
+
+        #trg = [batch size, trg len]
+        #encoder_conved = encoder_combined = [batch size, src len, emb dim]
+        trg = trg.transpose(0, 1)
+
+        batch_size = trg.shape[0]
+        trg_len = trg.shape[1]
+        device = trg.device
+
+        #create position tensor
+        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(device)
+
+        #pos = [batch size, trg len]
+
+        #embed tokens and positions
+        tok_embedded = self.tok_embedding(trg)
+        pos_embedded = self.pos_embedding(pos)
+
+        #tok_embedded = [batch size, trg len, emb dim]
+        #pos_embedded = [batch size, trg len, emb dim]
+
+        #combine embeddings by elementwise summing
+        embedded = self.dropout(tok_embedded + pos_embedded)
+
+        #embedded = [batch size, trg len, emb dim]
+
+        #pass embedded through linear layer to go through emb dim -> hid dim
+        conv_input = self.emb2hid(embedded)
+
+        #conv_input = [batch size, trg len, hid dim]
+
+        #permute for convolutional layer
+        conv_input = conv_input.permute(0, 2, 1)
+
+        #conv_input = [batch size, hid dim, trg len]
+
+        batch_size = conv_input.shape[0]
+        hid_dim = conv_input.shape[1]
+
+        for i, conv in enumerate(self.convs):
+
+            #apply dropout
+            conv_input = self.dropout(conv_input)
+
+            #need to pad so decoder can't "cheat"
+            padding = torch.zeros(batch_size,
+                                  hid_dim,
+                                  self.kernel_size - 1).fill_(self.trg_pad_idx).to(device)
+
+            padded_conv_input = torch.cat((padding, conv_input), dim = 2)
+
+            #padded_conv_input = [batch size, hid dim, trg len + kernel size - 1]
+
+            #pass through convolutional layer
+            conved = conv(padded_conv_input)
+
+            #conved = [batch size, 2 * hid dim, trg len]
+
+            #pass through GLU activation function
+            conved = F.glu(conved, dim = 1)
+
+            #conved = [batch size, hid dim, trg len]
+
+            #calculate attention
+            attention, conved = self.calculate_attention(embedded,
+                                                         conved,
+                                                         encoder_conved,
+                                                         encoder_combined)
+
+            #attention = [batch size, trg len, src len]
+
+            #apply residual connection
+            conved = (conved + conv_input) * self.scale
+
+            #conved = [batch size, hid dim, trg len]
+
+            #set conv_input to conved for next loop iteration
+            conv_input = conved
+
+        conved = self.hid2emb(conved.permute(0, 2, 1))
+
+        #conved = [batch size, trg len, emb dim]
+
+        output = self.fc_out(self.dropout(conved))
+
+        #output = [batch size, trg len, output dim]
+
+        return output, attention
+
+class ConvSeq2Seq(nn.Module):
+    def __init__(self, vocab_size, emb_dim, hid_dim, enc_layers, dec_layers, enc_kernel_size, dec_kernel_size, enc_max_length, dec_max_length, dropout, pad_idx, device):
+        super().__init__()
+
+        enc = Encoder(emb_dim, hid_dim, enc_layers, enc_kernel_size, dropout, device, enc_max_length)
+        dec = Decoder(vocab_size, emb_dim, hid_dim, dec_layers, dec_kernel_size, dropout, pad_idx, device, dec_max_length)
+
+        self.encoder = enc
+        self.decoder = dec
+
+    def forward_encoder(self, src):
+        encoder_conved, encoder_combined = self.encoder(src)
+
+        return encoder_conved, encoder_combined
+
+    def forward_decoder(self, trg, memory):
+        encoder_conved, encoder_combined = memory
+        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
+
+        return output, (encoder_conved, encoder_combined)
+
+    def forward(self, src, trg):
+
+        #src = [batch size, src len]
+        #trg = [batch size, trg len - 1] (<eos> token sliced off the end)
+
+        #calculate z^u (encoder_conved) and (z^u + e) (encoder_combined)
+        #encoder_conved is output from final encoder conv. block
+        #encoder_combined is encoder_conved plus (elementwise) src embedding plus
+        #  positional embeddings
+        encoder_conved, encoder_combined = self.encoder(src)
+
+        #encoder_conved = [batch size, src len, emb dim]
+        #encoder_combined = [batch size, src len, emb dim]
+
+        #calculate predictions of next words
+        #output is a batch of predictions for each word in the trg sentence
+        #attention a batch of attention scores across the src sentence for
+        #  each word in the trg sentence
+        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
+
+        #output = [batch size, trg len - 1, output dim]
+        #attention = [batch size, trg len - 1, src len]
+
+        return output#, attention
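ConvSeq2Seq is not the head selected by vgg-seq2seq.yaml (seq_modeling: seq2seq), but a standalone shape check is straightforward. The hyperparameters below are illustrative, not taken from this commit; emb_dim must match the channel dimension of the CNN features.

import torch
from vietocr.model.seqmodel.convseq2seq import ConvSeq2Seq

# Illustrative sizes for a quick CPU forward pass.
model = ConvSeq2Seq(vocab_size=233, emb_dim=256, hid_dim=256,
                    enc_layers=10, dec_layers=10,
                    enc_kernel_size=3, dec_kernel_size=3,
                    enc_max_length=512, dec_max_length=512,
                    dropout=0.1, pad_idx=0, device='cpu')

src = torch.rand(64, 1, 256)              # (src_len, N, emb_dim) CNN features
trg = torch.randint(0, 233, (20, 1))      # (trg_len, N) target token ids
out = model(src, trg)
print(out.shape)                          # (N, trg_len, vocab_size) = (1, 20, 233)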
vietocr/model/seqmodel/seq2seq.py ADDED
@@ -0,0 +1,175 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+
+class Encoder(nn.Module):
+    def __init__(self, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
+        super().__init__()
+
+        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
+        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, src):
+        """
+        src: src_len x batch_size x img_channel
+        outputs: src_len x batch_size x hid_dim
+        hidden: batch_size x hid_dim
+        """
+
+        embedded = self.dropout(src)
+
+        outputs, hidden = self.rnn(embedded)
+
+        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
+
+        return outputs, hidden
+
+class Attention(nn.Module):
+    def __init__(self, enc_hid_dim, dec_hid_dim):
+        super().__init__()
+
+        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
+        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
+
+    def forward(self, hidden, encoder_outputs):
+        """
+        hidden: batch_size x hid_dim
+        encoder_outputs: src_len x batch_size x hid_dim,
+        outputs: batch_size x src_len
+        """
+
+        batch_size = encoder_outputs.shape[1]
+        src_len = encoder_outputs.shape[0]
+
+        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
+
+        encoder_outputs = encoder_outputs.permute(1, 0, 2)
+
+        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))
+
+        attention = self.v(energy).squeeze(2)
+
+        return F.softmax(attention, dim = 1)
+
+class Decoder(nn.Module):
+    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
+        super().__init__()
+
+        self.output_dim = output_dim
+        self.attention = attention
+
+        self.embedding = nn.Embedding(output_dim, emb_dim)
+        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
+        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input, hidden, encoder_outputs):
+        """
+        inputs: batch_size
+        hidden: batch_size x hid_dim
+        encoder_outputs: src_len x batch_size x hid_dim
+        """
+
+        input = input.unsqueeze(0)
+
+        embedded = self.dropout(self.embedding(input))
+
+        a = self.attention(hidden, encoder_outputs)
+
+        a = a.unsqueeze(1)
+
+        encoder_outputs = encoder_outputs.permute(1, 0, 2)
+
+        weighted = torch.bmm(a, encoder_outputs)
+
+        weighted = weighted.permute(1, 0, 2)
+
+        rnn_input = torch.cat((embedded, weighted), dim = 2)
+
+        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
+
+        assert (output == hidden).all()
+
+        embedded = embedded.squeeze(0)
+        output = output.squeeze(0)
+        weighted = weighted.squeeze(0)
+
+        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
+
+        return prediction, hidden.squeeze(0), a.squeeze(1)
+
+class Seq2Seq(nn.Module):
+    def __init__(self, vocab_size, encoder_hidden, decoder_hidden, img_channel, decoder_embedded, dropout=0.1):
+        super().__init__()
+
+        attn = Attention(encoder_hidden, decoder_hidden)
+
+        self.encoder = Encoder(img_channel, encoder_hidden, decoder_hidden, dropout)
+        self.decoder = Decoder(vocab_size, decoder_embedded, encoder_hidden, decoder_hidden, dropout, attn)
+
+    def forward_encoder(self, src):
+        """
+        src: timestep x batch_size x channel
+        hidden: batch_size x hid_dim
+        encoder_outputs: src_len x batch_size x hid_dim
+        """
+
+        encoder_outputs, hidden = self.encoder(src)
+
+        return (hidden, encoder_outputs)
+
+    def forward_decoder(self, tgt, memory):
+        """
+        tgt: timestep x batch_size
+        hidden: batch_size x hid_dim
+        encoder_outputs: src_len x batch_size x hid_dim
+        output: batch_size x 1 x vocab_size
+        """
+
+        tgt = tgt[-1]
+        hidden, encoder_outputs = memory
+        output, hidden, _ = self.decoder(tgt, hidden, encoder_outputs)
+        output = output.unsqueeze(1)
+
+        return output, (hidden, encoder_outputs)
+
+    def forward(self, src, trg):
+        """
+        src: time_step x batch_size
+        trg: time_step x batch_size
+        outputs: batch_size x time_step x vocab_size
+        """
+
+        batch_size = src.shape[1]
+        trg_len = trg.shape[0]
+        trg_vocab_size = self.decoder.output_dim
+        device = src.device
+
+        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(device)
+        encoder_outputs, hidden = self.encoder(src)
+
+        for t in range(trg_len):
+            input = trg[t]
+            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
+
+            outputs[t] = output
+
+        outputs = outputs.transpose(0, 1).contiguous()
+
+        return outputs
+
+    def expand_memory(self, memory, beam_size):
+        hidden, encoder_outputs = memory
+        hidden = hidden.repeat(beam_size, 1)
+        encoder_outputs = encoder_outputs.repeat(1, beam_size, 1)
+
+        return (hidden, encoder_outputs)
+
+    def get_memory(self, memory, i):
+        hidden, encoder_outputs = memory
+        hidden = hidden[[i]]
+        encoder_outputs = encoder_outputs[:, [i],:]
+
+        return (hidden, encoder_outputs)
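This is the head actually selected by vgg-seq2seq.yaml (seq_modeling: seq2seq). A minimal forward-pass sketch (not part of this commit) using the transformer: values from that config; vocab_size=233 is illustrative:

import torch
from vietocr.model.seqmodel.seq2seq import Seq2Seq

model = Seq2Seq(vocab_size=233, encoder_hidden=256, decoder_hidden=256,
                img_channel=256, decoder_embedded=256, dropout=0.1)

src = torch.rand(64, 1, 256)              # (time_step, N, img_channel) CNN features
trg = torch.randint(0, 233, (20, 1))      # (time_step, N) target token ids
out = model(src, trg)
print(out.shape)                          # (N, time_step, vocab_size) = (1, 20, 233)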
vietocr/model/seqmodel/transformer.py ADDED
@@ -0,0 +1,124 @@
+from einops import rearrange
+from torchvision import models
+import math
+import torch
+from torch import nn
+
+class LanguageTransformer(nn.Module):
+    def __init__(self, vocab_size,
+                 d_model, nhead,
+                 num_encoder_layers, num_decoder_layers,
+                 dim_feedforward, max_seq_length,
+                 pos_dropout, trans_dropout):
+        super().__init__()
+
+        self.d_model = d_model
+        self.embed_tgt = nn.Embedding(vocab_size, d_model)
+        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)
+        # self.learned_pos_enc = LearnedPositionalEncoding(d_model, pos_dropout, max_seq_length)
+
+        self.transformer = nn.Transformer(d_model, nhead,
+                                          num_encoder_layers, num_decoder_layers,
+                                          dim_feedforward, trans_dropout)
+
+        self.fc = nn.Linear(d_model, vocab_size)
+
+    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
+        """
+        Shape:
+            - src: (W, N, C)
+            - tgt: (T, N)
+            - src_key_padding_mask: (N, S)
+            - tgt_key_padding_mask: (N, T)
+            - memory_key_padding_mask: (N, S)
+            - output: (N, T, E)
+
+        """
+        tgt_mask = self.gen_nopeek_mask(tgt.shape[0]).to(src.device)
+
+        src = self.pos_enc(src*math.sqrt(self.d_model))
+        # src = self.learned_pos_enc(src*math.sqrt(self.d_model))
+
+        tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))
+
+        output = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask,
+                                  tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask)
+        # output = rearrange(output, 't n e -> n t e')
+        output = output.transpose(0, 1)
+        return self.fc(output)
+
+    def gen_nopeek_mask(self, length):
+        mask = (torch.triu(torch.ones(length, length)) == 1).transpose(0, 1)
+        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+
+        return mask
+
+    def forward_encoder(self, src):
+        src = self.pos_enc(src*math.sqrt(self.d_model))
+        memory = self.transformer.encoder(src)
+        return memory
+
+    def forward_decoder(self, tgt, memory):
+        tgt_mask = self.gen_nopeek_mask(tgt.shape[0]).to(tgt.device)
+        tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))
+
+        output = self.transformer.decoder(tgt, memory, tgt_mask=tgt_mask)
+        # output = rearrange(output, 't n e -> n t e')
+        output = output.transpose(0, 1)
+
+        return self.fc(output), memory
+
+    def expand_memory(self, memory, beam_size):
+        memory = memory.repeat(1, beam_size, 1)
+        return memory
+
+    def get_memory(self, memory, i):
+        memory = memory[:, [i], :]
+        return memory
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, max_len=100):
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        x = x + self.pe[:x.size(0), :]
+
+        return self.dropout(x)
+
+class LearnedPositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, max_len=100):
+        super(LearnedPositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        self.pos_embed = nn.Embedding(max_len, d_model)
+        self.layernorm = LayerNorm(d_model)
+
+    def forward(self, x):
+        seq_len = x.size(0)
+        pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
+        pos = pos.unsqueeze(-1).expand(x.size()[:2])
+        x = x + self.pos_embed(pos)
+        return self.dropout(self.layernorm(x))
+
+class LayerNorm(nn.Module):
+    "A layernorm module in the TF style (epsilon inside the square root)."
+    def __init__(self, d_model, variance_epsilon=1e-12):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.ones(d_model))
+        self.beta = nn.Parameter(torch.zeros(d_model))
+        self.variance_epsilon = variance_epsilon
+
+    def forward(self, x):
+        u = x.mean(-1, keepdim=True)
+        s = (x - u).pow(2).mean(-1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        return self.gamma * x + self.beta
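LanguageTransformer is only used when seq_modeling: transformer. The sketch below (illustrative hyperparameters, not taken from this commit) just exercises the documented (W, N, C) / (T, N) interface on CPU:

import torch
from vietocr.model.seqmodel.transformer import LanguageTransformer

model = LanguageTransformer(vocab_size=233, d_model=256, nhead=8,
                            num_encoder_layers=6, num_decoder_layers=6,
                            dim_feedforward=2048, max_seq_length=1024,
                            pos_dropout=0.1, trans_dropout=0.1)

src = torch.rand(64, 1, 256)              # (W, N, C) CNN features
tgt = torch.randint(0, 233, (20, 1))      # (T, N) token ids
out = model(src, tgt)
print(out.shape)                          # (N, T, vocab_size) = (1, 20, 233)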
vietocr/model/transformerocr.py ADDED
@@ -0,0 +1,44 @@
+from vietocr.model.backbone.cnn import CNN
+from vietocr.model.seqmodel.transformer import LanguageTransformer
+from vietocr.model.seqmodel.seq2seq import Seq2Seq
+from vietocr.model.seqmodel.convseq2seq import ConvSeq2Seq
+from torch import nn
+
+class VietOCR(nn.Module):
+    def __init__(self, vocab_size,
+                 backbone,
+                 cnn_args,
+                 transformer_args, seq_modeling='transformer'):
+
+        super(VietOCR, self).__init__()
+
+        self.cnn = CNN(backbone, **cnn_args)
+        self.seq_modeling = seq_modeling
+
+        if seq_modeling == 'transformer':
+            self.transformer = LanguageTransformer(vocab_size, **transformer_args)
+        elif seq_modeling == 'seq2seq':
+            self.transformer = Seq2Seq(vocab_size, **transformer_args)
+        elif seq_modeling == 'convseq2seq':
+            self.transformer = ConvSeq2Seq(vocab_size, **transformer_args)
+        else:
+            raise ValueError('Unsupported seq_modeling: {}'.format(seq_modeling))
+
+    def forward(self, img, tgt_input, tgt_key_padding_mask):
+        """
+        Shape:
+            - img: (N, C, H, W)
+            - tgt_input: (T, N)
+            - tgt_key_padding_mask: (N, T)
+            - output: b t v
+        """
+        src = self.cnn(img)
+
+        if self.seq_modeling == 'transformer':
+            outputs = self.transformer(src, tgt_input, tgt_key_padding_mask=tgt_key_padding_mask)
+        elif self.seq_modeling == 'seq2seq':
+            outputs = self.transformer(src, tgt_input)
+        elif self.seq_modeling == 'convseq2seq':
+            outputs = self.transformer(src, tgt_input)
+        return outputs
+
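Wiring the pieces together by hand, roughly as the trainer presumably does from the config: the backbone/cnn/transformer values below mirror vgg-seq2seq.yaml, and vocab_size=233 is an illustrative stand-in for len(vocab string) + 4 special tokens (a sketch, not part of this commit).

import torch
from vietocr.model.transformerocr import VietOCR

cnn_args = {'ss': [[2, 2], [2, 2], [2, 1], [2, 1], [1, 1]],
            'ks': [[2, 2], [2, 2], [2, 1], [2, 1], [1, 1]],
            'hidden': 256}
seq2seq_args = {'encoder_hidden': 256, 'decoder_hidden': 256,
                'img_channel': 256, 'decoder_embedded': 256, 'dropout': 0.1}

model = VietOCR(vocab_size=233, backbone='vgg19_bn',
                cnn_args=cnn_args, transformer_args=seq2seq_args,
                seq_modeling='seq2seq')

img = torch.rand(1, 3, 32, 128)             # (N, C, H, W)
tgt_input = torch.randint(0, 233, (20, 1))  # (T, N)
out = model(img, tgt_input, tgt_key_padding_mask=None)
print(out.shape)                            # (1, 20, 233)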
vietocr/model/vocab.py ADDED
@@ -0,0 +1,36 @@
+class Vocab():
+    def __init__(self, chars):
+        self.pad = 0
+        self.go = 1
+        self.eos = 2
+        self.mask_token = 3
+
+        self.chars = chars
+
+        self.c2i = {c:i+4 for i, c in enumerate(chars)}
+
+        self.i2c = {i+4:c for i, c in enumerate(chars)}
+
+        self.i2c[0] = '<pad>'
+        self.i2c[1] = '<sos>'
+        self.i2c[2] = '<eos>'
+        self.i2c[3] = '*'
+
+    def encode(self, chars):
+        return [self.go] + [self.c2i[c] for c in chars] + [self.eos]
+
+    def decode(self, ids):
+        first = 1 if self.go in ids else 0
+        last = ids.index(self.eos) if self.eos in ids else None
+        sent = ''.join([self.i2c[i] for i in ids[first:last]])
+        return sent
+
+    def __len__(self):
+        return len(self.c2i) + 4
+
+    def batch_decode(self, arr):
+        texts = [self.decode(ids) for ids in arr]
+        return texts
+
+    def __str__(self):
+        return self.chars
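A quick round trip through Vocab with a toy charset (the real charset is the vocab string in vgg-seq2seq.yaml):

from vietocr.model.vocab import Vocab

vocab = Vocab('abc')         # toy charset; indices 0-3 are <pad>, <sos>, <eos>, mask
ids = vocab.encode('cab')
print(ids)                   # [1, 6, 4, 5, 2]
print(vocab.decode(ids))     # 'cab'
print(len(vocab))            # 7 = 3 chars + 4 special tokens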
vietocr/translate.py ADDED
@@ -0,0 +1,62 @@
+import torch
+import numpy as np
+import math
+from PIL import Image
+from torch.nn.functional import softmax
+
+def translate(img, model, max_seq_length=128, sos_token=1, eos_token=2):
+    "data: BxCxHxW"
+    model.eval()
+
+    with torch.no_grad():
+        src = model.cnn(img)
+        memory = model.transformer.forward_encoder(src)
+
+        translated_sentence = [[sos_token]*len(img)]
+
+        max_length = 0
+
+        while max_length <= max_seq_length and not all(np.any(np.asarray(translated_sentence).T==eos_token, axis=1)):
+            tgt_inp = torch.LongTensor(translated_sentence)
+
+            output, memory = model.transformer.forward_decoder(tgt_inp, memory)
+            output = softmax(output, dim=-1)
+
+            _, indices = torch.topk(output, 5)
+
+            indices = indices[:, -1, 0]
+            indices = indices.tolist()
+
+            translated_sentence.append(indices)
+            max_length += 1
+
+        translated_sentence = np.asarray(translated_sentence).T
+
+    return translated_sentence
+
+def resize(w, h, expected_height, image_min_width, image_max_width):
+    new_w = int(expected_height * float(w) / float(h))
+    round_to = 10
+    new_w = math.ceil(new_w/round_to)*round_to
+    new_w = max(new_w, image_min_width)
+    new_w = min(new_w, image_max_width)
+
+    return new_w, expected_height
+
+def process_image(image, image_height, image_min_width, image_max_width):
+    img = image.convert('RGB')
+
+    w, h = img.size
+    new_w, image_height = resize(w, h, image_height, image_min_width, image_max_width)
+
+    img = img.resize((new_w, image_height), Image.Resampling.LANCZOS)
+
+    img = np.asarray(img).transpose(2, 0, 1)
+    img = img/255
+    return img
+
+def process_input(image, image_height, image_min_width, image_max_width):
+    img = process_image(image, image_height, image_min_width, image_max_width)
+    img = img[np.newaxis, ...]
+    img = torch.FloatTensor(img)
+    return img
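Putting the commit's pieces into a minimal CPU prediction loop; this is a sketch under assumptions: the exported weight file (weights/train_model.pth, the export path in the trainer config) is a plain state_dict matching this architecture, and 'line.png' is a placeholder image path.

import torch
import yaml
from PIL import Image

from vietocr.model.transformerocr import VietOCR
from vietocr.model.vocab import Vocab
from vietocr.translate import process_input, translate

# Build vocab and model from the YAML in this commit.
config = yaml.safe_load(open('vgg-seq2seq.yaml'))
vocab = Vocab(config['vocab'])

model = VietOCR(vocab_size=len(vocab),
                backbone=config['backbone'],
                cnn_args=config['cnn'],
                transformer_args=config['transformer'],
                seq_modeling=config['seq_modeling'])
model.load_state_dict(torch.load('weights/train_model.pth', map_location='cpu'))

img = process_input(Image.open('line.png'), image_height=32,
                    image_min_width=32, image_max_width=512)
ids = translate(img, model)[0].tolist()
print(vocab.decode(ids))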