Upload 28 files
Browse files- train_old.pth +3 -0
- vgg-seq2seq.yaml +90 -0
- vietocr/__init__.py +0 -0
- vietocr/model/__init__.py +0 -0
- vietocr/model/__pycache__/__init__.cpython-311.pyc +0 -0
- vietocr/model/__pycache__/beam.cpython-311.pyc +0 -0
- vietocr/model/__pycache__/trainer.cpython-311.pyc +0 -0
- vietocr/model/__pycache__/transformerocr.cpython-311.pyc +0 -0
- vietocr/model/__pycache__/vocab.cpython-311.pyc +0 -0
- vietocr/model/backbone/__init__.py +0 -0
- vietocr/model/backbone/__pycache__/__init__.cpython-311.pyc +0 -0
- vietocr/model/backbone/__pycache__/cnn.cpython-311.pyc +0 -0
- vietocr/model/backbone/__pycache__/resnet.cpython-311.pyc +0 -0
- vietocr/model/backbone/__pycache__/vgg.cpython-311.pyc +0 -0
- vietocr/model/backbone/cnn.py +28 -0
- vietocr/model/backbone/resnet.py +140 -0
- vietocr/model/backbone/vgg.py +50 -0
- vietocr/model/seqmodel/__init__.py +0 -0
- vietocr/model/seqmodel/__pycache__/__init__.cpython-311.pyc +0 -0
- vietocr/model/seqmodel/__pycache__/convseq2seq.cpython-311.pyc +0 -0
- vietocr/model/seqmodel/__pycache__/seq2seq.cpython-311.pyc +0 -0
- vietocr/model/seqmodel/__pycache__/transformer.cpython-311.pyc +0 -0
- vietocr/model/seqmodel/convseq2seq.py +324 -0
- vietocr/model/seqmodel/seq2seq.py +175 -0
- vietocr/model/seqmodel/transformer.py +124 -0
- vietocr/model/transformerocr.py +44 -0
- vietocr/model/vocab.py +36 -0
- vietocr/translate.py +62 -0
@@ -0,0 +1,3 @@
1 |
version https://git-lfs.github.com/spec/v1
2 |
oid sha256:eed45bcd25593dca2576c20721a57a449a6b557faf40336bcd7690b2a82eb2e1
3 |
size 89572737
@@ -0,0 +1,90 @@
1 |
project: vietocr_new
2 |
name: Train
3 |
4 |
device: cuda:0
5 |
6 |
# change to the list of characters in your dataset, or use the default Vietnamese characters
7 |
vocab: 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~ '
8 |
9 |
seq_modeling: seq2seq
10 |
11 |
encoder_hidden: 256
12 |
decoder_hidden: 256
13 |
img_channel: 256
14 |
decoder_embedded: 256
15 |
dropout: 0.1
16 |
17 |
18 |
max_lr: 0.001
19 |
pct_start: 0.1
20 |
21 |
22 |
batch_size: 128
23 |
print_every: 100
24 |
valid_every: 500
25 |
test_every: 500
26 |
iters: 10000
27 |
# where to save our model for prediction
28 |
export: weights/train_model.pth
29 |
checkpoint: ./checkpoint/checkpoint_model.pth
30 |
log: ./train.log
31 |
# null to disable computing accuracy, or set to the number of samples to enable validation while training
32 |
metrics: 49228
33 |
test_metrics: 28918
34 |
pretrained: false
35 |
36 |
37 |
# path to image
38 |
data_root: /mnt/disk3/CGGANv2
39 |
# path to annotation
40 |
train_annotation: datasets/labels/train.txt
41 |
valid_annotation: datasets/labels/valid.txt
42 |
test_annotation: datasets/labels/test.txt
43 |
# path to lmdb datasets
44 |
train_lmdb: datasets/lmdb/train
45 |
valid_lmdb: datasets/lmdb/valid
46 |
test_lmdb: datasets/lmdb/test
47 |
48 |
# resize image to 32 height, larger height will increase accuracy
49 |
image_height: 32
50 |
image_min_width: 32
51 |
image_max_width: 512
52 |
53 |
54 |
num_workers: 12
55 |
pin_memory: true
56 |
57 |
58 |
image_aug: false
59 |
masked_language_model: false
60 |
61 |
62 |
# enable or disable beam search during prediction; using beam search is slower
63 |
beamsearch: false
64 |
65 |
quiet: false
66 |
67 |
# for train
68 |
pretrain: https://vocr.vn/data/vietocr/vgg_seq2seq.pth
69 |
70 |
# url or local path (for predict)
71 |
weights: https://vocr.vn/data/vietocr/vgg_seq2seq.pth
72 |
73 |
backbone: vgg19_bn
74 |
75 |
# pooling stride size
76 |
77 |
- [2, 2]
78 |
- [2, 2]
79 |
- [2, 1]
80 |
- [2, 1]
81 |
- [1, 1]
82 |
# pooling kernel size
83 |
84 |
- [2, 2]
85 |
- [2, 2]
86 |
- [2, 1]
87 |
- [2, 1]
88 |
- [1, 1]
89 |
# dim of output feature map
90 |
hidden: 256
File without changes
File without changes
Binary file (174 Bytes). View file
Binary file (6.02 kB). View file
Binary file (22.2 kB). View file
Binary file (2.44 kB). View file
Binary file (3.27 kB). View file
File without changes
Binary file (183 Bytes). View file
Binary file (2.18 kB). View file
Binary file (9.41 kB). View file
Binary file (3.06 kB). View file
@@ -0,0 +1,28 @@
1 |
import torch
2 |
from torch import nn
3 |
4 |
import vietocr.model.backbone.vgg as vgg
5 |
from vietocr.model.backbone.resnet import Resnet50
6 |
7 |
class CNN(nn.Module):
    """Thin wrapper that selects and owns the convolutional backbone.

    Args:
        backbone: one of ``'vgg11_bn'``, ``'vgg19_bn'`` or ``'resnet50'``.
        **kwargs: forwarded unchanged to the selected backbone factory.

    Raises:
        ValueError: if ``backbone`` is not one of the supported names.
    """

    def __init__(self, backbone, **kwargs):
        super(CNN, self).__init__()

        if backbone == 'vgg11_bn':
            self.model = vgg.vgg11_bn(**kwargs)
        elif backbone == 'vgg19_bn':
            self.model = vgg.vgg19_bn(**kwargs)
        elif backbone == 'resnet50':
            self.model = Resnet50(**kwargs)
        else:
            # Fail at construction time instead of leaving ``self.model``
            # unset and crashing later inside ``forward``.
            raise ValueError(
                "Unsupported backbone {!r}; expected 'vgg11_bn', 'vgg19_bn' "
                "or 'resnet50'".format(backbone)
            )

    def forward(self, x):
        """Run the wrapped backbone on ``x`` and return its output."""
        return self.model(x)

    def freeze(self):
        """Disable gradients for the parameters under ``self.model.features``.

        NOTE(review): the names yielded here are relative to ``features``, so
        the ``'last_conv_1x1'`` comparison presumably never matches and every
        parameter under ``features`` gets frozen -- confirm intent. This also
        requires the backbone to expose a ``features`` attribute.
        """
        for name, param in self.model.features.named_parameters():
            if name != 'last_conv_1x1':
                param.requires_grad = False

    def unfreeze(self):
        """Re-enable gradients for all parameters under ``self.model.features``."""
        for param in self.model.features.parameters():
            param.requires_grad = True
@@ -0,0 +1,140 @@
1 |
import torch
2 |
from torch import nn
3 |
4 |
class BasicBlock(nn.Module):
    """Two-convolution residual block (3x3 conv -> BN -> ReLU -> 3x3 conv -> BN).

    Args:
        inplanes: number of input channels.
        planes: number of output channels of both convolutions.
        stride: stride of the first convolution. It must match the stride of
            ``downsample`` so that the residual and main paths line up.
        downsample: optional module applied to the input to produce the
            residual when the channel count or spatial size changes.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # The stride is applied on the first convolution; previously it was
        # accepted but ignored, which broke the residual add whenever a
        # strided ``downsample`` was supplied.
        self.conv1 = self._conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = self._conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def _conv3x3(self, in_planes, out_planes, stride=1):
        "3x3 convolution with padding"
        return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                         padding=1, bias=False)

    def forward(self, x):
        """Return ``relu(main_path(x) + residual)``."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)

        return out
38 |
39 |
class ResNet(nn.Module):
    """ResNet-style feature extractor that emits a flattened feature sequence.

    Args:
        input_channel: number of channels of the input image tensor.
        output_channel: number of channels of the final feature map; the
            earlier stages use 1/16, 1/8, 1/4 and 1/2 of this value.
        block: residual block class exposing an ``expansion`` attribute and
            constructible as ``block(inplanes, planes, stride, downsample)``.
        layers: four integers, the number of blocks in each residual stage.
    """

    def __init__(self, input_channel, output_channel, block, layers):
        super(ResNet, self).__init__()

        self.output_channel_block = [int(output_channel / 4), int(output_channel / 2), output_channel, output_channel]

        # Stem: two 3x3 convolutions that keep the spatial size.
        self.inplanes = int(output_channel / 8)
        self.conv0_1 = nn.Conv2d(input_channel, int(output_channel / 16),
                                 kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0_1 = nn.BatchNorm2d(int(output_channel / 16))
        self.conv0_2 = nn.Conv2d(int(output_channel / 16), self.inplanes,
                                 kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0_2 = nn.BatchNorm2d(self.inplanes)
        self.relu = nn.ReLU(inplace=True)

        # Stage 1: halves both height and width.
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.layer1 = self._make_layer(block, self.output_channel_block[0], layers[0])
        self.conv1 = nn.Conv2d(self.output_channel_block[0], self.output_channel_block[
                               0], kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.output_channel_block[0])

        # Stage 2: halves both height and width again.
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.layer2 = self._make_layer(block, self.output_channel_block[1], layers[1], stride=1)
        self.conv2 = nn.Conv2d(self.output_channel_block[1], self.output_channel_block[
                               1], kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(self.output_channel_block[1])

        # Stage 3: stride (2, 1) halves the height only.
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=(2, 1), padding=(0, 1))
        self.layer3 = self._make_layer(block, self.output_channel_block[2], layers[2], stride=1)
        self.conv3 = nn.Conv2d(self.output_channel_block[2], self.output_channel_block[
                               2], kernel_size=3, stride=1, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.output_channel_block[2])

        # Stage 4: two 2x2 convolutions; the first one strides the height only.
        self.layer4 = self._make_layer(block, self.output_channel_block[3], layers[3], stride=1)
        self.conv4_1 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[
                                 3], kernel_size=2, stride=(2, 1), padding=(0, 1), bias=False)
        self.bn4_1 = nn.BatchNorm2d(self.output_channel_block[3])
        self.conv4_2 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[
                                 3], kernel_size=2, stride=1, padding=0, bias=False)
        self.bn4_2 = nn.BatchNorm2d(self.output_channel_block[3])

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of ``blocks`` blocks and update ``self.inplanes``."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection so the residual matches the main path's shape.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Map an image batch to a feature sequence.

        The final ``(N, C, H, W)`` feature map is transposed to
        ``(N, C, W, H)``, flattened to ``(N, C, W * H)`` and permuted to
        ``(W * H, N, C)``.
        """
        x = self.conv0_1(x)
        x = self.bn0_1(x)
        x = self.relu(x)
        x = self.conv0_2(x)
        x = self.bn0_2(x)
        x = self.relu(x)

        x = self.maxpool1(x)
        x = self.layer1(x)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.maxpool2(x)
        x = self.layer2(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.maxpool3(x)
        x = self.layer3(x)
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)

        x = self.layer4(x)
        x = self.conv4_1(x)
        x = self.bn4_1(x)
        x = self.relu(x)
        x = self.conv4_2(x)
        x = self.bn4_2(x)
        conv = self.relu(x)

        conv = conv.transpose(-1, -2)
        conv = conv.flatten(2)
        conv = conv.permute(-1, 0, 1)

        return conv
137 |
138 |
def Resnet50(ss, hidden):
    """Build the ResNet backbone for 3-channel input with ``hidden`` output channels.

    ``ss`` is accepted for signature parity with the other backbone factories
    and is not used here.
    """
    stage_depths = [1, 2, 5, 3]
    return ResNet(input_channel=3, output_channel=hidden,
                  block=BasicBlock, layers=stage_depths)
140 |
@@ -0,0 +1,50 @@
1 |
import torch
2 |
from torch import nn
3 |
from torchvision import models
4 |
from einops import rearrange
5 |
from torchvision.models._utils import IntermediateLayerGetter
6 |
7 |
8 |
class Vgg(nn.Module):
    """VGG (batch-norm) backbone adapted to produce a feature sequence.

    Every max-pooling layer of the torchvision model is replaced by an
    average-pooling layer whose kernel and stride come from ``ks`` / ``ss``,
    and a 1x1 convolution projects the 512-channel output to ``hidden``.

    Args:
        name: ``'vgg11_bn'`` or ``'vgg19_bn'``.
        ss: per-pooling-layer strides, indexed in the order the pooling
            layers appear in the network.
        ks: per-pooling-layer kernel sizes, indexed the same way.
        hidden: number of output channels of the final 1x1 convolution.
        pretrained: load torchvision's default weights when true, otherwise
            start from random initialisation.
        dropout: dropout probability applied before the 1x1 convolution.

    Raises:
        ValueError: if ``name`` is not a supported architecture.
    """

    def __init__(self, name, ss, ks, hidden, pretrained=True, dropout=0.5):
        super(Vgg, self).__init__()

        # Honour ``pretrained``: previously the default weights were always
        # loaded, regardless of the flag.
        weights = 'DEFAULT' if pretrained else None

        if name == 'vgg11_bn':
            cnn = models.vgg11_bn(weights=weights)
        elif name == 'vgg19_bn':
            cnn = models.vgg19_bn(weights=weights)
        else:
            # Previously an unknown name left ``cnn`` unbound.
            raise ValueError(
                "Unsupported VGG variant {!r}; expected 'vgg11_bn' or "
                "'vgg19_bn'".format(name)
            )

        pool_idx = 0

        for i, layer in enumerate(cnn.features):
            if isinstance(layer, torch.nn.MaxPool2d):
                cnn.features[i] = torch.nn.AvgPool2d(kernel_size=ks[pool_idx], stride=ss[pool_idx], padding=0)
                pool_idx += 1

        self.features = cnn.features
        self.dropout = nn.Dropout(dropout)
        self.last_conv_1x1 = nn.Conv2d(512, hidden, 1)

    def forward(self, x):
        """
        Shape:
            - x: (N, C, H, W)
            - output: (W' * H', N, hidden), where (H', W') is the size of the
              pooled feature map; this is (W', N, hidden) when H' is 1
        """
        conv = self.features(x)
        conv = self.dropout(conv)
        conv = self.last_conv_1x1(conv)

        # conv = rearrange(conv, 'b d h w -> b d (w h)')
        conv = conv.transpose(-1, -2)
        conv = conv.flatten(2)
        conv = conv.permute(-1, 0, 1)
        return conv
44 |
45 |
def vgg11_bn(ss, ks, hidden, pretrained=True, dropout=0.5):
    """Factory for the ``'vgg11_bn'`` variant of :class:`Vgg`."""
    return Vgg(name='vgg11_bn', ss=ss, ks=ks, hidden=hidden,
               pretrained=pretrained, dropout=dropout)
47 |
48 |
def vgg19_bn(ss, ks, hidden, pretrained=True, dropout=0.5):
    """Factory for the ``'vgg19_bn'`` variant of :class:`Vgg`."""
    return Vgg(name='vgg19_bn', ss=ss, ks=ks, hidden=hidden,
               pretrained=pretrained, dropout=dropout)
50 |
File without changes
Binary file (183 Bytes). View file
Binary file (10.7 kB). View file
Binary file (9.79 kB). View file
Binary file (10.2 kB). View file
@@ -0,0 +1,324 @@
1 |
import torch
2 |
import torch.nn as nn
3 |
import torch.optim as optim
4 |
import torch.nn.functional as F
5 |
6 |
class Encoder(nn.Module):
7 |
def __init__(self,
8 |
9 |
10 |
11 |
12 |
13 |
14 |
max_length = 512):
15 |
16 |
17 |
assert kernel_size % 2 == 1, "Kernel size must be odd!"
18 |
19 |
self.device = device
20 |
21 |
self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
22 |
23 |
# self.tok_embedding = nn.Embedding(input_dim, emb_dim)
24 |
self.pos_embedding = nn.Embedding(max_length, emb_dim)
25 |
26 |
self.emb2hid = nn.Linear(emb_dim, hid_dim)
27 |
self.hid2emb = nn.Linear(hid_dim, emb_dim)
28 |
29 |
self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
30 |
out_channels = 2 * hid_dim,
31 |
kernel_size = kernel_size,
32 |
padding = (kernel_size - 1) // 2)
33 |
for _ in range(n_layers)])
34 |
35 |
self.dropout = nn.Dropout(dropout)
36 |
37 |
def forward(self, src):
38 |
39 |
#src = [batch size, src len]
40 |
41 |
src = src.transpose(0, 1)
42 |
43 |
batch_size = src.shape[0]
44 |
src_len = src.shape[1]
45 |
device = src.device
46 |
47 |
#create position tensor
48 |
pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device)
49 |
50 |
#pos = [0, 1, 2, 3, ..., src len - 1]
51 |
52 |
#pos = [batch size, src len]
53 |
54 |
#embed tokens and positions
55 |
56 |
# tok_embedded = self.tok_embedding(src)
57 |
tok_embedded = src
58 |
59 |
pos_embedded = self.pos_embedding(pos)
60 |
61 |
#tok_embedded = pos_embedded = [batch size, src len, emb dim]
62 |
63 |
#combine embeddings by elementwise summing
64 |
embedded = self.dropout(tok_embedded + pos_embedded)
65 |
66 |
#embedded = [batch size, src len, emb dim]
67 |
68 |
#pass embedded through linear layer to convert from emb dim to hid dim
69 |
conv_input = self.emb2hid(embedded)
70 |
71 |
#conv_input = [batch size, src len, hid dim]
72 |
73 |
#permute for convolutional layer
74 |
conv_input = conv_input.permute(0, 2, 1)
75 |
76 |
#conv_input = [batch size, hid dim, src len]
77 |
78 |
#begin convolutional blocks...
79 |
80 |
for i, conv in enumerate(self.convs):
81 |
82 |
#pass through convolutional layer
83 |
conved = conv(self.dropout(conv_input))
84 |
85 |
#conved = [batch size, 2 * hid dim, src len]
86 |
87 |
#pass through GLU activation function
88 |
conved = F.glu(conved, dim = 1)
89 |
90 |
#conved = [batch size, hid dim, src len]
91 |
92 |
#apply residual connection
93 |
conved = (conved + conv_input) * self.scale
94 |
95 |
#conved = [batch size, hid dim, src len]
96 |
97 |
#set conv_input to conved for next loop iteration
98 |
conv_input = conved
99 |
100 |
#...end convolutional blocks
101 |
102 |
#permute and convert back to emb dim
103 |
conved = self.hid2emb(conved.permute(0, 2, 1))
104 |
105 |
#conved = [batch size, src len, emb dim]
106 |
107 |
#elementwise sum output (conved) and input (embedded) to be used for attention
108 |
combined = (conved + embedded) * self.scale
109 |
110 |
#combined = [batch size, src len, emb dim]
111 |
112 |
return conved, combined
113 |
114 |
class Decoder(nn.Module):
115 |
def __init__(self,
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
max_length = 512):
125 |
126 |
127 |
self.kernel_size = kernel_size
128 |
self.trg_pad_idx = trg_pad_idx
129 |
self.device = device
130 |
131 |
self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
132 |
133 |
self.tok_embedding = nn.Embedding(output_dim, emb_dim)
134 |
self.pos_embedding = nn.Embedding(max_length, emb_dim)
135 |
136 |
self.emb2hid = nn.Linear(emb_dim, hid_dim)
137 |
self.hid2emb = nn.Linear(hid_dim, emb_dim)
138 |
139 |
self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
140 |
self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
141 |
142 |
self.fc_out = nn.Linear(emb_dim, output_dim)
143 |
144 |
self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
145 |
out_channels = 2 * hid_dim,
146 |
kernel_size = kernel_size)
147 |
for _ in range(n_layers)])
148 |
149 |
self.dropout = nn.Dropout(dropout)
150 |
151 |
def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
152 |
153 |
#embedded = [batch size, trg len, emb dim]
154 |
#conved = [batch size, hid dim, trg len]
155 |
#encoder_conved = encoder_combined = [batch size, src len, emb dim]
156 |
157 |
#permute and convert back to emb dim
158 |
conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
159 |
160 |
#conved_emb = [batch size, trg len, emb dim]
161 |
162 |
combined = (conved_emb + embedded) * self.scale
163 |
164 |
#combined = [batch size, trg len, emb dim]
165 |
166 |
energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
167 |
168 |
#energy = [batch size, trg len, src len]
169 |
170 |
attention = F.softmax(energy, dim=2)
171 |
172 |
#attention = [batch size, trg len, src len]
173 |
174 |
attended_encoding = torch.matmul(attention, encoder_combined)
175 |
176 |
#attended_encoding = [batch size, trg len, emd dim]
177 |
178 |
#convert from emb dim -> hid dim
179 |
attended_encoding = self.attn_emb2hid(attended_encoding)
180 |
181 |
#attended_encoding = [batch size, trg len, hid dim]
182 |
183 |
#apply residual connection
184 |
attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
185 |
186 |
#attended_combined = [batch size, hid dim, trg len]
187 |
188 |
return attention, attended_combined
189 |
190 |
def forward(self, trg, encoder_conved, encoder_combined):
191 |
192 |
#trg = [batch size, trg len]
193 |
#encoder_conved = encoder_combined = [batch size, src len, emb dim]
194 |
trg = trg.transpose(0, 1)
195 |
196 |
batch_size = trg.shape[0]
197 |
trg_len = trg.shape[1]
198 |
device = trg.device
199 |
200 |
#create position tensor
201 |
pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(device)
202 |
203 |
#pos = [batch size, trg len]
204 |
205 |
#embed tokens and positions
206 |
tok_embedded = self.tok_embedding(trg)
207 |
pos_embedded = self.pos_embedding(pos)
208 |
209 |
#tok_embedded = [batch size, trg len, emb dim]
210 |
#pos_embedded = [batch size, trg len, emb dim]
211 |
212 |
#combine embeddings by elementwise summing
213 |
embedded = self.dropout(tok_embedded + pos_embedded)
214 |
215 |
#embedded = [batch size, trg len, emb dim]
216 |
217 |
#pass embedded through linear layer to go through emb dim -> hid dim
218 |
conv_input = self.emb2hid(embedded)
219 |
220 |
#conv_input = [batch size, trg len, hid dim]
221 |
222 |
#permute for convolutional layer
223 |
conv_input = conv_input.permute(0, 2, 1)
224 |
225 |
#conv_input = [batch size, hid dim, trg len]
226 |
227 |
batch_size = conv_input.shape[0]
228 |
hid_dim = conv_input.shape[1]
229 |
230 |
for i, conv in enumerate(self.convs):
231 |
232 |
#apply dropout
233 |
conv_input = self.dropout(conv_input)
234 |
235 |
#need to pad so decoder can't "cheat"
236 |
padding = torch.zeros(batch_size,
237 |
238 |
self.kernel_size - 1).fill_(self.trg_pad_idx).to(device)
239 |
240 |
padded_conv_input = torch.cat((padding, conv_input), dim = 2)
241 |
242 |
#padded_conv_input = [batch size, hid dim, trg len + kernel size - 1]
243 |
244 |
#pass through convolutional layer
245 |
conved = conv(padded_conv_input)
246 |
247 |
#conved = [batch size, 2 * hid dim, trg len]
248 |
249 |
#pass through GLU activation function
250 |
conved = F.glu(conved, dim = 1)
251 |
252 |
#conved = [batch size, hid dim, trg len]
253 |
254 |
#calculate attention
255 |
attention, conved = self.calculate_attention(embedded,
256 |
257 |
258 |
259 |
260 |
#attention = [batch size, trg len, src len]
261 |
262 |
#apply residual connection
263 |
conved = (conved + conv_input) * self.scale
264 |
265 |
#conved = [batch size, hid dim, trg len]
266 |
267 |
#set conv_input to conved for next loop iteration
268 |
conv_input = conved
269 |
270 |
conved = self.hid2emb(conved.permute(0, 2, 1))
271 |
272 |
#conved = [batch size, trg len, emb dim]
273 |
274 |
output = self.fc_out(self.dropout(conved))
275 |
276 |
#output = [batch size, trg len, output dim]
277 |
278 |
return output, attention
279 |
280 |
class ConvSeq2Seq(nn.Module):
281 |
def __init__(self, vocab_size, emb_dim, hid_dim, enc_layers, dec_layers, enc_kernel_size, dec_kernel_size, enc_max_length, dec_max_length, dropout, pad_idx, device):
282 |
283 |
284 |
enc = Encoder(emb_dim, hid_dim, enc_layers, enc_kernel_size, dropout, device, enc_max_length)
285 |
dec = Decoder(vocab_size, emb_dim, hid_dim, dec_layers, dec_kernel_size, dropout, pad_idx, device, dec_max_length)
286 |
287 |
self.encoder = enc
288 |
self.decoder = dec
289 |
290 |
def forward_encoder(self, src):
291 |
encoder_conved, encoder_combined = self.encoder(src)
292 |
293 |
return encoder_conved, encoder_combined
294 |
295 |
def forward_decoder(self, trg, memory):
296 |
encoder_conved, encoder_combined = memory
297 |
output, attention = self.decoder(trg, encoder_conved, encoder_combined)
298 |
299 |
return output, (encoder_conved, encoder_combined)
300 |
301 |
def forward(self, src, trg):
302 |
303 |
#src = [batch size, src len]
304 |
#trg = [batch size, trg len - 1] (<eos> token sliced off the end)
305 |
306 |
#calculate z^u (encoder_conved) and (z^u + e) (encoder_combined)
307 |
#encoder_conved is output from final encoder conv. block
308 |
#encoder_combined is encoder_conved plus (elementwise) src embedding plus
309 |
# positional embeddings
310 |
encoder_conved, encoder_combined = self.encoder(src)
311 |
312 |
#encoder_conved = [batch size, src len, emb dim]
313 |
#encoder_combined = [batch size, src len, emb dim]
314 |
315 |
#calculate predictions of next words
316 |
#output is a batch of predictions for each word in the trg sentence
317 |
#attention a batch of attention scores across the src sentence for
318 |
# each word in the trg sentence
319 |
output, attention = self.decoder(trg, encoder_conved, encoder_combined)
320 |
321 |
#output = [batch size, trg len - 1, output dim]
322 |
#attention = [batch size, trg len - 1, src len]
323 |
324 |
return output#, attention
@@ -0,0 +1,175 @@
1 |
import torch
2 |
import torch.nn as nn
3 |
import torch.optim as optim
4 |
import torch.nn.functional as F
5 |
6 |
class Encoder(nn.Module):
    """Bidirectional GRU encoder over the feature sequence.

    Args:
        emb_dim: size of each input feature vector.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: size of the initial decoder hidden state.
        dropout: dropout probability applied to the input sequence.
    """

    def __init__(self, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        # nn.Module must be initialised before submodules are assigned.
        super().__init__()

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        src: src_len x batch_size x img_channel
        outputs: src_len x batch_size x (2 * enc_hid_dim)
        hidden: batch_size x dec_hid_dim
        """
        embedded = self.dropout(src)

        outputs, hidden = self.rnn(embedded)

        # Join the last two slices of the GRU's final hidden state (one per
        # direction for this single-layer bidirectional GRU) and project them
        # to the decoder's hidden size.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))

        return outputs, hidden
28 |
29 |
class Attention(nn.Module):
    """Additive attention scoring the decoder state against encoder outputs.

    Args:
        enc_hid_dim: hidden size of each encoder direction (encoder outputs
            have ``2 * enc_hid_dim`` features).
        dec_hid_dim: size of the decoder hidden state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        # nn.Module must be initialised before submodules are assigned.
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: batch_size x dec_hid_dim
        encoder_outputs: src_len x batch_size x (2 * enc_hid_dim)
        outputs: batch_size x src_len (each row sums to 1)
        """
        src_len = encoder_outputs.shape[0]

        # Repeat the decoder state once per source position.
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # Switch to batch-first so both tensors are batch x src_len x features.
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))

        attention = self.v(energy).squeeze(2)

        return F.softmax(attention, dim=1)
55 |
56 |
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        output_dim: vocabulary size (number of output classes).
        emb_dim: size of the token embedding.
        enc_hid_dim: hidden size of each encoder direction.
        dec_hid_dim: hidden size of the decoder GRU.
        dropout: dropout probability applied to the token embedding.
        attention: callable ``attention(hidden, encoder_outputs)`` returning
            ``batch_size x src_len`` weights.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        # nn.Module must be initialised before submodules are assigned.
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """
        inputs: batch_size
        hidden: batch_size x dec_hid_dim
        encoder_outputs: src_len x batch_size x (2 * enc_hid_dim)

        Returns ``(prediction, hidden, attention)`` with shapes
        ``batch_size x output_dim``, ``batch_size x dec_hid_dim`` and
        ``batch_size x src_len``.
        """
        # Add a length-1 time axis so the GRU sees a one-step sequence.
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        a = self.attention(hidden, encoder_outputs)

        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # Attention-weighted sum of the encoder outputs: batch x 1 x features.
        weighted = torch.bmm(a, encoder_outputs)

        weighted = weighted.permute(1, 0, 2)

        rnn_input = torch.cat((embedded, weighted), dim=2)

        # For a one-step, single-layer, unidirectional GRU ``output`` and
        # ``hidden`` hold the same values; the former per-step debugging
        # assert on that equality was dropped (it compared full tensors on
        # every decode step and tripped on NaN values).
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))

        return prediction, hidden.squeeze(0), a.squeeze(1)
102 |
103 |
class Seq2Seq(nn.Module):
    """Attention-based GRU encoder/decoder over a feature sequence.

    Args:
        vocab_size: number of output classes.
        encoder_hidden: hidden size of each encoder direction.
        decoder_hidden: hidden size of the decoder.
        img_channel: size of each input feature vector.
        decoder_embedded: size of the decoder token embedding.
        dropout: dropout probability used by encoder and decoder.
    """

    def __init__(self, vocab_size, encoder_hidden, decoder_hidden, img_channel, decoder_embedded, dropout=0.1):
        # nn.Module must be initialised before submodules are assigned.
        super().__init__()

        attn = Attention(encoder_hidden, decoder_hidden)

        self.encoder = Encoder(img_channel, encoder_hidden, decoder_hidden, dropout)
        self.decoder = Decoder(vocab_size, decoder_embedded, encoder_hidden, decoder_hidden, dropout, attn)

    def forward_encoder(self, src):
        """
        src: timestep x batch_size x channel
        hidden: batch_size x hid_dim
        encoder_outputs: src_len x batch_size x hid_dim
        """
        encoder_outputs, hidden = self.encoder(src)

        return (hidden, encoder_outputs)

    def forward_decoder(self, tgt, memory):
        """
        tgt: timestep x batch_size
        hidden: batch_size x hid_dim
        encoder_outputs: src_len x batch_size x hid_dim
        output: batch_size x 1 x vocab_size
        """
        # Only the most recent token is fed; earlier context is carried by
        # the hidden state stored in ``memory``.
        tgt = tgt[-1]
        hidden, encoder_outputs = memory
        output, hidden, _ = self.decoder(tgt, hidden, encoder_outputs)
        output = output.unsqueeze(1)

        return output, (hidden, encoder_outputs)

    def forward(self, src, trg):
        """
        src: time_step x batch_size x channel
        trg: time_step x batch_size
        outputs: batch_size x time_step x vocab_size
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        device = src.device

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(device)
        encoder_outputs, hidden = self.encoder(src)

        # Teacher forcing: the ground-truth token at step t is always the
        # decoder input for step t.
        for t in range(trg_len):
            input = trg[t]
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)

            outputs[t] = output

        outputs = outputs.transpose(0, 1).contiguous()

        return outputs

    def expand_memory(self, memory, beam_size):
        """Tile ``memory`` along the batch axis ``beam_size`` times."""
        hidden, encoder_outputs = memory
        hidden = hidden.repeat(beam_size, 1)
        encoder_outputs = encoder_outputs.repeat(1, beam_size, 1)

        return (hidden, encoder_outputs)

    def get_memory(self, memory, i):
        """Select batch element ``i`` from ``memory``, keeping the batch axis."""
        hidden, encoder_outputs = memory
        hidden = hidden[[i]]
        encoder_outputs = encoder_outputs[:, [i], :]

        return (hidden, encoder_outputs)
@@ -0,0 +1,124 @@
1 |
from einops import rearrange
2 |
from torchvision import models
3 |
import math
4 |
import torch
5 |
from torch import nn
6 |
7 |
class LanguageTransformer(nn.Module):
    """Transformer encoder/decoder mapping a feature sequence to token logits.

    Args:
        vocab_size: number of output classes.
        d_model: model (embedding) dimension.
        nhead: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        dim_feedforward: hidden size of the feed-forward sublayers.
        max_seq_length: maximum sequence length for the positional encoding.
        pos_dropout: dropout probability of the positional encoding.
        trans_dropout: dropout probability inside the transformer.
    """

    def __init__(self, vocab_size,
                 d_model, nhead,
                 num_encoder_layers, num_decoder_layers,
                 dim_feedforward, max_seq_length,
                 pos_dropout, trans_dropout):
        # nn.Module must be initialised before submodules are assigned.
        super().__init__()

        self.d_model = d_model
        self.embed_tgt = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)
        # self.learned_pos_enc = LearnedPositionalEncoding(d_model, pos_dropout, max_seq_length)

        self.transformer = nn.Transformer(d_model, nhead,
                                          num_encoder_layers, num_decoder_layers,
                                          dim_feedforward, trans_dropout)

        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """
        Shape:
            - src: (W, N, C)
            - tgt: (T, N)
            - src_key_padding_mask: (N, S)
            - tgt_key_padding_mask: (N, T)
            - memory_key_padding_mask: (N, S)
            - output: (N, T, E)
        """
        tgt_mask = self.gen_nopeek_mask(tgt.shape[0]).to(src.device)

        src = self.pos_enc(src*math.sqrt(self.d_model))
        # src = self.learned_pos_enc(src*math.sqrt(self.d_model))

        tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))

        output = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask,
                                  tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask)
        # output = rearrange(output, 't n e -> n t e')
        output = output.transpose(0, 1)
        return self.fc(output)

    def gen_nopeek_mask(self, length):
        """Return a ``length x length`` mask: 0 on/below the diagonal, -inf above."""
        mask = (torch.triu(torch.ones(length, length)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

        return mask

    def forward_encoder(self, src):
        """Encode ``src`` and return the transformer encoder memory."""
        src = self.pos_enc(src*math.sqrt(self.d_model))
        memory = self.transformer.encoder(src)
        return memory

    def forward_decoder(self, tgt, memory):
        """Decode ``tgt`` against ``memory``; return ``(logits, memory)``."""
        tgt_mask = self.gen_nopeek_mask(tgt.shape[0]).to(tgt.device)
        tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))

        output = self.transformer.decoder(tgt, memory, tgt_mask=tgt_mask)
        # output = rearrange(output, 't n e -> n t e')
        output = output.transpose(0, 1)

        return self.fc(output), memory

    def expand_memory(self, memory, beam_size):
        """Tile ``memory`` along axis 1 ``beam_size`` times."""
        memory = memory.repeat(1, beam_size, 1)
        return memory

    def get_memory(self, memory, i):
        """Select index ``i`` along axis 1 of ``memory``, keeping that axis."""
        memory = memory[:, [i], :]
        return memory
78 |
79 |
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to a sequence, then applies dropout.

    The table is stored as the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        # Even feature indices carry the sine, odd ones the cosine.
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Insert a singleton axis at position 1 so the table broadcasts
        # across axis 1 of the input.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Add the first ``x.size(0)`` position rows to ``x`` and apply dropout."""
        seq_len = x.size(0)
        shifted = x + self.pe[:seq_len]
        return self.dropout(shifted)
96 |
97 |
class LearnedPositionalEncoding(nn.Module):
    """Adds a learned position embedding, then layer-normalises and applies dropout."""

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(LearnedPositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        self.pos_embed = nn.Embedding(max_len, d_model)
        self.layernorm = LayerNorm(d_model)

    def forward(self, x):
        """Return ``dropout(layernorm(x + pos_embed(positions)))``.

        Positions run ``0 .. x.size(0) - 1`` along axis 0 and are broadcast
        across axis 1.
        """
        length, width = x.size()[:2]
        indices = torch.arange(length, dtype=torch.long, device=x.device)
        indices = indices.unsqueeze(-1).expand(length, width)
        summed = x + self.pos_embed(indices)
        normed = self.layernorm(summed)
        return self.dropout(normed)
111 |
112 |
class LayerNorm(nn.Module):
    "A layernorm module in the TF style (epsilon inside the square root)."

    def __init__(self, d_model, variance_epsilon=1e-12):
        # nn.Module must be initialised before parameters are assigned.
        super().__init__()

        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        """Normalise ``x`` over its last axis, then scale by gamma and shift by beta."""
        u = x.mean(-1, keepdim=True)
        # Biased variance (mean of squared deviations) over the last axis.
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.gamma * x + self.beta
@@ -0,0 +1,44 @@
1 |
from vietocr.model.backbone.cnn import CNN
2 |
from vietocr.model.seqmodel.transformer import LanguageTransformer
3 |
from vietocr.model.seqmodel.seq2seq import Seq2Seq
4 |
from vietocr.model.seqmodel.convseq2seq import ConvSeq2Seq
5 |
from torch import nn
6 |
7 |
class VietOCR(nn.Module):
8 |
def __init__(self, vocab_size,
9 |
10 |
11 |
transformer_args, seq_modeling='transformer'):
12 |
13 |
super(VietOCR, self).__init__()
14 |
15 |
self.cnn = CNN(backbone, **cnn_args)
16 |
self.seq_modeling = seq_modeling
17 |
18 |
if seq_modeling == 'transformer':
19 |
self.transformer = LanguageTransformer(vocab_size, **transformer_args)
20 |
elif seq_modeling == 'seq2seq':
21 |
self.transformer = Seq2Seq(vocab_size, **transformer_args)
22 |
elif seq_modeling == 'convseq2seq':
23 |
self.transformer = ConvSeq2Seq(vocab_size, **transformer_args)
24 |
25 |
raise('Not Support Seq Model')
26 |
27 |
def forward(self, img, tgt_input, tgt_key_padding_mask):
28 |
29 |
30 |
- img: (N, C, H, W)
31 |
- tgt_input: (T, N)
32 |
- tgt_key_padding_mask: (N, T)
33 |
- output: b t v
34 |
35 |
src = self.cnn(img)
36 |
37 |
if self.seq_modeling == 'transformer':
38 |
outputs = self.transformer(src, tgt_input, tgt_key_padding_mask=tgt_key_padding_mask)
39 |
elif self.seq_modeling == 'seq2seq':
40 |
outputs = self.transformer(src, tgt_input)
41 |
elif self.seq_modeling == 'convseq2seq':
42 |
outputs = self.transformer(src, tgt_input)
43 |
return outputs
44 |
@@ -0,0 +1,36 @@
1 |
class Vocab():
    """Character vocabulary with four reserved ids.

    Ids 0-3 are reserved for padding, start-of-sequence, end-of-sequence and
    the mask token; characters from ``chars`` are numbered from 4 upwards in
    the order they appear.
    """

    def __init__(self, chars):
        self.pad, self.go, self.eos, self.mask_token = 0, 1, 2, 3

        self.chars = chars

        self.c2i = {}
        self.i2c = {}
        for index, char in enumerate(chars, start=4):
            self.c2i[char] = index
            self.i2c[index] = char

        # Printable names for the reserved ids.
        self.i2c.update({0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '*'})

    def encode(self, chars):
        """Return the ids of ``chars`` wrapped in the start and end ids."""
        body = [self.c2i[c] for c in chars]
        return [self.go, *body, self.eos]

    def decode(self, ids):
        """Turn a list of ids back into a string.

        The first id is skipped when the start id occurs anywhere in ``ids``,
        and decoding stops before the first end id if there is one.
        """
        start = 1 if self.go in ids else 0
        if self.eos in ids:
            stop = ids.index(self.eos)
        else:
            stop = None
        pieces = [self.i2c[i] for i in ids[start:stop]]
        return ''.join(pieces)

    def __len__(self):
        return 4 + len(self.c2i)

    def batch_decode(self, arr):
        """Decode every id list in ``arr``."""
        return list(map(self.decode, arr))

    def __str__(self):
        return self.chars
@@ -0,0 +1,62 @@
1 |
import torch
2 |
import numpy as np
3 |
import math
4 |
from PIL import Image
5 |
from torch.nn.functional import softmax
6 |
7 |
def translate(img, model, max_seq_length=128, sos_token=1, eos_token=2):
8 |
"data: BxCXHxW"
9 |
10 |
11 |
with torch.no_grad():
12 |
src = model.cnn(img)
13 |
memory = model.transformer.forward_encoder(src)
14 |
15 |
translated_sentence = [[sos_token]*len(img)]
16 |
17 |
max_length = 0
18 |
19 |
while max_length <= max_seq_length and not all(np.any(np.asarray(translated_sentence).T==eos_token, axis=1)):
20 |
tgt_inp = torch.LongTensor(translated_sentence)
21 |
22 |
output, memory = model.transformer.forward_decoder(tgt_inp, memory)
23 |
output = softmax(output, dim=-1)
24 |
25 |
_, indices = torch.topk(output, 5)
26 |
27 |
indices = indices[:, -1, 0]
28 |
indices = indices.tolist()
29 |
30 |
31 |
max_length += 1
32 |
33 |
translated_sentence = np.asarray(translated_sentence).T
34 |
35 |
return translated_sentence
36 |
37 |
def resize(w, h, expected_height, image_min_width, image_max_width):
    """Compute the target size that keeps the aspect ratio at ``expected_height``.

    The scaled width is truncated to an integer, rounded up to a multiple of
    10, then clamped to ``[image_min_width, image_max_width]``.

    Returns:
        ``(new_width, expected_height)``.
    """
    step = 10
    scaled = int(expected_height * float(w) / float(h))
    snapped = math.ceil(scaled / step) * step
    clamped = min(max(snapped, image_min_width), image_max_width)

    return clamped, expected_height
45 |
46 |
def process_image(image, image_height, image_min_width, image_max_width):
    """Convert an image to RGB, resize it, and return a channel-first array.

    The target size comes from ``resize``; the result is the resized image
    transposed to (channel, height, width) and divided by 255.
    """
    rgb = image.convert('RGB')

    width, height = rgb.size
    target_w, target_h = resize(width, height, image_height, image_min_width, image_max_width)

    resized = rgb.resize((target_w, target_h), Image.Resampling.LANCZOS)

    channel_first = np.asarray(resized).transpose(2, 0, 1)
    return channel_first / 255
57 |
58 |
def process_input(image, image_height, image_min_width, image_max_width):
    """Preprocess one image into a float tensor with a leading batch axis of 1."""
    array = process_image(image, image_height, image_min_width, image_max_width)
    batched = np.expand_dims(array, axis=0)
    return torch.FloatTensor(batched)