hantech committed on
Commit
2d5e190
•
1 Parent(s): 1787b5b

Delete vietocr/model

vietocr/model/__init__.py DELETED
File without changes
vietocr/model/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (174 Bytes)
 
vietocr/model/__pycache__/beam.cpython-311.pyc DELETED
Binary file (6.02 kB)
 
vietocr/model/__pycache__/trainer.cpython-311.pyc DELETED
Binary file (22.2 kB)
 
vietocr/model/__pycache__/transformerocr.cpython-311.pyc DELETED
Binary file (2.44 kB)
 
vietocr/model/__pycache__/vocab.cpython-311.pyc DELETED
Binary file (3.27 kB)
 
vietocr/model/backbone/__init__.py DELETED
File without changes
vietocr/model/backbone/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (183 Bytes)
 
vietocr/model/backbone/__pycache__/cnn.cpython-311.pyc DELETED
Binary file (2.18 kB)
 
vietocr/model/backbone/__pycache__/resnet.cpython-311.pyc DELETED
Binary file (9.41 kB)
 
vietocr/model/backbone/__pycache__/vgg.cpython-311.pyc DELETED
Binary file (3.06 kB)
 
vietocr/model/backbone/cnn.py DELETED
@@ -1,28 +0,0 @@
- import torch
- from torch import nn
-
- import vietocr.model.backbone.vgg as vgg
- from vietocr.model.backbone.resnet import Resnet50
-
- class CNN(nn.Module):
-     def __init__(self, backbone, **kwargs):
-         super(CNN, self).__init__()
-
-         if backbone == 'vgg11_bn':
-             self.model = vgg.vgg11_bn(**kwargs)
-         elif backbone == 'vgg19_bn':
-             self.model = vgg.vgg19_bn(**kwargs)
-         elif backbone == 'resnet50':
-             self.model = Resnet50(**kwargs)
-
-     def forward(self, x):
-         return self.model(x)
-
-     def freeze(self):
-         for name, param in self.model.features.named_parameters():
-             if name != 'last_conv_1x1':
-                 param.requires_grad = False
-
-     def unfreeze(self):
-         for param in self.model.features.parameters():
-             param.requires_grad = True
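For reference, a minimal usage sketch of the deleted CNN wrapper as it existed before this commit; the ss/ks/hidden keyword arguments are illustrative assumptions in the spirit of VietOCR's VGG configs, not values recorded in this diff.

# Hypothetical sketch (assumes the CNN class shown above, prior to its deletion).
import torch

cnn = CNN(
    'vgg19_bn',
    ss=[(2, 2), (2, 2), (2, 1), (2, 1), (1, 1)],  # assumed pooling strides, one per replaced MaxPool2d
    ks=[(2, 2), (2, 2), (2, 1), (2, 1), (1, 1)],  # assumed pooling kernel sizes
    hidden=256,                                   # channels of the final 1x1 conv
)
features = cnn(torch.randn(1, 3, 32, 128))  # image (N, C, H, W) -> sequence (seq_len, N, hidden)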
vietocr/model/backbone/resnet.py DELETED
@@ -1,140 +0,0 @@
- import torch
- from torch import nn
-
- class BasicBlock(nn.Module):
-     expansion = 1
-
-     def __init__(self, inplanes, planes, stride=1, downsample=None):
-         super(BasicBlock, self).__init__()
-         self.conv1 = self._conv3x3(inplanes, planes)
-         self.bn1 = nn.BatchNorm2d(planes)
-         self.conv2 = self._conv3x3(planes, planes)
-         self.bn2 = nn.BatchNorm2d(planes)
-         self.relu = nn.ReLU(inplace=True)
-         self.downsample = downsample
-         self.stride = stride
-
-     def _conv3x3(self, in_planes, out_planes, stride=1):
-         "3x3 convolution with padding"
-         return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
-                          padding=1, bias=False)
-
-     def forward(self, x):
-         residual = x
-
-         out = self.conv1(x)
-         out = self.bn1(out)
-         out = self.relu(out)
-
-         out = self.conv2(out)
-         out = self.bn2(out)
-
-         if self.downsample is not None:
-             residual = self.downsample(x)
-         out += residual
-         out = self.relu(out)
-
-         return out
-
- class ResNet(nn.Module):
-
-     def __init__(self, input_channel, output_channel, block, layers):
-         super(ResNet, self).__init__()
-
-         self.output_channel_block = [int(output_channel / 4), int(output_channel / 2), output_channel, output_channel]
-
-         self.inplanes = int(output_channel / 8)
-         self.conv0_1 = nn.Conv2d(input_channel, int(output_channel / 16),
-                                  kernel_size=3, stride=1, padding=1, bias=False)
-         self.bn0_1 = nn.BatchNorm2d(int(output_channel / 16))
-         self.conv0_2 = nn.Conv2d(int(output_channel / 16), self.inplanes,
-                                  kernel_size=3, stride=1, padding=1, bias=False)
-         self.bn0_2 = nn.BatchNorm2d(self.inplanes)
-         self.relu = nn.ReLU(inplace=True)
-
-         self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
-         self.layer1 = self._make_layer(block, self.output_channel_block[0], layers[0])
-         self.conv1 = nn.Conv2d(self.output_channel_block[0], self.output_channel_block[
-             0], kernel_size=3, stride=1, padding=1, bias=False)
-         self.bn1 = nn.BatchNorm2d(self.output_channel_block[0])
-
-         self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
-         self.layer2 = self._make_layer(block, self.output_channel_block[1], layers[1], stride=1)
-         self.conv2 = nn.Conv2d(self.output_channel_block[1], self.output_channel_block[
-             1], kernel_size=3, stride=1, padding=1, bias=False)
-         self.bn2 = nn.BatchNorm2d(self.output_channel_block[1])
-
-         self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=(2, 1), padding=(0, 1))
-         self.layer3 = self._make_layer(block, self.output_channel_block[2], layers[2], stride=1)
-         self.conv3 = nn.Conv2d(self.output_channel_block[2], self.output_channel_block[
-             2], kernel_size=3, stride=1, padding=1, bias=False)
-         self.bn3 = nn.BatchNorm2d(self.output_channel_block[2])
-
-         self.layer4 = self._make_layer(block, self.output_channel_block[3], layers[3], stride=1)
-         self.conv4_1 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[
-             3], kernel_size=2, stride=(2, 1), padding=(0, 1), bias=False)
-         self.bn4_1 = nn.BatchNorm2d(self.output_channel_block[3])
-         self.conv4_2 = nn.Conv2d(self.output_channel_block[3], self.output_channel_block[
-             3], kernel_size=2, stride=1, padding=0, bias=False)
-         self.bn4_2 = nn.BatchNorm2d(self.output_channel_block[3])
-
-     def _make_layer(self, block, planes, blocks, stride=1):
-         downsample = None
-         if stride != 1 or self.inplanes != planes * block.expansion:
-             downsample = nn.Sequential(
-                 nn.Conv2d(self.inplanes, planes * block.expansion,
-                           kernel_size=1, stride=stride, bias=False),
-                 nn.BatchNorm2d(planes * block.expansion),
-             )
-
-         layers = []
-         layers.append(block(self.inplanes, planes, stride, downsample))
-         self.inplanes = planes * block.expansion
-         for i in range(1, blocks):
-             layers.append(block(self.inplanes, planes))
-
-         return nn.Sequential(*layers)
-
-     def forward(self, x):
-         x = self.conv0_1(x)
-         x = self.bn0_1(x)
-         x = self.relu(x)
-         x = self.conv0_2(x)
-         x = self.bn0_2(x)
-         x = self.relu(x)
-
-         x = self.maxpool1(x)
-         x = self.layer1(x)
-         x = self.conv1(x)
-         x = self.bn1(x)
-         x = self.relu(x)
-
-         x = self.maxpool2(x)
-         x = self.layer2(x)
-         x = self.conv2(x)
-         x = self.bn2(x)
-         x = self.relu(x)
-
-         x = self.maxpool3(x)
-         x = self.layer3(x)
-         x = self.conv3(x)
-         x = self.bn3(x)
-         x = self.relu(x)
-
-         x = self.layer4(x)
-         x = self.conv4_1(x)
-         x = self.bn4_1(x)
-         x = self.relu(x)
-         x = self.conv4_2(x)
-         x = self.bn4_2(x)
-         conv = self.relu(x)
-
-         conv = conv.transpose(-1, -2)
-         conv = conv.flatten(2)
-         conv = conv.permute(-1, 0, 1)
-
-         return conv
-
- def Resnet50(ss, hidden):
-     return ResNet(3, hidden, BasicBlock, [1, 2, 5, 3])
-
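A small hedged sketch of the deleted Resnet50 factory, assuming only the definitions shown above; note that the ss argument is accepted but never used by this ResNet variant, and the output follows the same (seq_len, N, hidden) layout as the VGG backbone.

# Hypothetical sketch (assumes the ResNet/Resnet50 definitions above).
import torch

model = Resnet50(ss=None, hidden=256)  # ss is ignored by this implementation
img = torch.randn(2, 3, 32, 128)       # (N, C, H, W) text-line images
features = model(img)                  # (seq_len, N, hidden) column features for the sequence model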
vietocr/model/backbone/vgg.py DELETED
@@ -1,50 +0,0 @@
- import torch
- from torch import nn
- from torchvision import models
- from einops import rearrange
- from torchvision.models._utils import IntermediateLayerGetter
-
-
- class Vgg(nn.Module):
-     def __init__(self, name, ss, ks, hidden, pretrained=True, dropout=0.5):
-         super(Vgg, self).__init__()
-
-         if name == 'vgg11_bn':
-             cnn = models.vgg11_bn(weights='DEFAULT')
-         elif name == 'vgg19_bn':
-             cnn = models.vgg19_bn(weights='DEFAULT')
-
-         pool_idx = 0
-
-         for i, layer in enumerate(cnn.features):
-             if isinstance(layer, torch.nn.MaxPool2d):
-                 cnn.features[i] = torch.nn.AvgPool2d(kernel_size=ks[pool_idx], stride=ss[pool_idx], padding=0)
-                 pool_idx += 1
-
-         self.features = cnn.features
-         self.dropout = nn.Dropout(dropout)
-         self.last_conv_1x1 = nn.Conv2d(512, hidden, 1)
-
-     def forward(self, x):
-         """
-         Shape:
-             - x: (N, C, H, W)
-             - output: (W, N, C)
-         """
-
-         conv = self.features(x)
-         conv = self.dropout(conv)
-         conv = self.last_conv_1x1(conv)
-
-         # conv = rearrange(conv, 'b d h w -> b d (w h)')
-         conv = conv.transpose(-1, -2)
-         conv = conv.flatten(2)
-         conv = conv.permute(-1, 0, 1)
-         return conv
-
- def vgg11_bn(ss, ks, hidden, pretrained=True, dropout=0.5):
-     return Vgg('vgg11_bn', ss, ks, hidden, pretrained, dropout)
-
- def vgg19_bn(ss, ks, hidden, pretrained=True, dropout=0.5):
-     return Vgg('vgg19_bn', ss, ks, hidden, pretrained, dropout)
-
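A minimal sketch of the deleted vgg19_bn factory, assuming one (stride, kernel) pair per replaced MaxPool2d layer; the concrete values are illustrative assumptions, not taken from this commit.

# Hypothetical sketch (assumes the Vgg class and factories above; ss/ks values are assumptions).
import torch

backbone = vgg19_bn(
    ss=[(2, 2), (2, 2), (2, 1), (2, 1), (1, 1)],  # assumed per-pool strides
    ks=[(2, 2), (2, 2), (2, 1), (2, 1), (1, 1)],  # assumed per-pool kernel sizes
    hidden=256,
)
out = backbone(torch.randn(1, 3, 32, 475))  # (N, C, H, W) -> (seq_len, N, hidden)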
vietocr/model/seqmodel/__init__.py DELETED
File without changes
vietocr/model/seqmodel/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (183 Bytes)
 
vietocr/model/seqmodel/__pycache__/convseq2seq.cpython-311.pyc DELETED
Binary file (10.7 kB)
 
vietocr/model/seqmodel/__pycache__/seq2seq.cpython-311.pyc DELETED
Binary file (9.79 kB)
 
vietocr/model/seqmodel/__pycache__/transformer.cpython-311.pyc DELETED
Binary file (10.2 kB)
 
vietocr/model/seqmodel/convseq2seq.py DELETED
@@ -1,324 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.optim as optim
- import torch.nn.functional as F
-
- class Encoder(nn.Module):
-     def __init__(self,
-                  emb_dim,
-                  hid_dim,
-                  n_layers,
-                  kernel_size,
-                  dropout,
-                  device,
-                  max_length = 512):
-         super().__init__()
-
-         assert kernel_size % 2 == 1, "Kernel size must be odd!"
-
-         self.device = device
-
-         self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
-
-         # self.tok_embedding = nn.Embedding(input_dim, emb_dim)
-         self.pos_embedding = nn.Embedding(max_length, emb_dim)
-
-         self.emb2hid = nn.Linear(emb_dim, hid_dim)
-         self.hid2emb = nn.Linear(hid_dim, emb_dim)
-
-         self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
-                                               out_channels = 2 * hid_dim,
-                                               kernel_size = kernel_size,
-                                               padding = (kernel_size - 1) // 2)
-                                     for _ in range(n_layers)])
-
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, src):
-
-         #src = [batch size, src len]
-
-         src = src.transpose(0, 1)
-
-         batch_size = src.shape[0]
-         src_len = src.shape[1]
-         device = src.device
-
-         #create position tensor
-         pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device)
-
-         #pos = [0, 1, 2, 3, ..., src len - 1]
-
-         #pos = [batch size, src len]
-
-         #embed tokens and positions
-
-         # tok_embedded = self.tok_embedding(src)
-         tok_embedded = src
-
-         pos_embedded = self.pos_embedding(pos)
-
-         #tok_embedded = pos_embedded = [batch size, src len, emb dim]
-
-         #combine embeddings by elementwise summing
-         embedded = self.dropout(tok_embedded + pos_embedded)
-
-         #embedded = [batch size, src len, emb dim]
-
-         #pass embedded through linear layer to convert from emb dim to hid dim
-         conv_input = self.emb2hid(embedded)
-
-         #conv_input = [batch size, src len, hid dim]
-
-         #permute for convolutional layer
-         conv_input = conv_input.permute(0, 2, 1)
-
-         #conv_input = [batch size, hid dim, src len]
-
-         #begin convolutional blocks...
-
-         for i, conv in enumerate(self.convs):
-
-             #pass through convolutional layer
-             conved = conv(self.dropout(conv_input))
-
-             #conved = [batch size, 2 * hid dim, src len]
-
-             #pass through GLU activation function
-             conved = F.glu(conved, dim = 1)
-
-             #conved = [batch size, hid dim, src len]
-
-             #apply residual connection
-             conved = (conved + conv_input) * self.scale
-
-             #conved = [batch size, hid dim, src len]
-
-             #set conv_input to conved for next loop iteration
-             conv_input = conved
-
-         #...end convolutional blocks
-
-         #permute and convert back to emb dim
-         conved = self.hid2emb(conved.permute(0, 2, 1))
-
-         #conved = [batch size, src len, emb dim]
-
-         #elementwise sum output (conved) and input (embedded) to be used for attention
-         combined = (conved + embedded) * self.scale
-
-         #combined = [batch size, src len, emb dim]
-
-         return conved, combined
-
- class Decoder(nn.Module):
-     def __init__(self,
-                  output_dim,
-                  emb_dim,
-                  hid_dim,
-                  n_layers,
-                  kernel_size,
-                  dropout,
-                  trg_pad_idx,
-                  device,
-                  max_length = 512):
-         super().__init__()
-
-         self.kernel_size = kernel_size
-         self.trg_pad_idx = trg_pad_idx
-         self.device = device
-
-         self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
-
-         self.tok_embedding = nn.Embedding(output_dim, emb_dim)
-         self.pos_embedding = nn.Embedding(max_length, emb_dim)
-
-         self.emb2hid = nn.Linear(emb_dim, hid_dim)
-         self.hid2emb = nn.Linear(hid_dim, emb_dim)
-
-         self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
-         self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
-
-         self.fc_out = nn.Linear(emb_dim, output_dim)
-
-         self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
-                                               out_channels = 2 * hid_dim,
-                                               kernel_size = kernel_size)
-                                     for _ in range(n_layers)])
-
-         self.dropout = nn.Dropout(dropout)
-
-     def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
-
-         #embedded = [batch size, trg len, emb dim]
-         #conved = [batch size, hid dim, trg len]
-         #encoder_conved = encoder_combined = [batch size, src len, emb dim]
-
-         #permute and convert back to emb dim
-         conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
-
-         #conved_emb = [batch size, trg len, emb dim]
-
-         combined = (conved_emb + embedded) * self.scale
-
-         #combined = [batch size, trg len, emb dim]
-
-         energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
-
-         #energy = [batch size, trg len, src len]
-
-         attention = F.softmax(energy, dim=2)
-
-         #attention = [batch size, trg len, src len]
-
-         attended_encoding = torch.matmul(attention, encoder_combined)
-
-         #attended_encoding = [batch size, trg len, emd dim]
-
-         #convert from emb dim -> hid dim
-         attended_encoding = self.attn_emb2hid(attended_encoding)
-
-         #attended_encoding = [batch size, trg len, hid dim]
-
-         #apply residual connection
-         attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
-
-         #attended_combined = [batch size, hid dim, trg len]
-
-         return attention, attended_combined
-
-     def forward(self, trg, encoder_conved, encoder_combined):
-
-         #trg = [batch size, trg len]
-         #encoder_conved = encoder_combined = [batch size, src len, emb dim]
-         trg = trg.transpose(0, 1)
-
-         batch_size = trg.shape[0]
-         trg_len = trg.shape[1]
-         device = trg.device
-
-         #create position tensor
-         pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(device)
-
-         #pos = [batch size, trg len]
-
-         #embed tokens and positions
-         tok_embedded = self.tok_embedding(trg)
-         pos_embedded = self.pos_embedding(pos)
-
-         #tok_embedded = [batch size, trg len, emb dim]
-         #pos_embedded = [batch size, trg len, emb dim]
-
-         #combine embeddings by elementwise summing
-         embedded = self.dropout(tok_embedded + pos_embedded)
-
-         #embedded = [batch size, trg len, emb dim]
-
-         #pass embedded through linear layer to go through emb dim -> hid dim
-         conv_input = self.emb2hid(embedded)
-
-         #conv_input = [batch size, trg len, hid dim]
-
-         #permute for convolutional layer
-         conv_input = conv_input.permute(0, 2, 1)
-
-         #conv_input = [batch size, hid dim, trg len]
-
-         batch_size = conv_input.shape[0]
-         hid_dim = conv_input.shape[1]
-
-         for i, conv in enumerate(self.convs):
-
-             #apply dropout
-             conv_input = self.dropout(conv_input)
-
-             #need to pad so decoder can't "cheat"
-             padding = torch.zeros(batch_size,
-                                   hid_dim,
-                                   self.kernel_size - 1).fill_(self.trg_pad_idx).to(device)
-
-             padded_conv_input = torch.cat((padding, conv_input), dim = 2)
-
-             #padded_conv_input = [batch size, hid dim, trg len + kernel size - 1]
-
-             #pass through convolutional layer
-             conved = conv(padded_conv_input)
-
-             #conved = [batch size, 2 * hid dim, trg len]
-
-             #pass through GLU activation function
-             conved = F.glu(conved, dim = 1)
-
-             #conved = [batch size, hid dim, trg len]
-
-             #calculate attention
-             attention, conved = self.calculate_attention(embedded,
-                                                          conved,
-                                                          encoder_conved,
-                                                          encoder_combined)
-
-             #attention = [batch size, trg len, src len]
-
-             #apply residual connection
-             conved = (conved + conv_input) * self.scale
-
-             #conved = [batch size, hid dim, trg len]
-
-             #set conv_input to conved for next loop iteration
-             conv_input = conved
-
-         conved = self.hid2emb(conved.permute(0, 2, 1))
-
-         #conved = [batch size, trg len, emb dim]
-
-         output = self.fc_out(self.dropout(conved))
-
-         #output = [batch size, trg len, output dim]
-
-         return output, attention
-
- class ConvSeq2Seq(nn.Module):
-     def __init__(self, vocab_size, emb_dim, hid_dim, enc_layers, dec_layers, enc_kernel_size, dec_kernel_size, enc_max_length, dec_max_length, dropout, pad_idx, device):
-         super().__init__()
-
-         enc = Encoder(emb_dim, hid_dim, enc_layers, enc_kernel_size, dropout, device, enc_max_length)
-         dec = Decoder(vocab_size, emb_dim, hid_dim, dec_layers, dec_kernel_size, dropout, pad_idx, device, dec_max_length)
-
-         self.encoder = enc
-         self.decoder = dec
-
-     def forward_encoder(self, src):
-         encoder_conved, encoder_combined = self.encoder(src)
-
-         return encoder_conved, encoder_combined
-
-     def forward_decoder(self, trg, memory):
-         encoder_conved, encoder_combined = memory
-         output, attention = self.decoder(trg, encoder_conved, encoder_combined)
-
-         return output, (encoder_conved, encoder_combined)
-
-     def forward(self, src, trg):
-
-         #src = [batch size, src len]
-         #trg = [batch size, trg len - 1] (<eos> token sliced off the end)
-
-         #calculate z^u (encoder_conved) and (z^u + e) (encoder_combined)
-         #encoder_conved is output from final encoder conv. block
-         #encoder_combined is encoder_conved plus (elementwise) src embedding plus
-         #  positional embeddings
-         encoder_conved, encoder_combined = self.encoder(src)
-
-         #encoder_conved = [batch size, src len, emb dim]
-         #encoder_combined = [batch size, src len, emb dim]
-
-         #calculate predictions of next words
-         #output is a batch of predictions for each word in the trg sentence
-         #attention a batch of attention scores across the src sentence for
-         #  each word in the trg sentence
-         output, attention = self.decoder(trg, encoder_conved, encoder_combined)
-
-         #output = [batch size, trg len - 1, output dim]
-         #attention = [batch size, trg len - 1, src len]
-
-         return output#, attention
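A brief sketch, under assumed hyperparameters, of how the deleted ConvSeq2Seq model consumed the backbone's time-major features; every dimension below is illustrative, not taken from this commit.

# Hypothetical sketch (assumes the ConvSeq2Seq class above; all sizes are assumptions).
import torch

model = ConvSeq2Seq(vocab_size=233, emb_dim=256, hid_dim=256,
                    enc_layers=16, dec_layers=8,
                    enc_kernel_size=3, dec_kernel_size=3,
                    enc_max_length=512, dec_max_length=512,
                    dropout=0.1, pad_idx=0, device='cpu')
src = torch.randn(170, 1, 256)        # (src len, batch, emb dim) backbone features
trg = torch.randint(0, 233, (30, 1))  # (trg len, batch) target token ids
logits = model(src, trg)              # (batch, trg len, vocab_size)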
vietocr/model/seqmodel/seq2seq.py DELETED
@@ -1,175 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.optim as optim
- import torch.nn.functional as F
-
- class Encoder(nn.Module):
-     def __init__(self, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
-         super().__init__()
-
-         self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
-         self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, src):
-         """
-         src: src_len x batch_size x img_channel
-         outputs: src_len x batch_size x hid_dim
-         hidden: batch_size x hid_dim
-         """
-
-         embedded = self.dropout(src)
-
-         outputs, hidden = self.rnn(embedded)
-
-         hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
-
-         return outputs, hidden
-
- class Attention(nn.Module):
-     def __init__(self, enc_hid_dim, dec_hid_dim):
-         super().__init__()
-
-         self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
-         self.v = nn.Linear(dec_hid_dim, 1, bias = False)
-
-     def forward(self, hidden, encoder_outputs):
-         """
-         hidden: batch_size x hid_dim
-         encoder_outputs: src_len x batch_size x hid_dim,
-         outputs: batch_size x src_len
-         """
-
-         batch_size = encoder_outputs.shape[1]
-         src_len = encoder_outputs.shape[0]
-
-         hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
-
-         encoder_outputs = encoder_outputs.permute(1, 0, 2)
-
-         energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))
-
-         attention = self.v(energy).squeeze(2)
-
-         return F.softmax(attention, dim = 1)
-
- class Decoder(nn.Module):
-     def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
-         super().__init__()
-
-         self.output_dim = output_dim
-         self.attention = attention
-
-         self.embedding = nn.Embedding(output_dim, emb_dim)
-         self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
-         self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, input, hidden, encoder_outputs):
-         """
-         inputs: batch_size
-         hidden: batch_size x hid_dim
-         encoder_outputs: src_len x batch_size x hid_dim
-         """
-
-         input = input.unsqueeze(0)
-
-         embedded = self.dropout(self.embedding(input))
-
-         a = self.attention(hidden, encoder_outputs)
-
-         a = a.unsqueeze(1)
-
-         encoder_outputs = encoder_outputs.permute(1, 0, 2)
-
-         weighted = torch.bmm(a, encoder_outputs)
-
-         weighted = weighted.permute(1, 0, 2)
-
-         rnn_input = torch.cat((embedded, weighted), dim = 2)
-
-         output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
-
-         assert (output == hidden).all()
-
-         embedded = embedded.squeeze(0)
-         output = output.squeeze(0)
-         weighted = weighted.squeeze(0)
-
-         prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
-
-         return prediction, hidden.squeeze(0), a.squeeze(1)
-
- class Seq2Seq(nn.Module):
-     def __init__(self, vocab_size, encoder_hidden, decoder_hidden, img_channel, decoder_embedded, dropout=0.1):
-         super().__init__()
-
-         attn = Attention(encoder_hidden, decoder_hidden)
-
-         self.encoder = Encoder(img_channel, encoder_hidden, decoder_hidden, dropout)
-         self.decoder = Decoder(vocab_size, decoder_embedded, encoder_hidden, decoder_hidden, dropout, attn)
-
-     def forward_encoder(self, src):
-         """
-         src: timestep x batch_size x channel
-         hidden: batch_size x hid_dim
-         encoder_outputs: src_len x batch_size x hid_dim
-         """
-
-         encoder_outputs, hidden = self.encoder(src)
-
-         return (hidden, encoder_outputs)
-
-     def forward_decoder(self, tgt, memory):
-         """
-         tgt: timestep x batch_size
-         hidden: batch_size x hid_dim
-         encouder: src_len x batch_size x hid_dim
-         output: batch_size x 1 x vocab_size
-         """
-
-         tgt = tgt[-1]
-         hidden, encoder_outputs = memory
-         output, hidden, _ = self.decoder(tgt, hidden, encoder_outputs)
-         output = output.unsqueeze(1)
-
-         return output, (hidden, encoder_outputs)
-
-     def forward(self, src, trg):
-         """
-         src: time_step x batch_size
-         trg: time_step x batch_size
-         outputs: batch_size x time_step x vocab_size
-         """
-
-         batch_size = src.shape[1]
-         trg_len = trg.shape[0]
-         trg_vocab_size = self.decoder.output_dim
-         device = src.device
-
-         outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(device)
-         encoder_outputs, hidden = self.encoder(src)
-
-         for t in range(trg_len):
-             input = trg[t]
-             output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
-
-             outputs[t] = output
-
-         outputs = outputs.transpose(0, 1).contiguous()
-
-         return outputs
-
-     def expand_memory(self, memory, beam_size):
-         hidden, encoder_outputs = memory
-         hidden = hidden.repeat(beam_size, 1)
-         encoder_outputs = encoder_outputs.repeat(1, beam_size, 1)
-
-         return (hidden, encoder_outputs)
-
-     def get_memory(self, memory, i):
-         hidden, encoder_outputs = memory
-         hidden = hidden[[i]]
-         encoder_outputs = encoder_outputs[:, [i],:]
-
-         return (hidden, encoder_outputs)
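A short hedged sketch of the deleted attention-based Seq2Seq model; the hidden sizes and vocabulary size below are assumptions for illustration only.

# Hypothetical sketch (assumes the Seq2Seq class above; sizes are assumptions).
import torch

model = Seq2Seq(vocab_size=233, encoder_hidden=256, decoder_hidden=256,
                img_channel=256, decoder_embedded=256, dropout=0.1)
src = torch.randn(170, 1, 256)        # (time_step, batch, img_channel) backbone features
trg = torch.randint(0, 233, (30, 1))  # (time_step, batch) target token ids
out = model(src, trg)                 # (batch, time_step, vocab_size)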
vietocr/model/seqmodel/transformer.py DELETED
@@ -1,124 +0,0 @@
- from einops import rearrange
- from torchvision import models
- import math
- import torch
- from torch import nn
-
- class LanguageTransformer(nn.Module):
-     def __init__(self, vocab_size,
-                  d_model, nhead,
-                  num_encoder_layers, num_decoder_layers,
-                  dim_feedforward, max_seq_length,
-                  pos_dropout, trans_dropout):
-         super().__init__()
-
-         self.d_model = d_model
-         self.embed_tgt = nn.Embedding(vocab_size, d_model)
-         self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)
-         # self.learned_pos_enc = LearnedPositionalEncoding(d_model, pos_dropout, max_seq_length)
-
-         self.transformer = nn.Transformer(d_model, nhead,
-                                           num_encoder_layers, num_decoder_layers,
-                                           dim_feedforward, trans_dropout)
-
-         self.fc = nn.Linear(d_model, vocab_size)
-
-     def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
-         """
-         Shape:
-             - src: (W, N, C)
-             - tgt: (T, N)
-             - src_key_padding_mask: (N, S)
-             - tgt_key_padding_mask: (N, T)
-             - memory_key_padding_mask: (N, S)
-             - output: (N, T, E)
-
-         """
-         tgt_mask = self.gen_nopeek_mask(tgt.shape[0]).to(src.device)
-
-         src = self.pos_enc(src*math.sqrt(self.d_model))
-         # src = self.learned_pos_enc(src*math.sqrt(self.d_model))
-
-         tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))
-
-         output = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask,
-                                   tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask)
-         # output = rearrange(output, 't n e -> n t e')
-         output = output.transpose(0, 1)
-         return self.fc(output)
-
-     def gen_nopeek_mask(self, length):
-         mask = (torch.triu(torch.ones(length, length)) == 1).transpose(0, 1)
-         mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-
-         return mask
-
-     def forward_encoder(self, src):
-         src = self.pos_enc(src*math.sqrt(self.d_model))
-         memory = self.transformer.encoder(src)
-         return memory
-
-     def forward_decoder(self, tgt, memory):
-         tgt_mask = self.gen_nopeek_mask(tgt.shape[0]).to(tgt.device)
-         tgt = self.pos_enc(self.embed_tgt(tgt) * math.sqrt(self.d_model))
-
-         output = self.transformer.decoder(tgt, memory, tgt_mask=tgt_mask)
-         # output = rearrange(output, 't n e -> n t e')
-         output = output.transpose(0, 1)
-
-         return self.fc(output), memory
-
-     def expand_memory(self, memory, beam_size):
-         memory = memory.repeat(1, beam_size, 1)
-         return memory
-
-     def get_memory(self, memory, i):
-         memory = memory[:, [i], :]
-         return memory
-
- class PositionalEncoding(nn.Module):
-     def __init__(self, d_model, dropout=0.1, max_len=100):
-         super(PositionalEncoding, self).__init__()
-         self.dropout = nn.Dropout(p=dropout)
-
-         pe = torch.zeros(max_len, d_model)
-         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
-         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
-         pe[:, 0::2] = torch.sin(position * div_term)
-         pe[:, 1::2] = torch.cos(position * div_term)
-         pe = pe.unsqueeze(0).transpose(0, 1)
-         self.register_buffer('pe', pe)
-
-     def forward(self, x):
-         x = x + self.pe[:x.size(0), :]
-
-         return self.dropout(x)
-
- class LearnedPositionalEncoding(nn.Module):
-     def __init__(self, d_model, dropout=0.1, max_len=100):
-         super(LearnedPositionalEncoding, self).__init__()
-         self.dropout = nn.Dropout(p=dropout)
-
-         self.pos_embed = nn.Embedding(max_len, d_model)
-         self.layernorm = LayerNorm(d_model)
-
-     def forward(self, x):
-         seq_len = x.size(0)
-         pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
-         pos = pos.unsqueeze(-1).expand(x.size()[:2])
-         x = x + self.pos_embed(pos)
-         return self.dropout(self.layernorm(x))
-
- class LayerNorm(nn.Module):
-     "A layernorm module in the TF style (epsilon inside the square root)."
-     def __init__(self, d_model, variance_epsilon=1e-12):
-         super().__init__()
-         self.gamma = nn.Parameter(torch.ones(d_model))
-         self.beta = nn.Parameter(torch.zeros(d_model))
-         self.variance_epsilon = variance_epsilon
-
-     def forward(self, x):
-         u = x.mean(-1, keepdim=True)
-         s = (x - u).pow(2).mean(-1, keepdim=True)
-         x = (x - u) / torch.sqrt(s + self.variance_epsilon)
-         return self.gamma * x + self.beta
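A compact hedged sketch of the deleted LanguageTransformer's training-time call; the hyperparameters mirror common VietOCR-style settings and are assumptions, not values recorded in this commit.

# Hypothetical sketch (assumes the LanguageTransformer class above; sizes are assumptions).
import torch

model = LanguageTransformer(vocab_size=233, d_model=256, nhead=8,
                            num_encoder_layers=6, num_decoder_layers=6,
                            dim_feedforward=2048, max_seq_length=1024,
                            pos_dropout=0.1, trans_dropout=0.1)
src = torch.randn(170, 1, 256)        # (W, N, C) backbone column features
tgt = torch.randint(0, 233, (30, 1))  # (T, N) shifted target tokens
logits = model(src, tgt)              # (N, T, vocab_size)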
vietocr/model/transformerocr.py DELETED
@@ -1,44 +0,0 @@
- from vietocr.model.backbone.cnn import CNN
- from vietocr.model.seqmodel.transformer import LanguageTransformer
- from vietocr.model.seqmodel.seq2seq import Seq2Seq
- from vietocr.model.seqmodel.convseq2seq import ConvSeq2Seq
- from torch import nn
-
- class VietOCR(nn.Module):
-     def __init__(self, vocab_size,
-                  backbone,
-                  cnn_args,
-                  transformer_args, seq_modeling='transformer'):
-
-         super(VietOCR, self).__init__()
-
-         self.cnn = CNN(backbone, **cnn_args)
-         self.seq_modeling = seq_modeling
-
-         if seq_modeling == 'transformer':
-             self.transformer = LanguageTransformer(vocab_size, **transformer_args)
-         elif seq_modeling == 'seq2seq':
-             self.transformer = Seq2Seq(vocab_size, **transformer_args)
-         elif seq_modeling == 'convseq2seq':
-             self.transformer = ConvSeq2Seq(vocab_size, **transformer_args)
-         else:
-             raise('Not Support Seq Model')
-
-     def forward(self, img, tgt_input, tgt_key_padding_mask):
-         """
-         Shape:
-             - img: (N, C, H, W)
-             - tgt_input: (T, N)
-             - tgt_key_padding_mask: (N, T)
-             - output: b t v
-         """
-         src = self.cnn(img)
-
-         if self.seq_modeling == 'transformer':
-             outputs = self.transformer(src, tgt_input, tgt_key_padding_mask=tgt_key_padding_mask)
-         elif self.seq_modeling == 'seq2seq':
-             outputs = self.transformer(src, tgt_input)
-         elif self.seq_modeling == 'convseq2seq':
-             outputs = self.transformer(src, tgt_input)
-         return outputs
-
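An end-to-end hedged sketch of how the deleted VietOCR model tied the backbone and sequence model together; every config value below is an illustrative assumption in the spirit of VietOCR's vgg_transformer preset, not something recorded in this commit.

# Hypothetical sketch (assumes the VietOCR class above; all values are assumptions).
import torch

cnn_args = {'ss': [(2, 2), (2, 2), (2, 1), (2, 1), (1, 1)],
            'ks': [(2, 2), (2, 2), (2, 1), (2, 1), (1, 1)],
            'hidden': 256}
transformer_args = {'d_model': 256, 'nhead': 8,
                    'num_encoder_layers': 6, 'num_decoder_layers': 6,
                    'dim_feedforward': 2048, 'max_seq_length': 1024,
                    'pos_dropout': 0.1, 'trans_dropout': 0.1}

model = VietOCR(vocab_size=233, backbone='vgg19_bn',
                cnn_args=cnn_args, transformer_args=transformer_args,
                seq_modeling='transformer')
img = torch.randn(1, 3, 32, 475)             # (N, C, H, W) text-line image
tgt = torch.randint(0, 233, (30, 1))         # (T, N) shifted target tokens
mask = torch.zeros(1, 30, dtype=torch.bool)  # (N, T) padding mask
logits = model(img, tgt, mask)               # (N, T, vocab_size)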
vietocr/model/vocab.py DELETED
@@ -1,36 +0,0 @@
- class Vocab():
-     def __init__(self, chars):
-         self.pad = 0
-         self.go = 1
-         self.eos = 2
-         self.mask_token = 3
-
-         self.chars = chars
-
-         self.c2i = {c:i+4 for i, c in enumerate(chars)}
-
-         self.i2c = {i+4:c for i, c in enumerate(chars)}
-
-         self.i2c[0] = '<pad>'
-         self.i2c[1] = '<sos>'
-         self.i2c[2] = '<eos>'
-         self.i2c[3] = '*'
-
-     def encode(self, chars):
-         return [self.go] + [self.c2i[c] for c in chars] + [self.eos]
-
-     def decode(self, ids):
-         first = 1 if self.go in ids else 0
-         last = ids.index(self.eos) if self.eos in ids else None
-         sent = ''.join([self.i2c[i] for i in ids[first:last]])
-         return sent
-
-     def __len__(self):
-         return len(self.c2i) + 4
-
-     def batch_decode(self, arr):
-         texts = [self.decode(ids) for ids in arr]
-         return texts
-
-     def __str__(self):
-         return self.chars
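Finally, a tiny hedged sketch of the deleted Vocab class; the character set below is an illustrative assumption, not the alphabet actually used by this repository.

# Hypothetical sketch (assumes the Vocab class above; the alphabet is an assumption).
vocab = Vocab('abcdefghijklmnopqrstuvwxyz ')
ids = vocab.encode('hello')   # [1, <char ids offset by 4>, 2] with <sos>/<eos> added
text = vocab.decode(ids)      # 'hello'
print(len(vocab))             # number of characters + 4 special tokens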