hantech's picture
Upload 38 files
bd22b5e
import torch
import numpy as np
import math
from PIL import Image
from torch.nn.functional import log_softmax, softmax
from vietocr.vietocr.model.transformerocr import VietOCR
from vietocr.vietocr.model.vocab import Vocab
from vietocr.vietocr.model.beam import Beam
def batch_translate_beam_search(img, model, beam_size=4, candidates=1, max_seq_length=128, sos_token=1, eos_token=2):
# img: NxCxHxW
model.eval()
device = img.device
sents = []
with torch.no_grad():
src = model.cnn(img)
print(src.shap)
memories = model.transformer.forward_encoder(src)
for i in range(src.size(0)):
# memory = memories[:,i,:].repeat(1, beam_size, 1) # TxNxE
memory = model.transformer.get_memory(memories, i)
sent = beamsearch(memory, model, device, beam_size, candidates, max_seq_length, sos_token, eos_token)
sents.append(sent)
sents = np.asarray(sents)
return sents
def translate_beam_search(img, model, beam_size=4, candidates=1, max_seq_length=128, sos_token=1, eos_token=2):
# img: 1xCxHxW
model.eval()
device = img.device
with torch.no_grad():
src = model.cnn(img)
memory = model.transformer.forward_encoder(src) #TxNxE
sent = beamsearch(memory, model, device, beam_size, candidates, max_seq_length, sos_token, eos_token)
return sent
def beamsearch(memory, model, device, beam_size=4, candidates=1, max_seq_length=128, sos_token=1, eos_token=2):
# memory: Tx1xE
model.eval()
beam = Beam(beam_size=beam_size, min_length=0, n_top=candidates, ranker=None, start_token_id=sos_token, end_token_id=eos_token)
with torch.no_grad():
# memory = memory.repeat(1, beam_size, 1) # TxNxE
memory = model.transformer.expand_memory(memory, beam_size)
for _ in range(max_seq_length):
tgt_inp = beam.get_current_state().transpose(0,1).to(device) # TxN
decoder_outputs, memory = model.transformer.forward_decoder(tgt_inp, memory)
log_prob = log_softmax(decoder_outputs[:,-1, :].squeeze(0), dim=-1)
beam.advance(log_prob.cpu())
if beam.done():
break
scores, ks = beam.sort_finished(minimum=1)
hypothesises = []
for i, (times, k) in enumerate(ks[:candidates]):
hypothesis = beam.get_hypothesis(times, k)
hypothesises.append(hypothesis)
return [1] + [int(i) for i in hypothesises[0][:-1]]
def translate(img, model, max_seq_length=128, sos_token=1, eos_token=2):
"data: BxCXHxW"
model.eval()
device = img.device
with torch.no_grad():
src = model.cnn(img)
memory = model.transformer.forward_encoder(src)
translated_sentence = [[sos_token]*len(img)]
char_probs = [[1]*len(img)]
max_length = 0
while max_length <= max_seq_length and not all(np.any(np.asarray(translated_sentence).T==eos_token, axis=1)):
tgt_inp = torch.LongTensor(translated_sentence).to(device)
# output = model(img, tgt_inp, tgt_key_padding_mask=None)
# output = model.transformer(src, tgt_inp, tgt_key_padding_mask=None)
output, memory = model.transformer.forward_decoder(tgt_inp, memory)
output = softmax(output, dim=-1)
output = output.to('cpu')
values, indices = torch.topk(output, 5)
indices = indices[:, -1, 0]
indices = indices.tolist()
values = values[:, -1, 0]
values = values.tolist()
char_probs.append(values)
translated_sentence.append(indices)
max_length += 1
del output
translated_sentence = np.asarray(translated_sentence).T
char_probs = np.asarray(char_probs).T
char_probs = np.multiply(char_probs, translated_sentence>3)
char_probs = np.sum(char_probs, axis=-1)/(char_probs>0).sum(-1)
return translated_sentence, char_probs
def build_model(config):
vocab = Vocab(config['vocab'])
device = config['device']
model = VietOCR(len(vocab),
config['backbone'],
config['cnn'],
config['transformer'],
config['seq_modeling'])
model = model.to(device)
return model, vocab
def resize(w, h, expected_height, image_min_width, image_max_width):
new_w = int(expected_height * float(w) / float(h))
round_to = 10
new_w = math.ceil(new_w/round_to)*round_to
new_w = max(new_w, image_min_width)
new_w = min(new_w, image_max_width)
return new_w, expected_height
def process_image(image, image_height, image_min_width, image_max_width):
img = image.convert('RGB')
w, h = img.size
new_w, image_height = resize(w, h, image_height, image_min_width, image_max_width)
img = img.resize((new_w, image_height), Image.ANTIALIAS)
img = np.asarray(img).transpose(2,0, 1)
img = img/255
return img
def process_input(image, image_height, image_min_width, image_max_width):
img = process_image(image, image_height, image_min_width, image_max_width)
img = img[np.newaxis, ...]
img = torch.FloatTensor(img)
return img
def predict(filename, config):
img = Image.open(filename)
img = process_input(img)
img = img.to(config['device'])
model, vocab = build_model(config)
s = translate(img, model)[0].tolist()
s = vocab.decode(s)
return s