|
import ctranslate2 |
|
from mosestokenizer import MosesSentenceSplitter, MosesTokenizer |
|
|
|
from indicnlp.tokenize import sentence_tokenize, indic_tokenize |
|
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory |
|
|
|
import codecs |
|
from subword_nmt.apply_bpe import BPE |
|
|
|
|
|
# --- Preprocessing tools and translation model -----------------------------

# Moses tokenizer configured for English ('en'); it is applied to every
# input line further down in this script.
tokenize = MosesTokenizer('en')

# Load the BPE merge table. The file handle is only needed while BPE() reads
# the codes, so it is closed as soon as the BPE object has been built instead
# of being left open for the lifetime of the process.
with codecs.open("en-hi/bpe-codes/codes.en", encoding='utf-8') as codes:
    bpe = BPE(codes)

# CTranslate2 model loaded from the deployment directory. No options other
# than the model path are passed, so the library defaults apply.
translator = ctranslate2.Translator("en-hi/model_deploy/")
|
|
|
# --- Input / output locations ----------------------------------------------

# Source-side text to translate, and the file the translations are written to.
INP = "input-files/flores/eng.devtest"
OUT = "output-translation/flores/test.hi"

# Read the input, one entry per line, with the line terminator removed.
# The file is decoded explicitly as UTF-8 (matching how the BPE codes file is
# opened) rather than relying on the platform's locale default, and the
# handle is released as soon as the lines are in memory.
with open(INP, 'r', encoding='utf-8') as inp_file:
    inp_lines = [line.strip("\n") for line in inp_file]
|
|
|
# --- Preprocess, translate, post-process, write ----------------------------

# Step 1: lowercase every input line.
inp_lines = [line.lower() for line in inp_lines]

# Step 2: Moses-tokenize, then re-join the tokens with single spaces so the
# BPE step receives a plain whitespace-tokenized string.
inp_lines = [' '.join(tokenize(line)) for line in inp_lines]

# Step 3: apply BPE and split into the list-of-tokens form that
# translate_batch() takes (one token list per sentence).
inp_lines = [bpe.process_line(line).split(" ") for line in inp_lines]

# Step 4: translate all sentences with beam search.
out_lines = translator.translate_batch(inp_lines, beam_size=5, max_batch_size=16)

# Step 5: keep the first hypothesis of each result and undo BPE. A space is
# appended before the replace so that a "@@" marker on the final token is
# also removed; as a consequence each output line ends with a single space.
out_lines = [(' '.join(line.hypotheses[0]) + " ").replace("@@ ", "") for line in out_lines]

# Step 6: write one translation per line. The output is opened only once the
# translations exist, so a failure in any earlier step no longer leaves a
# truncated, unclosed file behind. UTF-8 is requested explicitly because the
# target text is not ASCII and the locale default encoding may not be able to
# represent it.
with open(OUT, 'w', encoding='utf-8') as out_file:
    out_file.writelines(line + "\n" for line in out_lines)
|
|