# translator/translate.py

import gc
import unicodedata

import ctranslate2
import pyonmttok

from texttokenizer import TextTokenizer


def _normalize_input_string(result):
    # Normalize to NFC so composed and decomposed Unicode forms
    # produce identical token sequences.
    return unicodedata.normalize('NFC', result)


def _translate_batch(input_batch, spm, model, max_sentence_batch=10):
    # Tokenize every sentence with the SentencePiece tokenizer.
    batch_input_tokenized = []
    for pos in range(len(input_batch)):
        tokenized = spm.tokenize(input_batch[pos])[0]
        batch_input_tokenized.append(tokenized)

    # Translate in chunks of max_sentence_batch sentences to bound memory use.
    batch_output = []
    for offset in range(0, len(batch_input_tokenized), max_sentence_batch):
        partial_result = model.translate_batch(
            batch_input_tokenized[offset:offset + max_sentence_batch],
            return_scores=False,
            replace_unknowns=True,
        )
        for pos in range(len(partial_result)):
            # This indexing assumes the pre-2.0 CTranslate2 result format
            # (a list of hypothesis dicts per sentence); CTranslate2 >= 2.0
            # instead returns TranslationResult objects with .hypotheses.
            tokenized = partial_result[pos][0]['tokens']
            translated = spm.detokenize(tokenized)
            batch_output.append(translated)

    return batch_output
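
# Illustration (not in the original file) of the tokenize/detokenize round
# trip used above; the model path is a hypothetical placeholder:
#   spm = pyonmttok.Tokenizer(mode="none", sp_model_path="models/spm.model")
#   tokens, _ = spm.tokenize("Hello world.")
#   spm.detokenize(tokens)  # -> "Hello world."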


def translate_text(sample_text, source_language, lang_pair, max_sentence_batch=20):
    # lang_pair is (sentencepiece_model_path, ctranslate2_model_dir).
    spm = pyonmttok.Tokenizer(mode="none", sp_model_path=lang_pair[0])
    translator = ctranslate2.Translator(lang_pair[1], device="cpu")
    tokenizer = TextTokenizer(source_language)

    text = _normalize_input_string(sample_text)
    # tokenize() returns the sentences plus a parallel list of flags marking
    # which sentences should be translated.
    sentences, translate = tokenizer.tokenize(text)
    num_sentences = len(sentences)

    # Collect only the translatable sentences, remembering their positions.
    sentences_batch = []
    indexes = []
    results = ["" for _ in range(num_sentences)]
    for i in range(num_sentences):
        if translate[i] is False:
            continue
        sentences_batch.append(sentences[i])
        indexes.append(i)

    translated_batch = _translate_batch(sentences_batch, spm, translator, max_sentence_batch)
    for pos in range(len(translated_batch)):
        results[indexes[pos]] = translated_batch[pos]

    # Rebuild the full text, re-inserting the untranslated segments.
    translated = tokenizer.sentence_from_tokens(sentences, translate, results)

    gc.collect()
    return translated
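

# A minimal usage sketch (not part of the original file). The model paths and
# the "en" language code are hypothetical placeholders: a real caller must
# supply an actual SentencePiece model file, a CTranslate2 model directory,
# and a language code supported by TextTokenizer.
if __name__ == "__main__":
    lang_pair = ("models/spm.model", "models/ctranslate2")  # hypothetical paths
    print(translate_text("Hello world.", "en", lang_pair))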