Spaces:
Running
Running
import gc | |
import ctranslate2 | |
import pyonmttok | |
from huggingface_hub import snapshot_download | |
from texttokenizer import TextTokenizer | |
import unicodedata | |
def _normalize_input_string(result): | |
result = unicodedata.normalize('NFC', result) | |
return result | |
def _translate_batch(input_batch, spm, model, max_sentence_batch=10): | |
batch_input_tokenized = [] | |
batch_input_markers = [] | |
#preserve_markup = PreserveMarkup() | |
num_sentences = len(input_batch) | |
for pos in range(0, num_sentences): | |
tokenized = spm.tokenize(input_batch[pos])[0] | |
batch_input_tokenized.append(tokenized) | |
batch_output = [] | |
for offset in range(0,len(batch_input_tokenized), max_sentence_batch): | |
partial_result = model.translate_batch(batch_input_tokenized[offset:offset+max_sentence_batch], return_scores=False, replace_unknowns=True) | |
for pos in range(0,len(partial_result)): | |
tokenized = partial_result[pos][0]['tokens'] | |
translated = spm.detokenize(tokenized) | |
batch_output.append(translated) | |
return batch_output | |
def translate_text(sample_text, source_language, lang_pair, max_sentence_batch=20):
    """Translate ``sample_text`` segment by segment and return the rebuilt text.

    Args:
        sample_text: Text to translate; it is NFC-normalized before splitting.
        source_language: Passed to ``TextTokenizer`` to split the text.
        lang_pair: Indexable pair; element 0 is used as the SentencePiece
            model path and element 1 is passed to ``ctranslate2.Translator``.
        max_sentence_batch: Forwarded to ``_translate_batch`` as the chunk size.

    Returns:
        Whatever ``TextTokenizer.sentence_from_tokens`` builds from the
        original segments, their translate flags and the translated strings.
    """
    sp_tokenizer = pyonmttok.Tokenizer(mode="none", sp_model_path=lang_pair[0])
    engine = ctranslate2.Translator(lang_pair[1], device="cpu")
    splitter = TextTokenizer(source_language)

    normalized = _normalize_input_string(sample_text)
    sentences, translate = splitter.tokenize(normalized)

    # Positions of the segments that go through the model; segments flagged
    # exactly ``False`` are skipped and keep an empty string in ``results``.
    indexes = [i for i in range(len(sentences)) if translate[i] is not False]
    pending = [sentences[i] for i in indexes]

    translated_batch = _translate_batch(
        pending, sp_tokenizer, engine, max_sentence_batch
    )

    # Scatter the translations back to their original positions.
    results = [""] * len(sentences)
    for pos, text in enumerate(translated_batch):
        results[indexes[pos]] = text

    # Rebuild split sentences
    rebuilt = splitter.sentence_from_tokens(sentences, translate, results)
    gc.collect()
    return rebuilt