import gc
import unicodedata

import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download
from texttokenizer import TextTokenizer


def _normalize_input_string(result):
    # Normalize to NFC so composed and decomposed Unicode forms look the
    # same to the tokenizer.
    return unicodedata.normalize("NFC", result)


def _translate_batch(input_batch, spm, model, max_sentence_batch=10):
    # Tokenize every sentence with the SentencePiece model; tokenize()
    # returns (tokens, features) and only the tokens are needed here.
    batch_input_tokenized = []
    for sentence in input_batch:
        tokens, _ = spm.tokenize(sentence)
        batch_input_tokenized.append(tokens)

    # Translate in chunks of max_sentence_batch sentences to bound memory use.
    batch_output = []
    for offset in range(0, len(batch_input_tokenized), max_sentence_batch):
        partial_result = model.translate_batch(
            batch_input_tokenized[offset:offset + max_sentence_batch],
            return_scores=False,
            replace_unknowns=True,
        )
        for result in partial_result:
            tokens = result[0]["tokens"]  # best hypothesis
            batch_output.append(spm.detokenize(tokens))

    return batch_output


def translate_text(sample_text, source_language, lang_pair, max_sentence_batch=20):
    # lang_pair holds (SentencePiece model path, CTranslate2 model directory).
    spm = pyonmttok.Tokenizer(mode="none", sp_model_path=lang_pair[0])
    translator = ctranslate2.Translator(lang_pair[1], device="cpu")
    tokenizer = TextTokenizer(source_language)

    text = _normalize_input_string(sample_text)
    # Split the text into sentences; translate[i] flags whether sentence i
    # should be translated or passed through unchanged.
    sentences, translate = tokenizer.tokenize(text)
    num_sentences = len(sentences)

    # Collect the sentences flagged for translation, remembering their
    # original positions so the output can be reassembled in order.
    sentences_batch = []
    indexes = []
    results = ["" for _ in range(num_sentences)]
    for i in range(num_sentences):
        if not translate[i]:
            continue
        sentences_batch.append(sentences[i])
        indexes.append(i)

    translated_batch = _translate_batch(sentences_batch, spm, translator, max_sentence_batch)
    for pos, translated_sentence in enumerate(translated_batch):
        results[indexes[pos]] = translated_sentence

    # Rebuild the full text, interleaving translated and untouched sentences.
    translated = tokenizer.sentence_from_tokens(sentences, translate, results)
    gc.collect()
    return translated
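
# --- Example usage (a sketch, not part of the original module) ---
# The repository id and file names below are assumptions for illustration;
# point them at a real CTranslate2 model repository and its actual layout.
# snapshot_download (imported above) fetches the model files from the
# Hugging Face Hub and returns the local cache path.
if __name__ == "__main__":
    import os

    # Hypothetical repo id; replace with the model repository you actually use.
    model_dir = snapshot_download(repo_id="softcatala/translate-eng-cat")
    lang_pair = (
        os.path.join(model_dir, "sp_m.model"),   # assumed SentencePiece file name
        os.path.join(model_dir, "ctranslate2"),  # assumed CTranslate2 directory name
    )
    print(translate_text("Hello, world.", "eng", lang_pair))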