File size: 2,017 Bytes
5f540b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import gc
import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download

from texttokenizer import TextTokenizer
import unicodedata


def _normalize_input_string(result):
    result = unicodedata.normalize('NFC', result)
    return result

def _translate_batch(input_batch, spm, model, max_sentence_batch=10):

    batch_input_tokenized = []
    batch_input_markers = []

    #preserve_markup = PreserveMarkup()

    num_sentences = len(input_batch)
    for pos in range(0, num_sentences):
        tokenized = spm.tokenize(input_batch[pos])[0]
        batch_input_tokenized.append(tokenized)

    batch_output = []
    for offset in range(0,len(batch_input_tokenized), max_sentence_batch):
      partial_result = model.translate_batch(batch_input_tokenized[offset:offset+max_sentence_batch], return_scores=False, replace_unknowns=True)
      for pos in range(0,len(partial_result)):
        tokenized = partial_result[pos][0]['tokens']
        translated = spm.detokenize(tokenized)
        batch_output.append(translated)

    return batch_output


def translate_text(sample_text, source_language, lang_pair, max_sentence_batch=20):
    """Translate *sample_text* from *source_language* using a CTranslate2 model.

    Args:
        sample_text: the raw input text to translate.
        source_language: language code handed to ``TextTokenizer`` for
            sentence splitting.
        lang_pair: two-element sequence — ``[0]`` is the SentencePiece
            model path, ``[1]`` is the CTranslate2 model directory.
        max_sentence_batch: maximum sentences per model call.

    Returns:
        The translated text, with untranslatable spans preserved.
    """
    spm = pyonmttok.Tokenizer(mode="none", sp_model_path=lang_pair[0])
    translator = ctranslate2.Translator(lang_pair[1], device="cpu")
    tokenizer = TextTokenizer(source_language)

    normalized = _normalize_input_string(sample_text)
    sentences, translate = tokenizer.tokenize(normalized)

    # Collect the sentences flagged for translation, remembering where
    # each came from so the output can be stitched back in order.
    sentences_batch = []
    indexes = []
    for idx, sentence in enumerate(sentences):
        if translate[idx] is False:
            continue
        sentences_batch.append(sentence)
        indexes.append(idx)

    results = ["" for _ in sentences]
    translated_batch = _translate_batch(
        sentences_batch, spm, translator, max_sentence_batch
    )
    for pos, translation in enumerate(translated_batch):
        results[indexes[pos]] = translation

    # Rebuild the full text, re-interleaving the untranslated spans.
    translated = tokenizer.sentence_from_tokens(sentences, translate, results)

    # Free tokenizer/translator memory eagerly between calls.
    gc.collect()

    return translated