# translator/translate.py

import gc
import unicodedata

import ctranslate2
import pyonmttok

from texttokenizer import TextTokenizer


def _normalize_input_string(result):
    # Normalize to NFC so composed and decomposed Unicode forms
    # produce identical token sequences.
    return unicodedata.normalize('NFC', result)


def _translate_batch(input_batch, spm, model, max_sentence_batch=10):
    # Tokenize every sentence with the SentencePiece tokenizer.
    batch_input_tokenized = []
    for pos in range(len(input_batch)):
        tokenized = spm.tokenize(input_batch[pos])[0]
        batch_input_tokenized.append(tokenized)

    # Translate in chunks of max_sentence_batch sentences to bound memory use.
    batch_output = []
    for offset in range(0, len(batch_input_tokenized), max_sentence_batch):
        partial_result = model.translate_batch(
            batch_input_tokenized[offset:offset + max_sentence_batch],
            return_scores=False,
            replace_unknowns=True,
        )
        for pos in range(len(partial_result)):
            # This indexing assumes the pre-2.0 CTranslate2 result format
            # (a list of hypothesis dicts per sentence); CTranslate2 >= 2.0
            # instead returns TranslationResult objects with .hypotheses.
            tokenized = partial_result[pos][0]['tokens']
            translated = spm.detokenize(tokenized)
            batch_output.append(translated)

    return batch_output
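
# Illustration (not in the original file) of the tokenize/detokenize round
# trip used above; the model path is a hypothetical placeholder:
#   spm = pyonmttok.Tokenizer(mode="none", sp_model_path="models/spm.model")
#   tokens, _ = spm.tokenize("Hello world.")
#   spm.detokenize(tokens)  # -> "Hello world."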


def translate_text(sample_text, source_language, lang_pair, max_sentence_batch=20):
    # lang_pair is (sentencepiece_model_path, ctranslate2_model_dir).
    spm = pyonmttok.Tokenizer(mode="none", sp_model_path=lang_pair[0])
    translator = ctranslate2.Translator(lang_pair[1], device="cpu")
    tokenizer = TextTokenizer(source_language)

    text = _normalize_input_string(sample_text)
    # tokenize() returns the sentences plus a parallel list of flags marking
    # which sentences should be translated.
    sentences, translate = tokenizer.tokenize(text)
    num_sentences = len(sentences)

    # Collect only the translatable sentences, remembering their positions.
    sentences_batch = []
    indexes = []
    results = ["" for _ in range(num_sentences)]
    for i in range(num_sentences):
        if translate[i] is False:
            continue
        sentences_batch.append(sentences[i])
        indexes.append(i)

    translated_batch = _translate_batch(sentences_batch, spm, translator, max_sentence_batch)
    for pos in range(len(translated_batch)):
        results[indexes[pos]] = translated_batch[pos]

    # Rebuild the full text, re-inserting the untranslated segments.
    translated = tokenizer.sentence_from_tokens(sentences, translate, results)

    gc.collect()
    return translated
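

# A minimal usage sketch (not part of the original file). The model paths and
# the "en" language code are hypothetical placeholders: a real caller must
# supply an actual SentencePiece model file, a CTranslate2 model directory,
# and a language code supported by TextTokenizer.
if __name__ == "__main__":
    lang_pair = ("models/spm.model", "models/ctranslate2")  # hypothetical paths
    print(translate_text("Hello world.", "en", lang_pair))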