Spaces:

projecte-aina
/

translator

Running

App Files Files Community

translator / texttokenizer.py

PaulNdrei

Add segmenter

5f540b3 10 months ago

raw

history blame

No virus

2.47 kB

	#!/usr/bin/env python3
	# -- encoding: utf-8 --
	#
	# Copyright (c) 2020 Jordi Mas i Hernandez <jmas@softcatala.org>
	#
	# This program is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with this program; if not, write to the
	# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	# Boston, MA 02111-1307, USA.

	from __future__ import print_function
	from srx_segmenter import SrxSegmenter, parse
	import os


	def add_breakline_rule(rules, language):
	    """Add a newline-based break rule to the rule set of *language*.

	    A ``[before, after]`` pair is appended to ``rules[language]["breaks"]``
	    whose "before" pattern is a newline character and whose "after" pattern
	    is the empty string. Presumably this makes the segmenter break after
	    every newline; the exact rule semantics are defined by srx_segmenter.

	    Args:
	        rules: mapping of language -> rule set, where each rule set has a
	            ``"breaks"`` list. It is modified in place.
	        language: key of the rule set to extend.

	    Returns:
	        The same ``rules`` object that was passed in.

	    Raises:
	        KeyError: if ``language`` or its ``"breaks"`` entry is missing.
	    """
	    before_break = "\n"
	    after_break = ""
	    rules[language]["breaks"].append([before_break, after_break])
	    return rules


	'''
	Splits text into sentences, keeping the whitespace between them so that
	the same text can be reconstructed later with only the translatable text changed
	'''
	class TextTokenizer:
	    """Split text into sentence segments and rebuild it after translation.

	    Whitespace found before a segment is kept as a separate token flagged
	    as non-translatable, so the text can be reassembled later with only the
	    translatable tokens replaced.
	    """

	    def __init__(self, language):
	        """Load the segmentation rules stored next to this module.

	        Args:
	            language: key selecting the rule set inside the parsed rules;
	                it is also the rule set that receives the newline rule.
	        """
	        module_dir = os.path.dirname(os.path.abspath(__file__))
	        srx_filepath = os.path.join(module_dir, 'segment.srx')
	        self.language = language
	        # Extend the parsed rules with the extra newline break rule.
	        self.rules = add_breakline_rule(parse(srx_filepath), language)

	    def tokenize(self, sentence):
	        """Split *sentence* into tokens and flag which ones to translate.

	        Args:
	            sentence: the text to segment.

	        Returns:
	            A ``(strings, translate)`` pair of equal-length lists.
	            ``strings`` holds the tokens in order and ``translate`` holds
	            ``True`` for segments and ``False`` for whitespace tokens.
	        """
	        segmenter = SrxSegmenter(self.rules[self.language], sentence)
	        segments, whitespaces = segmenter.extract()

	        strings = []
	        translate = []
	        for index, segment in enumerate(segments):
	            # The whitespace entry at the same index is emitted before the
	            # segment; empty entries are skipped. Entries past the last
	            # segment are never read.
	            leading_space = whitespaces[index]
	            if leading_space:
	                strings.append(leading_space)
	                translate.append(False)

	            strings.append(segment)
	            translate.append(True)

	        return strings, translate

	    def sentence_from_tokens(self, sentences, translate, translated):
	        """Reassemble a text from its tokens, swapping in translations.

	        Args:
	            sentences: the original tokens, e.g. as returned by tokenize().
	            translate: per-token flags; a token is replaced only where the
	                flag is exactly ``True``.
	            translated: replacement tokens, index-aligned with ``sentences``;
	                entries at non-translated positions are never read.

	        Returns:
	            The concatenated text with leading and trailing whitespace
	            stripped.
	        """
	        # `is True` is deliberate: only the boolean True selects the
	        # translated token, any other value keeps the original one.
	        pieces = [
	            translated[index] if translate[index] is True else original
	            for index, original in enumerate(sentences)
	        ]
	        return ''.join(pieces).strip()