#!/usr/bin/env python3 # -*- encoding: utf-8 -*- # # Copyright (c) 2020 Jordi Mas i Hernandez # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. from __future__ import print_function from srx_segmenter import SrxSegmenter, parse import os def add_breakline_rule(rules,language): rules[language]["breaks"].append(["\n", #Before ""] # After ) return rules ''' Splits text into sentences keeping spaces to allow later to reconstruct the same text but with translatabled text changed ''' class TextTokenizer: def __init__(self, language): srx_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'segment.srx') self.rules = parse(srx_filepath) self.language = language self.rules = add_breakline_rule(self.rules,language) def tokenize(self, sentence): strings = [] translate = [] segmenter = SrxSegmenter(self.rules[self.language], sentence) segments, whitespaces = segmenter.extract() for i in range(len(segments)): whitespace = whitespaces[i] if len(whitespace) > 0: strings.append(whitespace) translate.append(False) string = segments[i] strings.append(string) translate.append(True) return strings, translate def sentence_from_tokens(self, sentences, translate, translated): num_sentences = len(sentences) translation = '' for i in range(0, num_sentences): if translate[i] is True: translation += translated[i] else: translation += sentences[i] return translation.strip()