Spaces:
Running
Running
#!/usr/bin/env python3 | |
# -*- encoding: utf-8 -*- | |
# | |
# Copyright (c) 2020 Jordi Mas i Hernandez <jmas@softcatala.org> | |
# | |
# This program is free software; you can redistribute it and/or | |
# modify it under the terms of the GNU Lesser General Public | |
# License as published by the Free Software Foundation; either | |
# version 2.1 of the License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser General Public | |
# License along with this program; if not, write to the | |
# Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
# Boston, MA 02111-1307, USA. | |
from __future__ import print_function | |
from srx_segmenter import SrxSegmenter, parse | |
import os | |
def add_breakline_rule(rules,language): | |
rules[language]["breaks"].append(["\n", #Before | |
""] # After | |
) | |
return rules | |
''' | |
Splits text into sentences keeping spaces to allow later | |
to reconstruct the same text but with translatabled text changed | |
''' | |
class TextTokenizer: | |
def __init__(self, language): | |
srx_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'segment.srx') | |
self.rules = parse(srx_filepath) | |
self.language = language | |
self.rules = add_breakline_rule(self.rules,language) | |
def tokenize(self, sentence): | |
strings = [] | |
translate = [] | |
segmenter = SrxSegmenter(self.rules[self.language], sentence) | |
segments, whitespaces = segmenter.extract() | |
for i in range(len(segments)): | |
whitespace = whitespaces[i] | |
if len(whitespace) > 0: | |
strings.append(whitespace) | |
translate.append(False) | |
string = segments[i] | |
strings.append(string) | |
translate.append(True) | |
return strings, translate | |
def sentence_from_tokens(self, sentences, translate, translated): | |
num_sentences = len(sentences) | |
translation = '' | |
for i in range(0, num_sentences): | |
if translate[i] is True: | |
translation += translated[i] | |
else: | |
translation += sentences[i] | |
return translation.strip() | |