|
|
|
|
|
|
|
import re |
|
import regex |
|
|
|
from itertools import chain |
|
|
|
|
|
class MosesPunctNormalizer:
    """
    Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Every class-level table below is an ordered list of
    ``(regex pattern, replacement)`` pairs. The pairs are applied one after
    another with ``re.sub``, so each rule sees the output of the rules before
    it; do not reorder entries without checking for interactions.
    """

    # Whitespace clean-up around brackets, percent signs, colons and
    # semicolons. Brackets are first padded and then tightened again.
    EXTRA_WHITESPACE = [
        (r"\r", r""),
        (r"\(", r" ("),
        (r"\)", r") "),
        (r" +", r" "),
        (r"\) ([.!:?;,])", r")\g<1>"),
        (r"\( ", r"("),
        (r" \)", r")"),
        (r"(\d) %", r"\g<1>%"),
        (r" :", r":"),
        (r" ;", r";"),
    ]

    # Backtick / doubled-apostrophe rewriting.
    # NOTE(review): despite the name, ``__init__`` enables this table when
    # ``penn`` is true. Left untouched because flipping the condition would
    # change the output produced with the default arguments.
    NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')]

    # Typographic quotes, dashes and ellipsis mapped to ASCII.
    NORMALIZE_UNICODE = [
        ("„", r'"'),
        ("“", r'"'),
        ("”", r'"'),
        ("–", r"-"),
        ("—", r" - "),
        (r" +", r" "),
        ("´", r"'"),
        ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"),
        ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"),
        ("‘", r"'"),
        ("\u201a", r"'"),  # single low-9 quotation mark (looks like a comma)
        ("’", r"'"),
        (r"''", r'"'),
        ("´´", r'"'),
        ("…", r"..."),
    ]

    # Guillemets, with or without surrounding no-break spaces (U+00A0).
    FRENCH_QUOTES = [
        ("\u00A0«\u00A0", r'"'),
        ("«\u00A0", r'"'),
        ("«", r'"'),
        ("\u00A0»\u00A0", r'"'),
        ("\u00A0»", r'"'),
        ("»", r'"'),
    ]

    # No-break spaces (U+00A0) next to punctuation and a few units.
    HANDLE_PSEUDO_SPACES = [
        ("\u00A0%", r"%"),
        ("nº\u00A0", "nº "),
        ("\u00A0:", r":"),
        ("\u00A0ºC", " ºC"),
        ("\u00A0cm", r" cm"),
        ("\u00A0\\?", "?"),
        ("\u00A0\\!", "!"),
        ("\u00A0;", r";"),
        (",\u00A0", r", "),
        (r" +", r" "),
    ]

    # English: move commas/periods that follow a closing quote inside it.
    EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')]

    # German/Spanish/French: move commas/periods outside the closing quote.
    DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [
        (r',"', r'",'),
        (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'),
    ]

    # A no-break space between two digits becomes "," for these languages...
    DE_ES_CZ_CS_FR = [
        ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"),
    ]

    # ...and "." for every other language.
    OTHER = [
        ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"),
    ]

    # CJK / fullwidth punctuation and digits mapped to ASCII.
    # Characters from the "Halfwidth and Fullwidth Forms" block are written
    # as \uXXXX escapes on purpose: if they are width-folded to ASCII, entries
    # such as "?", "(" and ")" become invalid regular expressions and ".\s*"
    # matches every character.
    REPLACE_UNICODE_PUNCTUATION = [
        ("\uFF0C", ","),  # fullwidth comma
        (r"。\s*", ". "),
        ("、", ","),
        ("”", '"'),
        ("“", '"'),
        ("∶", ":"),
        ("\uFF1A", ":"),  # fullwidth colon
        ("\uFF1F", "?"),  # fullwidth question mark
        ("《", '"'),
        ("》", '"'),
        ("\uFF09", ")"),  # fullwidth right parenthesis
        ("\uFF01", "!"),  # fullwidth exclamation mark
        ("\uFF08", "("),  # fullwidth left parenthesis
        ("\uFF1B", ";"),  # fullwidth semicolon
        ("」", '"'),
        ("「", '"'),
        ("\uFF10", "0"),  # fullwidth digits zero..nine
        ("\uFF11", "1"),
        ("\uFF12", "2"),
        ("\uFF13", "3"),
        ("\uFF14", "4"),
        ("\uFF15", "5"),
        ("\uFF16", "6"),
        ("\uFF17", "7"),
        ("\uFF18", "8"),
        ("\uFF19", "9"),
        ("\uFF0E\\s*", ". "),  # fullwidth full stop plus trailing whitespace
        ("\uFF5E", "~"),  # fullwidth tilde
        ("’", "'"),
        ("…", "..."),
        ("━", "-"),
        ("〈", "<"),
        ("〉", ">"),
        ("【", "["),
        ("】", "]"),
        ("\uFF05", "%"),  # fullwidth percent sign
    ]

    def __init__(
        self,
        lang: str = "en",
        penn: bool = True,
        norm_quote_commas: bool = True,
        norm_numbers: bool = True,
        pre_replace_unicode_punct: bool = False,
        post_remove_control_chars: bool = False,
    ):
        """
        Build the ordered list of substitutions for the requested options.

        :param lang: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations. When true, the
            backtick / doubled-apostrophe rules are applied right after the
            whitespace rules.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
            (rules exist for "en" and for "de", "es", "fr").
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers (no-break space between
            digits becomes "," or "." depending on ``lang``).
        :type norm_numbers: bool
        :param pre_replace_unicode_punct: Run ``replace_unicode_punct`` on the
            input before the regular substitutions.
        :type pre_replace_unicode_punct: bool
        :param post_remove_control_chars: Run ``remove_control_chars`` on the
            result after the regular substitutions.
        :type post_remove_control_chars: bool
        """
        rule_groups = [
            self.EXTRA_WHITESPACE,
            self.NORMALIZE_UNICODE,
            self.FRENCH_QUOTES,
            self.HANDLE_PSEUDO_SPACES,
        ]

        if penn:
            # Must run directly after the whitespace rules (index 1).
            rule_groups.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)

        if norm_quote_commas:
            if lang == "en":
                rule_groups.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
            elif lang in ("de", "es", "fr"):
                rule_groups.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)

        if norm_numbers:
            if lang in ("de", "es", "cz", "cs", "fr"):
                rule_groups.append(self.DE_ES_CZ_CS_FR)
            else:
                rule_groups.append(self.OTHER)

        # Flat list of (pattern, replacement) pairs in application order.
        self.substitutions = list(chain.from_iterable(rule_groups))

        self.pre_replace_unicode_punct = pre_replace_unicode_punct
        self.post_remove_control_chars = post_remove_control_chars

    def normalize(self, text) -> str:
        """
        Returns a string with normalized punctuation.

        The input is converted with ``str()``, optionally passed through
        ``replace_unicode_punct``, rewritten by every pair in
        ``self.substitutions`` in order, optionally passed through
        ``remove_control_chars``, and finally stripped of leading and
        trailing whitespace.
        """
        if self.pre_replace_unicode_punct:
            text = self.replace_unicode_punct(text)

        # Convert once up front instead of on every rule application.
        text = str(text)
        for regexp, substitution in self.substitutions:
            text = re.sub(regexp, substitution, text)

        if self.post_remove_control_chars:
            text = self.remove_control_chars(text)

        return text.strip()

    def replace_unicode_punct(self, text) -> str:
        """
        Apply every rule in ``REPLACE_UNICODE_PUNCTUATION`` to ``text``
        (converted with ``str()``) and return the result.
        """
        text = str(text)
        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
            text = re.sub(regexp, substitution, text)
        return text

    def remove_control_chars(self, text):
        r"""
        Return ``text`` with every match of ``\p{C}`` removed.

        Uses the third-party ``regex`` module because the standard ``re``
        module does not support ``\p{...}`` property classes.
        """
        return regex.sub(r"\p{C}", "", text)
|
|