"""Set of default text cleaners""" # TODO: pick the cleaner for languages dynamically import re # Regular expression matching whitespace: _whitespace_re = re.compile(r"\s+") rep_map = { ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".", "·": ",", "、": ",", "...": ".", "…": ".", "$": ".", "“": "'", "”": "'", "‘": "'", "’": "'", "(": "'", ")": "'", "(": "'", ")": "'", "《": "'", "》": "'", "【": "'", "】": "'", "[": "'", "]": "'", "—": "", "~": "-", "~": "-", "「": "'", "」": "'", } def replace_punctuation(text): pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) return replaced_text def lowercase(text): return text.lower() def collapse_whitespace(text): return re.sub(_whitespace_re, " ", text).strip() def remove_punctuation_at_begin(text): return re.sub(r'^[,.!?]+', '', text) def remove_aux_symbols(text): text = re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text) return text def replace_symbols(text, lang="en"): """Replace symbols based on the lenguage tag. Args: text: Input text. lang: Lenguage identifier. ex: "en", "fr", "pt", "ca". Returns: The modified text example: input args: text: "si l'avi cau, diguem-ho" lang: "ca" Output: text: "si lavi cau, diguemho" """ text = text.replace(";", ",") text = text.replace("-", " ") if lang != "ca" else text.replace("-", "") text = text.replace(":", ",") if lang == "en": text = text.replace("&", " and ") elif lang == "fr": text = text.replace("&", " et ") elif lang == "pt": text = text.replace("&", " e ") elif lang == "ca": text = text.replace("&", " i ") text = text.replace("'", "") elif lang== "es": text=text.replace("&","y") text = text.replace("'", "") return text def unicleaners(text, cased=False, lang='en'): """Basic pipeline for Portuguese text. There is no need to expand abbreviation and numbers, phonemizer already does that""" if not cased: text = lowercase(text) text = replace_punctuation(text) text = replace_symbols(text, lang=lang) text = remove_aux_symbols(text) text = remove_punctuation_at_begin(text) text = collapse_whitespace(text) text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text) return text