"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re

from .french_abbreviations import abbreviations_fr
# Regular expression matching one or more consecutive whitespace characters.
_whitespace_re = re.compile(r"\s+")

# Punctuation normalisation table used by ``replace_punctuation``: every key
# found in the text is replaced by its value (an empty value deletes the key).
#
# NOTE(review): this literal used to list "(", ")" and "~" twice and to map
# ",", "!" and "?" onto themselves. Next to the CJK entries that looks like
# full-width forms that were collapsed into their ASCII look-alikes - confirm
# against the upstream table. The full-width variants are written as escapes
# so they cannot be normalised away silently; every ASCII entry is kept, so
# text without full-width characters is handled exactly as before.
rep_map = {
    # Clause separators become a comma.
    ":": ",",
    ";": ",",
    ",": ",",  # no-op, kept so the set of keys is unchanged
    "\uff1a": ",",  # full-width colon
    "\uff1b": ",",  # full-width semicolon
    "\uff0c": ",",  # full-width comma
    "·": ",",
    "、": ",",
    # Sentence terminators.
    "。": ".",
    "!": "!",  # no-op, kept so the set of keys is unchanged
    "?": "?",  # no-op, kept so the set of keys is unchanged
    "\uff01": "!",  # full-width exclamation mark
    "\uff1f": "?",  # full-width question mark
    "\n": ".",
    "...": ".",
    "…": ".",
    "$": ".",
    # Quotes and brackets are removed.
    "“": "",
    "”": "",
    "‘": "",
    "’": "",
    "(": "",
    ")": "",
    "\uff08": "",  # full-width left parenthesis
    "\uff09": "",  # full-width right parenthesis
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "「": "",
    "」": "",
    # Dashes and tildes.
    "—": "",
    "~": "-",
    "\uff5e": "-",  # full-width tilde
    # Inverted (opening) punctuation marks are removed.
    "¿": "",
    "¡": "",
}
def replace_punctuation(text):
    """Normalise punctuation in ``text`` according to ``rep_map``.

    Every occurrence of a ``rep_map`` key is substituted with the value
    mapped to it; an empty value removes the symbol.

    Args:
        text: Input string.

    Returns:
        The string with all mapped symbols replaced.
    """
    # A single alternation over the escaped keys handles all symbols at once.
    alternation = "|".join(map(re.escape, rep_map))
    return re.sub(alternation, lambda match: rep_map[match.group(0)], text)
def expand_abbreviations(text, lang="fr"):
    """Expand known abbreviations in ``text``.

    Args:
        text: Input string.
        lang: Language identifier. Only ``"fr"`` has an abbreviation table
            (``abbreviations_fr``); for any other value the text is returned
            unchanged.

    Returns:
        The text after applying every ``(regex, replacement)`` pair of the
        language's table, in table order.
    """
    # Any other language used to leave the table unbound and crash with
    # UnboundLocalError; without a table there is simply nothing to expand.
    if lang != "fr":
        return text
    for regex, replacement in abbreviations_fr:
        text = re.sub(regex, replacement, text)
    return text
def lowercase(text: str) -> str:
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Squeeze every whitespace run into one space and trim both ends."""
    single_spaced = _whitespace_re.sub(" ", text)
    return single_spaced.strip()
def remove_punctuation_at_begin(text):
    """Drop any leading run of ``,`` ``.`` ``!`` or ``?`` from ``text``.

    Args:
        text: Input string.

    Returns:
        The string without its leading punctuation; punctuation elsewhere in
        the string is left untouched.
    """
    # str.lstrip with a character set does the same job as an anchored
    # character-class regex, without involving the regex engine.
    return text.lstrip(",.!?")
# Deletion table for the auxiliary symbols: < > ( ) [ ] " « »
_AUX_SYMBOLS_TABLE = str.maketrans("", "", "<>()[]\"«»")


def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, double quotes
    and guillemets from ``text``.

    Args:
        text: Input string.

    Returns:
        The string with every ``< > ( ) [ ] " « »`` character removed.
    """
    # Single-character deletion is a job for str.translate; the former regex
    # needed a character class full of unnecessary escapes to do the same.
    return text.translate(_AUX_SYMBOLS_TABLE)
# Spelled-out form of "&" per language; the surrounding spaces keep the
# conjunction from fusing with its neighbours ("A&B" -> "A y B").
_AMPERSAND_WORDS = {
    "en": " and ",
    "fr": " et ",
    "pt": " e ",
    "ca": " i ",
    "es": " y ",
}


def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ";" and ":" always become ","; "-" becomes a space (or is removed for
    Catalan); "&" is spelled out as the conjunction of the language; and
    apostrophes are removed for Catalan and Spanish.

    Args:
        text: Input text.
        lang: Language identifier, e.g. "en", "fr", "pt", "ca", "es". For any
            other value "&" is left untouched.

    Returns:
        The modified text.

    Example:
        >>> replace_symbols("si l'avi cau, diguem-ho", lang="ca")
        'si lavi cau, diguemho'
    """
    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")

    conjunction = _AMPERSAND_WORDS.get(lang)
    if conjunction is not None:
        text = text.replace("&", conjunction)

    # NOTE(review): apostrophe removal is applied to "ca" and "es" only, on
    # the assumption that the second removal belonged to the "es" branch the
    # same way the first belongs to "ca" - confirm against the upstream file.
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
def french_cleaners(text):
    """Pipeline for French text.

    Numbers are not expanded here; the phonemizer already does that.

    Steps, in order: abbreviation expansion, punctuation normalisation,
    symbol replacement, auxiliary-symbol removal, leading-punctuation
    removal, whitespace collapsing, and finally appending "." when the text
    does not already end in one of ``. , ! ? - …``.

    Args:
        text: Raw French text.

    Returns:
        The cleaned text. A text that is empty after cleaning stays empty.
    """
    expanded = expand_abbreviations(text, lang="fr")
    # Lowercasing is skipped on purpose: the pipeline uses the cased BERT.
    normalized = replace_symbols(replace_punctuation(expanded), lang="fr")
    stripped = remove_punctuation_at_begin(remove_aux_symbols(normalized))
    cleaned = collapse_whitespace(stripped)
    # Terminate the utterance unless it already ends with punctuation.
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', cleaned)