|
"""Set of default text cleaners""" |
|
|
|
|
|
import re |
|
from .french_abbreviations import abbreviations_fr |
|
|
|
|
|
# Matches one or more consecutive whitespace characters.
_whitespace_re = re.compile(r"\s+")


# Punctuation normalisation table used by replace_punctuation(): every key
# found in the text is replaced by its value (an empty value deletes the key).
#
# Full-width punctuation is written with \uXXXX escapes so that it cannot be
# confused with, or silently normalised to, its ASCII look-alike. A literal
# full-width character that degrades to ASCII turns its entry into either an
# identity mapping ("," -> ",") or a duplicate key, both of which do nothing.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # full-width colon
    ";": ",",
    "\uff1b": ",",  # full-width semicolon
    "\uff0c": ",",  # full-width comma
    "。": ".",
    "\uff01": "!",  # full-width exclamation mark
    "\uff1f": "?",  # full-width question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    "“": "",
    "”": "",
    "‘": "",
    "’": "",
    "(": "",
    ")": "",
    "\uff08": "",  # full-width left parenthesis
    "\uff09": "",  # full-width right parenthesis
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "\uff5e": "-",  # full-width tilde
    "「": "",
    "」": "",
    "¿": "",
    "¡": "",
}
|
|
|
|
|
def replace_punctuation(text):
    """Normalise punctuation according to ``rep_map``.

    Every occurrence of a ``rep_map`` key in ``text`` is replaced by the
    value stored under that key.

    Args:
        text: Input text.

    Returns:
        The text with all mapped symbols substituted.
    """
    # Keys are escaped so that symbols such as "$", "(" or "..." are matched
    # literally rather than interpreted as regex syntax.
    alternatives = [re.escape(symbol) for symbol in rep_map]
    matcher = re.compile("|".join(alternatives))

    def _substitute(match):
        return rep_map[match.group()]

    return matcher.sub(_substitute, text)
|
|
|
def expand_abbreviations(text, lang="fr"):
    """Expand the known abbreviations of ``lang`` found in ``text``.

    Args:
        text: Input text.
        lang: Language identifier. Only "fr" has an abbreviation table.

    Returns:
        The text with each ``(regex, replacement)`` pair of the language's
        table applied in order. For any language other than "fr" the text is
        returned unchanged.
    """
    # Without this guard an unsupported language would reach the loop with
    # no table bound and fail with UnboundLocalError.
    if lang != "fr":
        return text

    for regex, replacement in abbreviations_fr:
        text = re.sub(regex, replacement, text)
    return text
|
|
|
|
|
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
|
|
|
|
|
def collapse_whitespace(text):
    """Squeeze each whitespace run into a single space and trim both ends.

    Args:
        text: Input text.

    Returns:
        The text with every match of ``_whitespace_re`` replaced by one
        space, then stripped of leading and trailing whitespace.
    """
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
|
|
|
def remove_punctuation_at_begin(text):
    """Drop any run of ``,``, ``.``, ``!`` or ``?`` at the very start of ``text``.

    Only the first characters of the string are considered; punctuation
    preceded by anything else (including whitespace) is left in place.
    """
    leading = re.match(r'^[,.!?]+', text)
    if leading is None:
        return text
    return text[leading.end():]
|
|
|
def remove_aux_symbols(text):
    """Delete auxiliary symbols from ``text``.

    The removed characters are angle brackets, parentheses, square brackets,
    straight double quotes and guillemets.
    """
    aux_symbols = r"[\<\>\(\)\[\]\"\«\»]+"
    return re.sub(aux_symbols, "", text)
|
|
|
|
|
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` always become ``,``. ``-`` becomes a space, except for
    Catalan ("ca") where it is removed. ``&`` is spelled out as the
    conjunction of the language, surrounded by spaces; for a language
    without an entry it is left untouched. Apostrophes are removed for
    Catalan and Spanish only.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text
        example:
            input args:
                text: "si l'avi cau, diguem-ho"
                lang: "ca"
            Output:
                text: "si lavi cau, diguemho"
    """
    # Every conjunction carries its own surrounding spaces so that "a&b"
    # becomes two separate words joined by the conjunction ("a y b") instead
    # of one fused token ("ayb").
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }

    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
|
|
|
def french_cleaners(text):
    """Pipeline for French text.

    There is no need to expand numbers, phonemizer already does that.

    Steps, in order: expand French abbreviations, normalise punctuation,
    replace symbols (French rules), remove auxiliary symbols, strip
    punctuation at the start, collapse whitespace, and finally append a
    full stop when the text does not already end with one of ``. , ! ? - …``.

    Args:
        text: Input text.

    Returns:
        The cleaned text.
    """
    expanded = expand_abbreviations(text, lang="fr")
    normalised = replace_symbols(replace_punctuation(expanded), lang="fr")
    trimmed = remove_punctuation_at_begin(remove_aux_symbols(normalised))
    collapsed = collapse_whitespace(trimmed)
    # Ensure the utterance is terminated: add "." after a final character
    # that is not already closing punctuation.
    return re.sub(r'([^\.,!\?\-…])$', r'\1.', collapsed)
|
|
|
|