|
"""Set of default text cleaners""" |
|
|
|
|
|
import re |
|
|
|
|
|
_whitespace_re = re.compile(r"\s+") |
|
|
|
rep_map = { |
|
":": ",", |
|
";": ",", |
|
",": ",", |
|
"。": ".", |
|
"!": "!", |
|
"?": "?", |
|
"\n": ".", |
|
"·": ",", |
|
"、": ",", |
|
"...": ".", |
|
"…": ".", |
|
"$": ".", |
|
"“": "'", |
|
"”": "'", |
|
"‘": "'", |
|
"’": "'", |
|
"(": "'", |
|
")": "'", |
|
"(": "'", |
|
")": "'", |
|
"《": "'", |
|
"》": "'", |
|
"【": "'", |
|
"】": "'", |
|
"[": "'", |
|
"]": "'", |
|
"—": "", |
|
"~": "-", |
|
"~": "-", |
|
"「": "'", |
|
"」": "'", |
|
} |
|
|
|
def replace_punctuation(text):
    """Replace every ``rep_map`` key found in *text* with its mapped value.

    Args:
        text: Input string.

    Returns:
        A copy of *text* in which each occurrence of a ``rep_map`` key has
        been substituted by the corresponding value. Characters that match
        no key are left untouched.
    """
    # Regex alternation is tried left to right and the first branch that
    # matches wins, so longer keys go first: otherwise a multi-character key
    # such as "..." could be shadowed by a shorter key sharing its prefix.
    # ``sorted`` is stable, so keys of equal length keep their dict order.
    keys = sorted(rep_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: rep_map[match.group()], text)
|
|
|
def lowercase(text):
    """Return a lower-cased copy of *text*."""
    lowered = text.lower()
    return lowered
|
|
|
|
|
def collapse_whitespace(text):
    """Squeeze each whitespace run in *text* to one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
|
|
|
def remove_punctuation_at_begin(text):
    """Drop the leading run of ``,``, ``.``, ``!`` and ``?`` from *text*.

    Only characters at the very start are removed; the run ends at the first
    character outside that set (including whitespace).
    """
    leading_punctuation = re.compile(r'^[,.!?]+')
    return leading_punctuation.sub('', text)
|
|
|
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, guillemets and
    single/double quotes from *text*, wherever they occur."""
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»\']+")
    return aux_symbols.sub("", text)
|
|
|
|
|
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    For every language ``;`` and ``:`` become ``,``. Hyphens are removed for
    Catalan (``"ca"``) and turned into a space otherwise. ``&`` is replaced by
    the space-padded conjunction of the language when ``lang`` is one of
    ``"en"``, ``"fr"``, ``"pt"``, ``"ca"`` or ``"es"`` and is left untouched
    for any other value. Apostrophes are removed for ``"ca"`` and ``"es"``
    only.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text
        example:
            input args:
                text: "si l'avi cau, diguem-ho"
                lang: "ca"
            Output:
                text: "si lavi cau, diguemho"
    """
    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")

    # The conjunction is padded with spaces so that "a&b" does not fuse into
    # a single token; Spanish used to insert a bare "y" here.
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        "es": " y ",
    }
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])

    if lang in ("ca", "es"):
        text = text.replace("'", "")
    return text
|
|
|
def unicleaners(text, cased=False, lang='en'):
    """Basic multilingual cleaning pipeline.

    There is no need to expand abbreviations and numbers here; the
    phonemizer is expected to do that already.

    Steps, in order: optional lower-casing, punctuation normalisation,
    language-dependent symbol replacement, removal of auxiliary symbols,
    whitespace collapsing, removal of leading punctuation, and finally
    appending a ``.`` when the text does not already end with one of
    ``. , ! ? - …``.

    Args:
        text: Input text.
        cased: When false (the default) the text is lower-cased first.
        lang: Language identifier forwarded to ``replace_symbols``.

    Returns:
        The cleaned text. An input that cleans down to nothing yields ``""``.
    """
    if not cased:
        text = lowercase(text)
    text = replace_punctuation(text)
    text = replace_symbols(text, lang=lang)
    text = remove_aux_symbols(text)
    # Trim whitespace *before* stripping leading punctuation: the latter only
    # looks at the very first characters, so leading blanks would otherwise
    # shield the punctuation (" \nhello" used to come out as ".hello.").
    text = collapse_whitespace(text)
    # Removing the punctuation can expose a blank that followed it.
    text = remove_punctuation_at_begin(text).lstrip()
    # Make sure the utterance ends with a punctuation mark.
    text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
    return text
|
|
|
|