Spaces:
Paused
Paused
| """Set of default text cleaners""" | |
| # TODO: pick the cleaner for languages dynamically | |
| import re | |
| from anyascii import anyascii | |
| from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text | |
| from .english.abbreviations import abbreviations_en | |
| from .english.number_norm import normalize_numbers as en_normalize_numbers | |
| from .english.time_norm import expand_time_english | |
| from .french.abbreviations import abbreviations_fr | |
| # Pre-compiled regular expression matching a run of one or more whitespace characters; | |
| # collapse_whitespace() replaces each such run with a single space. | |
| _whitespace_re = re.compile(r"\s+") | |
def expand_abbreviations(text, lang="en"):
    """Expand known abbreviations in ``text`` for the given language.

    Args:
        text:
            Input text.
        lang:
            Language identifier. Abbreviation tables are available for "en" and "fr".

    Returns:
        The text with every abbreviation pattern of the selected table replaced.
        For any other language identifier the text is returned unchanged.
    """
    if lang == "en":
        _abbreviations = abbreviations_en
    elif lang == "fr":
        _abbreviations = abbreviations_fr
    else:
        # No abbreviation table exists for this language. Skip the step instead of
        # failing with an UnboundLocalError on the loop below; replace_symbols()
        # likewise skips its language-specific replacements for unknown languages.
        return text
    # Each table entry is a (regex, replacement) pair applied in table order.
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Replace every whitespace run with one space and trim both ends of ``text``."""
    single_spaced = _whitespace_re.sub(" ", text)
    return single_spaced.strip()
def convert_to_ascii(text):
    """Return the result of passing ``text`` through ``anyascii``."""
    ascii_text = anyascii(text)
    return ascii_text
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets and double quotes from ``text``."""
    aux_symbols_pattern = r"[\<\>\(\)\[\]\"]+"
    return re.sub(aux_symbols_pattern, "", text)
def replace_symbols(text, lang="en"):
    """Replace punctuation symbols according to the language tag.

    Semicolons and colons become commas for every language. Hyphens become a
    space, except for Catalan where they are removed. The ampersand is spelled
    out for "en", "fr", "pt" and "ca"; for any other tag it is left as is.
    Apostrophes are removed for Catalan only.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca".

    Returns:
        The modified text.

    Example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    is_catalan = lang == "ca"
    hyphen_substitute = "" if is_catalan else " "
    text = text.replace(";", ",").replace("-", hyphen_substitute).replace(":", ",")

    # Spell out "&" with the conjunction of the matching language, if any.
    ampersand_words = (("en", " and "), ("fr", " et "), ("pt", " e "), ("ca", " i "))
    for code, conjunction in ampersand_words:
        if lang == code:
            text = text.replace("&", conjunction)
            break

    if is_catalan:
        text = text.replace("'", "")
    return text
def basic_cleaners(text):
    """Minimal pipeline: lowercase the text, then collapse its whitespace (no transliteration)."""
    return collapse_whitespace(lowercase(text))
def transliteration_cleaners(text):
    """Pipeline for non-English text: lowercase, then collapse whitespace.

    NOTE: the ASCII transliteration step (``convert_to_ascii``) is currently
    disabled, so non-ASCII characters are passed through.
    """
    lowered = lowercase(text)
    return collapse_whitespace(lowered)
def basic_german_cleaners(text):
    """Pipeline for German text: lowercase, then collapse whitespace."""
    for step in (lowercase, collapse_whitespace):
        text = step(text)
    return text
| # TODO: extend the Turkish pipeline with further language-specific normalization | |
def basic_turkish_cleaners(text):
    """Pipeline for Turkish text.

    Maps the Turkish capital letters "I" and "İ" to their Turkish lowercase
    forms, lowercases the rest of the text, then collapses whitespace.
    """
    # str.lower() is locale-independent: it maps "I" to "i" and "İ" (U+0130) to
    # "i" followed by a combining dot above (U+0307). Turkish expects "ı" and a
    # plain "i" respectively, so both letters are mapped before lowercasing.
    text = text.replace("İ", "i").replace("I", "ı")
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
def english_cleaners(text):
    """Pipeline for English text, including time, number and abbreviation expansion.

    NOTE: the ASCII transliteration step (``convert_to_ascii``) is currently disabled.
    """
    # The steps run strictly in this order.
    pipeline = (
        lowercase,
        expand_time_english,
        en_normalize_numbers,
        expand_abbreviations,
        replace_symbols,
        remove_aux_symbols,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
def phoneme_cleaners(text):
    """Pipeline for phonemes mode, including number and abbreviation expansion.

    Unlike ``english_cleaners`` this pipeline does not lowercase the text.
    """
    pipeline = (
        en_normalize_numbers,
        expand_abbreviations,
        replace_symbols,
        remove_aux_symbols,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
def french_cleaners(text):
    """Pipeline for French text.

    Numbers are not expanded here because the phonemizer already does that.
    """
    # Abbreviations are expanded before lowercasing.
    expanded = expand_abbreviations(text, lang="fr")
    symbols_replaced = replace_symbols(lowercase(expanded), lang="fr")
    return collapse_whitespace(remove_aux_symbols(symbols_replaced))
def portuguese_cleaners(text):
    """Basic pipeline for Portuguese text.

    Abbreviations and numbers are not expanded here because the phonemizer
    already does that.
    """
    symbols_replaced = replace_symbols(lowercase(text), lang="pt")
    without_aux = remove_aux_symbols(symbols_replaced)
    return collapse_whitespace(without_aux)
def chinese_mandarin_cleaners(text: str) -> str:
    """Basic pipeline for Chinese: pass the text through ``replace_numbers_to_characters_in_text``."""
    return replace_numbers_to_characters_in_text(text)
def multilingual_cleaners(text):
    """Pipeline for multilingual text.

    Symbols are replaced with ``lang=None``, i.e. without selecting any
    language-specific replacement.
    """
    symbols_replaced = replace_symbols(lowercase(text), lang=None)
    return collapse_whitespace(remove_aux_symbols(symbols_replaced))
def no_cleaners(text):
    """Return ``text`` with newline characters removed and nothing else changed."""
    # Splitting on "\n" and re-joining with nothing drops every newline.
    return "".join(text.split("\n"))