# Modified from OpenAI's Whisper english_normalizer.
import re
import unicodedata
from typing import Iterable

# non-ASCII letters that are not separated by "NFKD" normalization
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}

PORTUGUESE_ACCENTED_CHARACTERS = [
    "ç", "á", "é", "í", "ó", "ú", "â", "ê", "ô", "ã", "õ", "à", "ò", "è", "ì", "ù"
]

# Combining marks that survive normalization (passed as ``keep`` below).
PORTUGUESE_DIACRITICS = [
    "\u0327",  # combining cedilla
    "\u0302",  # combining circumflex accent
    "\u0300",  # combining grave accent
    "\u0303",  # combining tilde
    "\u0301",  # combining acute accent
]


def remove_symbols_and_diacritics(s: str, keep: Iterable[str] = "") -> str:
    """
    Replace markers, symbols, and punctuation with a space, and drop diacritics.

    The input is decomposed with NFKD and each resulting character is handled
    by the first rule that applies:

    1. characters listed in ``keep`` are passed through unchanged;
    2. characters in ``ADDITIONAL_DIACRITICS`` are replaced by their mapping;
    3. non-spacing marks (category ``Mn``) are removed;
    4. any other mark, symbol, or punctuation (category starting with
       ``M``, ``S`` or ``P``) becomes a single space;
    5. everything else is kept as is.

    The result is not recomposed, so kept combining marks stay as separate
    code points after their base letter.

    Args:
        s: Text to clean.
        keep: Single characters exempt from every rule above. Any iterable
            of strings is accepted, including one-shot iterators.

    Returns:
        The cleaned text, in decomposed (NFKD) form.
    """
    # Materialise once: a one-shot iterator would otherwise be exhausted by
    # the first membership test and silently stop protecting characters.
    kept = frozenset(keep)

    def convert(c: str) -> str:
        if c in kept:
            return c
        if c in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[c]
        category = unicodedata.category(c)
        if category == "Mn":
            return ""
        if category[0] in "MSP":
            return " "
        return c

    return "".join(convert(c) for c in unicodedata.normalize("NFKD", s))


class PortugueseTextNormalizer:
    """
    Callable that normalizes Portuguese text.

    Calling an instance lowercases the text, removes bracketed and
    parenthesised spans and a few filler tokens, expands some abbreviated
    titles, removes commas and periods that sit between digits, replaces
    symbols and punctuation with spaces while keeping the marks listed in
    ``PORTUGUESE_DIACRITICS``, and collapses runs of whitespace.
    """

    def __init__(self) -> None:
        # Filler tokens that are deleted outright.
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh)\b"
        self.replacers = {
            # contractions in titles/prefixes
            r"\bsr\b": "senhor ",
            r"\bsra\b": "senhora ",
            r"\bsto\b": "santo ",
            r"\bsta\b": "santa ",
            r"\bdr\b": "doutor ",
            r"\bdra\b": "doutora ",
            r"\bprof\b": "professor ",
            r"\bcap\b": "capitão ",
        }

    def __call__(self, s: str) -> str:
        """
        Normalize ``s`` and return the result.

        Args:
            s: Raw text.

        Returns:
            Lowercased text with whitespace collapsed to single spaces.
            Leading and trailing spaces are not stripped, and kept accents
            remain decomposed (base letter followed by combining mark).
        """
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)

        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        # In English, one would remove commas between digits (thousands
        # separators) and periods not followed by digits. In Portuguese either
        # a comma or a period can be the decimal separator, so both are
        # removed whenever they sit between two digits.
        # Lookarounds leave the neighbouring digits unconsumed, so a digit can
        # close one match and open the next ("1,2,3" loses both commas).
        s = re.sub(r"(?<=\d),(?=\d)", "", s)  # remove commas between digits
        s = re.sub(r"(?<=\d)\.(?=\d)", "", s)  # remove periods between digits

        s = remove_symbols_and_diacritics(s, keep=PORTUGUESE_DIACRITICS)
        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

        # NFKD decomposition can reintroduce uppercase letters, so lowercase again.
        return s.lower()