import re import unicodedata import regex # non-ASCII letters that are not separated by "NFKD" normalization ADDITIONAL_DIACRITICS = { "œ": "oe", "Œ": "OE", "ø": "o", "Ø": "O", "æ": "ae", "Æ": "AE", "ß": "ss", "ẞ": "SS", "đ": "d", "Đ": "D", "ð": "d", "Ð": "D", "þ": "th", "Þ": "th", "ł": "l", "Ł": "L", } def remove_symbols_and_diacritics(s: str, keep=""): """ Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some manual mappings) """ return "".join( c if c in keep else ADDITIONAL_DIACRITICS[c] if c in ADDITIONAL_DIACRITICS else "" if unicodedata.category(c) == "Mn" else " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKD", s) ) def remove_symbols(s: str): """ Replace any other markers, symbols, punctuations with a space, keeping diacritics """ return "".join( " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s) ) class BasicTextNormalizer: def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols self.split_letters = split_letters def __call__(self, s: str): s = s.lower() s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis s = self.clean(s).lower() if self.split_letters: s = " ".join(regex.findall(r"\X", s, regex.U)) s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space return s