import re
import unicodedata
# Symbol inventory for the text front-end. A symbol's position in `symbols`
# is its integer ID, so the order of the characters below must not change.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> ID lookup. A character that appears more than once in `symbols`
# (the apostrophe in `_letters_ipa` does) keeps the index of its LAST
# occurrence, because later entries overwrite earlier ones.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
class TextCleaner:
    """Convert a string into a list of integer symbol IDs.

    • Normalises text to NFC so pre-composed IPA glyphs match `symbols`.
    • Splits on event tokens first (e.g. <evt_gasp>), then per-character.
    • Tokens missing from the lookup table map to `unk_id` instead of
      being printed.

    Newline characters outside an event token produce no ID at all: the
    single-character branch of the tokenizer pattern (`.`) does not match
    a newline, so they are skipped rather than mapped to `unk_id`.
    """

    _EVENT_RE = re.compile(r"<[^>]+>|.")  # match <evt_xxx> or single char

    def __init__(self):
        # Module-level symbol -> ID table; event tokens and "<unk>" are only
        # recognised if that table contains entries for them.
        self.lookup = dicts
        # Prefer a dedicated "<unk>" entry when the table provides one.
        # Without it, fall back to ID 0 (the first symbol in the table),
        # which is what this class has always returned for unknown tokens.
        self.unk_id = self.lookup.get("<unk>", 0)

    def __call__(self, text: str) -> list:
        """Return the symbol IDs for `text`, one per token.

        Args:
            text: Raw input; it is NFC-normalised before tokenising.

        Returns:
            A list of integer IDs in token order. Tokens absent from
            `self.lookup` contribute `self.unk_id`.
        """
        normalised = unicodedata.normalize("NFC", text)
        lookup = self.lookup
        unk_id = self.unk_id
        return [lookup.get(tok, unk_id) for tok in self._EVENT_RE.findall(normalised)]
# Scan the training list and report every token that has no entry in the
# symbol table. The second "|"-separated field of each row is the text that
# gets tokenised.
tc = TextCleaner()
miss = {}  # unknown token -> number of occurrences, in first-seen order
with open("/home/ubuntu/styletts2-ft/data/train_list.txt", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # a blank line has no text field to inspect
        # Tokenise exactly as TextCleaner.__call__ does. The returned IDs
        # alone cannot reveal unknown tokens, because those are replaced by
        # `unk_id`, so membership is checked on the tokens themselves.
        text = unicodedata.normalize("NFC", line.split("|")[1])
        for tok in tc._EVENT_RE.findall(text):
            if tok not in tc.lookup:
                miss[tok] = miss.get(tok, 0) + 1
print("Unknown chars left:", list(miss))