styletts2 / data /add_phones.py
ak36's picture
Add files using upload-large-folder tool
07b5cfc verified
import re, unicodedata
# Symbol inventory for the text front-end. The position of each character in
# `symbols` is its integer id, so the ORDER of these strings must not change.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Map each symbol to its index in `symbols`. A symbol that occurs more than
# once in `symbols` (the apostrophe appears twice in `_letters_ipa`) keeps the
# index of its LAST occurrence, because later entries overwrite earlier ones.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
class TextCleaner:
    """Convert a text string into a list of integer symbol ids.

    Processing steps performed by ``__call__``:

    * The text is normalised to Unicode NFC so that canonically equivalent
      spellings of a character compare equal to the keys of the lookup table.
    * The text is tokenised with ``_EVENT_RE``: a whole ``<...>`` tag
      (e.g. ``<evt_gasp>``) is one token, anything else is one character.
      Newline characters outside a tag match neither alternative and are
      therefore dropped.
    * Each token is looked up in ``self.lookup``; tokens that are not in the
      table map to ``self.unk_id``.
    """

    _EVENT_RE = re.compile(r"<[^>]+>|.")  # match <evt_xxx> or single char

    def __init__(self, lookup=None):
        """Create a cleaner.

        Args:
            lookup: Optional mapping from token string to integer id. When
                omitted, the module-level ``dicts`` table is used.
        """
        self.lookup = dicts if lookup is None else lookup
        # Use the id of a dedicated "<unk>" entry when the table has one.
        # Otherwise fall back to 0, which callers have always received for
        # unknown tokens (in the module-level table 0 is the pad symbol).
        self.unk_id = self.lookup.get("<unk>", 0)

    def __call__(self, text: str):
        """Return the list of ids for ``text`` (empty list for empty text)."""
        normalised = unicodedata.normalize("NFC", text)
        table = self.lookup
        fallback = self.unk_id
        return [table.get(tok, fallback) for tok in self._EVENT_RE.findall(normalised)]
# Vocabulary coverage check: scan the second "|"-separated field of every line
# in the training list and report each token that has no entry in the
# cleaner's lookup table.
tc = TextCleaner()
miss = {}  # unknown token -> number of times it was seen
with open("/home/ubuntu/styletts2-ft/data/train_list.txt", encoding="utf-8") as f:
    for line in f:
        fields = line.split("|")
        if len(fields) < 2:
            continue  # blank or malformed line: there is no text field to check
        # Tokenise exactly as TextCleaner.__call__ does (NFC, then _EVENT_RE).
        # The ids it returns cannot be used for this check, because an unknown
        # token gets the fallback id, which may also be the id of a real symbol.
        text = unicodedata.normalize("NFC", fields[1])
        for tok in tc._EVENT_RE.findall(text):
            if tok not in tc.lookup:
                miss[tok] = miss.get(tok, 0) + 1
print("Unknown chars left:", list(miss))