ak36
/

styletts2

Model card Files Files and versions

Metrics Training metrics Community

styletts2 / data /add_phones.py

ak36's picture

Add files using upload-large-folder tool

07b5cfc verified 4 months ago

history blame contribute delete

1.6 kB

	import re, unicodedata

	# Symbol inventory for the text front-end.  A symbol's integer ID is its
	# position in `symbols`, so reordering any of these strings changes the IDs.
	_pad = "$"
	_punctuation = ';:,.!?¡¿—…"«»“” '
	_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
	_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

	# Export all symbols:
	symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

	# Symbol -> ID lookup.  `_letters_ipa` contains "'" twice, so `symbols` holds
	# a duplicate entry; as with a plain index loop, the later position wins.
	dicts = {symbol: index for index, symbol in enumerate(symbols)}

	class TextCleaner:
	    """Convert text into a list of integer symbol IDs.

	    * Normalises text to NFC so pre-composed glyphs match the lookup keys.
	    * Tokenises into ``<...>`` event tokens (e.g. ``<evt_gasp>``) first and
	      single characters otherwise.
	    * Tokens missing from the lookup map to ``unk_id`` instead of raising.
	    """

	    # An event token such as <evt_gasp>, or else any single character.
	    # The alternation bar must stay unescaped: an escaped bar matches a
	    # literal pipe, and ordinary characters would then never be tokenised.
	    # "." does not match a newline, so newlines are dropped, not looked up.
	    _EVENT_RE = re.compile(r"<[^>]+>|.")

	    def __init__(self):
	        """Bind the module-level ``dicts`` table and pick the fallback ID."""
	        self.lookup = dicts
	        # Use a dedicated "<unk>" entry when the table has one; otherwise
	        # keep the previous fallback of ID 0.
	        self.unk_id = self.lookup.get("<unk>", 0)

	    def __call__(self, text: str):
	        """Return the symbol IDs for *text*, one per token, in order.

	        Args:
	            text: Input string; it is NFC-normalised before tokenising.

	        Returns:
	            A list of integer IDs. Tokens absent from ``self.lookup`` are
	            represented by ``self.unk_id``. Empty input gives ``[]``.
	        """
	        text = unicodedata.normalize("NFC", text)
	        return [
	            self.lookup.get(token, self.unk_id)
	            for token in self._EVENT_RE.findall(text)
	        ]

	# Scan the training list and report every token missing from the symbol table.
	tc = TextCleaner()
	miss = {}  # unknown token -> number of occurrences

	# Tokenisation used for the check: a <...> event token, otherwise a single
	# character ("." does not match a newline, so newlines are never reported).
	_token_re = re.compile(r"<[^>]+>|.")

	with open("/home/ubuntu/styletts2-ft/data/train_list.txt", encoding="utf-8") as f:
	    for line in f:
	        # Split on a plain "|" and check the second field of each line.
	        fields = line.rstrip("\n").split("|")
	        if len(fields) < 2:
	            continue  # blank or malformed line: no second field to check
	        text = unicodedata.normalize("NFC", fields[1])
	        for token in _token_re.findall(text):
	            # A token is "known" only if the lookup table has an entry for it;
	            # the returned IDs cannot tell a fallback ID from a real one.
	            if token not in tc.lookup:
	                miss[token] = miss.get(token, 0) + 1
	print("Unknown chars left:", list(miss))