Spaces:

hon9kon9ize
/

tts

Running

App Files Files Community

tts / text /cantonese.py

indiejoseph

first commit

af3d42a 4 months ago

raw

history blame

7.25 kB

	from text.symbols import punctuation
	import re
	import unicodedata
	import cn2an
	import pycantonese
	import jieba
	import csv


	# Load the user dictionary at ./text/yue_dict.txt into jieba before any
	# segmentation happens. Both data paths below are relative to the current
	# working directory.
	jieba.load_userdict("./text/yue_dict.txt")

	# word -> list of jyutping readings, kept in the order they appear in the CSV.
	jyutping_dict = {}

	with open("./text/jyutping.csv", "r", encoding="utf-8") as f:
	    for line in f:
	        line = line.strip()
	        if line:
	            # Each non-blank row is expected to be exactly "word,jyutping".
	            word, jyutping = line.split(",")
	            jyutping_dict.setdefault(word, []).append(jyutping)


	def normalizer(x):
	    """Return *x* after passing it through ``cn2an.transform`` in "an2cn" mode."""
	    return cn2an.transform(x, "an2cn")


	# A bare "la"/"ga" syllable followed by a tone digit. The alternation must be
	# an unescaped "|"; an escaped pipe would only match the literal text "la|ga".
	_SHORT_A_PARTICLE = re.compile(r"^(?:la|ga)[1-6]$")


	def word2jyutping(word):
	    """Convert the CJK ideographs in *word* to a space-separated jyutping string.

	    Characters whose Unicode name does not start with "CJK UNIFIED IDEOGRAPH"
	    are skipped. Each remaining character is looked up on its own with
	    ``pycantonese.characters_to_jyutping``.

	    Args:
	        word: text to convert, typically one segmented word.

	    Returns:
	        The readings joined by single spaces; an empty string when *word*
	        contains no CJK unified ideograph.

	    Raises:
	        ValueError: if the lookup produced ``None`` for any character.
	    """
	    jyutpings = [
	        pycantonese.characters_to_jyutping(w)[0][1]
	        for w in word
	        if unicodedata.name(w, "").startswith("CJK UNIFIED IDEOGRAPH")
	    ]

	    # Validate before touching the entries: running a regex over None would
	    # raise TypeError and hide the intended error message.
	    if None in jyutpings:
	        raise ValueError(f"Failed to convert {word} to jyutping: {jyutpings}")

	    # la1 -> laa1, ga1 -> gaa1
	    return " ".join(
	        j.replace("a", "aa") if _SHORT_A_PARTICLE.match(j) else j
	        for j in jyutpings
	    )


	# Initials accepted by parse_jyutping; the empty string means "no initial".
	INITIALS = [""] + "b c d f g gw h j k kw l m n ng p s t w z".split()

	# Finals accepted by parse_jyutping, grouped by nucleus; the last two entries
	# are the syllabic nasals "m" and "ng".
	FINALS = (
	    "aa aai aau aam aan aang aap aat aak "
	    "ai au am an ang ap at ak "
	    "e ei eu em eng ep ek "
	    "i iu im in ing ip it ik "
	    "o oi ou on ong ot ok "
	    "oe oeng oek "
	    "eoi eon eot "
	    "u ui un ung ut uk "
	    "yu yun yut "
	    "m ng"
	).split()

	# Punctuation lookup used by replace_punctuation: every key found in the
	# input is replaced by its value. Keys are matched as literal text.
	rep_map = {
	# Clause-level separators collapse to a comma.
	"：": ",",
	"︰": ",",
	"；": ",",
	"，": ",",
	"﹐": ",",
	# Sentence enders map to their ASCII counterparts.
	"。": ".",
	"！": "!",
	"？": "?",
	"﹖": "?",
	"﹗": "!",
	# A line break is treated like a full stop.
	"\n": ".",
	"·": ",",
	"、": ",",
	"丶": ",",
	# Ellipsis variants become the single-character ellipsis.
	"...": "…",
	"⋯": "…",
	# NOTE(review): "$" is mapped to a full stop; the intent is not visible here.
	"$": ".",
	# Every quote and bracket style collapses to an ASCII apostrophe.
	"“": "'",
	"”": "'",
	'"': "'",
	"‘": "'",
	"’": "'",
	"（": "'",
	"）": "'",
	"(": "'",
	")": "'",
	"《": "'",
	"》": "'",
	"【": "'",
	"】": "'",
	"[": "'",
	"]": "'",
	# Dashes, tildes and underscore collapse to a hyphen.
	"—": "-",
	"～": "-",
	"~": "-",
	"「": "'",
	"」": "'",
	"_": "-",
	}

	# Character substitutions applied by replace_chars, one str.replace per entry
	# in the order listed here.
	# NOTE(review): several pairs below render identically. Presumably the keys
	# are compatibility or variant code points being mapped to the standard
	# ideograph; confirm the exact code points survived copy/paste before editing.
	replacement_chars = {
	"\n": " ",
	'ㄧ': '一',
	'—': '一',
	'更': '更',
	'不': '不',
	'料': '料',
	'聯': '聯',
	'行': '行',
	'利': '利',
	# Variant glyphs replaced by a visibly different standard form.
	'謢': '護',
	'岀': '出',
	'鎭': '鎮',
	'戯': '戲',
	'旣': '既',
	'立': '立',
	'來': '來',
	'年': '年',
	'㗇': '蝦',
	}


	def replace_punctuation(text):
	    """Map punctuation through ``rep_map`` and drop every other stray character.

	    First every ``rep_map`` key occurring in *text* is replaced by its value.
	    Then only characters that are CJK unified ideographs or members of
	    ``punctuation`` are kept; everything else (Latin letters, digits,
	    spaces, ...) is removed.

	    Args:
	        text: input string.

	    Returns:
	        The filtered string.
	    """
	    # The keys must be joined with a bare "|" to form an alternation. Joining
	    # with an escaped pipe builds one long literal that never matches.
	    # Longest keys go first so a multi-character key such as "..." cannot be
	    # pre-empted by a shorter key sharing its prefix.
	    keys = sorted(rep_map, key=len, reverse=True)
	    pattern = re.compile("|".join(re.escape(p) for p in keys))
	    replaced_text = pattern.sub(lambda m: rep_map[m.group()], text)

	    return "".join(
	        c
	        for c in replaced_text
	        if c in punctuation
	        or unicodedata.name(c, "").startswith("CJK UNIFIED IDEOGRAPH")
	    )


	def replace_chars(text):
	    """Apply every ``replacement_chars`` substitution to *text*, in table order."""
	    for old in replacement_chars:
	        text = text.replace(old, replacement_chars[old])
	    return text


	def word_segmentation(text):
	    """Segment *text* with jieba and return the result of ``jieba.cut`` as-is."""
	    return jieba.cut(text)


	def text_normalize(text):
	    """Normalize raw input text before grapheme-to-phoneme conversion.

	    The steps run in this order: strip surrounding whitespace, ``normalizer``,
	    ``replace_punctuation``, then ``replace_chars``.
	    """
	    stripped = text.strip()
	    return replace_chars(replace_punctuation(normalizer(stripped)))


	def jyuping_to_initials_finals_tones(jyuping_syllables):
	    """Expand jyutping syllables into parallel phone, tone and word2ph lists.

	    A punctuation item contributes one phone (the mark itself) with tone 0
	    and a word2ph entry of 1. Any other item is parsed with
	    ``parse_jyutping`` and contributes two phones (initial, final) that both
	    carry the syllable's tone, plus a word2ph entry of 2.

	    Returns:
	        ``(initials_finals, tones, word2ph)``.
	    """
	    phones, tones, word2ph = [], [], []

	    for syl in jyuping_syllables:
	        if syl not in punctuation:
	            onset, rime, tone = parse_jyutping(syl)
	            phones += [onset, rime]
	            tones += [tone, tone]
	            word2ph.append(2)
	            continue

	        # Punctuation passes through as a single toneless phone.
	        phones.append(syl)
	        tones.append(0)
	        word2ph.append(1)

	    assert len(phones) == len(tones)
	    return phones, tones, word2ph


	# Whole-text overrides consulted first by get_jyutping(): a key is an input
	# text and the value is its space-separated jyutping. The loader below is
	# commented out, so this table stays empty and that lookup never matches.
	wordshk_juytping = {}

	# Disabled loader, kept for reference. It would key each CSV row by
	# text_normalize(row[0]) and store row[1] as the jyutping string.
	# with open("/notebooks/bert-vits2/Bert-VITS2-Cantonese/wordshk_juytping.csv", "r") as csv_file:
	# csv_reader = csv.reader(csv_file, delimiter=',')

	# for row in csv_reader:
	# wordshk_juytping[text_normalize(row[0])] = row[1]


	# One or more "<letters><tone digits>" syllables, each optionally followed by
	# a single space, e.g. "liu4 ge3" or "liu4".
	_JYUTPING_SEQUENCE = re.compile(r"^([a-z]+[1-6]+[ ]?)+$")


	def get_jyutping(text):
	    """Convert normalized *text* into a flat list of jyutping syllables.

	    If *text* has an entry in ``wordshk_juytping`` that entry is returned
	    directly. Otherwise the text is segmented with ``word_segmentation``;
	    punctuation tokens pass through unchanged, and every other word is
	    looked up in ``jyutping_dict`` (first reading wins) with
	    ``word2jyutping`` as the fallback.

	    Args:
	        text: text to convert.

	    Returns:
	        A list mixing jyutping syllables and punctuation marks, in text order.

	    Raises:
	        ValueError: if a word's reading does not look like a sequence of
	            jyutping syllables.
	    """
	    if text in wordshk_juytping:
	        return wordshk_juytping[text].split(" ")

	    jyutping_array = []

	    for word in word_segmentation(text):
	        if word in punctuation:
	            jyutping_array.append(word)
	            continue

	        if word in jyutping_dict:
	            jyutpings = jyutping_dict[word][0]
	        else:
	            jyutpings = word2jyutping(word)

	        # match multiple jyutping eg: liu4 ge3, or single jyutping eg: liu4
	        if not _JYUTPING_SEQUENCE.search(jyutpings):
	            raise ValueError(
	                f"Failed to convert {word} to jyutping: {jyutpings}")

	        # The pattern tolerates a trailing space; split() without an argument
	        # keeps that from producing an empty syllable.
	        jyutping_array.extend(jyutpings.split())

	    return jyutping_array


	def get_bert_feature(text, word2ph):
	    """Delegate to ``text.cantonese_bert.get_bert_feature`` and return its result.

	    The module is imported inside the function, so it is only loaded on the
	    first call rather than when this file is imported.
	    """
	    from text import cantonese_bert

	    features = cantonese_bert.get_bert_feature(text, word2ph)
	    return features


	def parse_jyutping(jyutping):
	    """Split one jyutping syllable into ``[initial, final, tone]``.

	    Syllabic nasals are recognised first: a three-character string starting
	    with "ng" (e.g. "ng5") or a two-character string starting with "m"
	    (e.g. "m4") has an empty initial. Otherwise a leading "ng", "gw" or "kw",
	    or failing that a single leading consonant letter, is taken as the
	    initial. Anything else leaves the initial empty. The last character is
	    the tone and the remainder is the final.

	    Args:
	        jyutping: one syllable ending in a tone digit, e.g. "gwong2".

	    Returns:
	        ``[init, final, tone]`` with ``init`` drawn from ``INITIALS``,
	        ``final`` from ``FINALS`` and ``tone`` an ``int``.

	    Raises:
	        ValueError: if the string is shorter than two characters, does not
	            end in a digit, or yields an initial or final outside the
	            known tables.
	    """
	    orig_jyutping = jyutping

	    if len(jyutping) < 2:
	        raise ValueError(f"Jyutping string too short: {jyutping}")

	    init = ""
	    syllabic_ng = jyutping.startswith("ng") and len(jyutping) == 3
	    syllabic_m = jyutping[0] == "m" and len(jyutping) == 2

	    if not (syllabic_ng or syllabic_m):
	        # Two-letter initials must be tried before the single-letter ones.
	        if jyutping.startswith(("ng", "gw", "kw")):
	            init, jyutping = jyutping[:2], jyutping[2:]
	        elif jyutping[0] in "bpmfdtnlgkhwzcsj":
	            init, jyutping = jyutping[0], jyutping[1:]

	    try:
	        # IndexError covers the case where nothing is left after the initial.
	        tone = int(jyutping[-1])
	    except (IndexError, ValueError) as err:
	        raise ValueError(
	            f"Jyutping string does not end with a tone number, in {orig_jyutping}"
	        ) from err
	    final = jyutping[:-1]

	    # Raised explicitly rather than asserted so the check survives ``python -O``.
	    if init not in INITIALS:
	        raise ValueError(f"Invalid initial: {init}, in {orig_jyutping}")

	    if final not in FINALS:
	        raise ValueError(f"Invalid final: {final}, in {orig_jyutping}")

	    return [init, final, tone]


	def g2p(text):
	    """Convert normalized *text* to padded phone, tone and word2ph lists.

	    The text goes through ``get_jyutping`` and
	    ``jyuping_to_initials_finals_tones``. Each list is then padded at both
	    ends: phones with "_", tones with 0 and word2ph with 1.

	    Returns:
	        ``(phones, tones, word2ph)``.
	    """
	    syllables = get_jyutping(text)
	    phones, tones, word2ph = jyuping_to_initials_finals_tones(syllables)
	    return ["_", *phones, "_"], [0, *tones, 0], [1, *word2ph, 1]


	if __name__ == "__main__":
	    # Manual smoke test: normalize one sentence, run g2p on it and print the
	    # shape of the BERT features computed for it.
	    from text.cantonese_bert import get_bert_feature

	    # Alternative sample sentences:
	    # text = "Apple BB 你點解會咁柒㗎？我真係唔該晒你呀！123"
	    # text = "我個 app 嘅介紹文想由你寫，因為我唔知從一般用家角度要細緻到乜程度"
	    # text = "佢哋最叻咪就係去㗇人傷害人,得個殼咋!"
	    text = "佢邊係想辭工吖，跳下草裙舞想加人工之嘛。"

	    normalized = text_normalize(text)
	    print('normalized text', normalized)

	    phones, tones, word2ph = g2p(normalized)
	    print(phones, tones, word2ph)

	    features = get_bert_feature(normalized, word2ph)
	    print(features.shape)