# MyGO_VIts-bert / text/japanese.py
# Convert Japanese text to phonemes that are compatible with Julius
# https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
from transformers import AutoTokenizer
from text import punctuation, symbols
from num2words import num2words
import pyopenjtalk
import jaconv
# Mapping of hiragana to phonetic representation
hiragana_map = {
"ใ†ใ‚›ใ": " v a",
"ใ†ใ‚›ใƒ": " v i",
"ใ†ใ‚›ใ‡": " v e",
"ใ†ใ‚›ใ‰": " v o",
"ใ†ใ‚›ใ‚…": " by u",
"ใ…ใ‚›": " v u",
    # Added handling for ใ‚” and related forms
"ใ‚”ใ": " v a",
"ใ‚”ใƒ": " v i",
"ใ‚”ใ‡": " v e",
"ใ‚”ใ‰": " v o",
"ใ‚”ใ‚…": " by u",
    # Two-character conversion rules
"ใ‚ใ": " a a",
"ใ„ใƒ": " i i",
"ใ„ใ‡": " i e",
"ใ„ใ‚ƒ": " y a",
"ใ†ใ…": " u:",
"ใˆใ‡": " e e",
"ใŠใ‰": " o:",
"ใ‹ใ": " k a:",
"ใใƒ": " k i:",
"ใใ…": " k u:",
"ใใ‚ƒ": " ky a",
"ใใ‚…": " ky u",
"ใใ‚‡": " ky o",
"ใ‘ใ‡": " k e:",
"ใ“ใ‰": " k o:",
"ใŒใ": " g a:",
"ใŽใƒ": " g i:",
"ใใ…": " g u:",
"ใใ‚ƒ": " gy a",
"ใใ‚…": " gy u",
"ใใ‚‡": " gy o",
"ใ’ใ‡": " g e:",
"ใ”ใ‰": " g o:",
"ใ•ใ": " s a:",
"ใ—ใƒ": " sh i",
"ใ™ใ…": " s u:",
"ใ™ใ‚ƒ": " sh a",
"ใ™ใ‚…": " sh u",
"ใ™ใ‚‡": " sh o",
"ใ›ใ‡": " s e:",
"ใใ‰": " s o:",
"ใ–ใ": " z a:",
"ใ˜ใƒ": " j i:",
"ใšใ…": " z u:",
"ใšใ‚ƒ": " zy a",
"ใšใ‚…": " zy u",
"ใšใ‚‡": " zy o",
"ใœใ‡": " z e:",
"ใžใ‰": " z o:",
"ใŸใ": " t a:",
"ใกใƒ": " ch i",
"ใคใ": " ts a",
"ใคใƒ": " ts i",
"ใคใ…": " ts u",
"ใคใ‚ƒ": " ch a",
"ใคใ‚…": " ch u",
"ใคใ‚‡": " ch o",
"ใคใ‡": " ts e",
"ใคใ‰": " ts o",
"ใฆใ‡": " t e:",
"ใจใ‰": " t o:",
"ใ ใ": " d a:",
"ใขใƒ": " j i:",
"ใฅใ…": " d u:",
"ใฅใ‚ƒ": " zy a",
"ใฅใ‚…": " zy u",
"ใฅใ‚‡": " zy o",
"ใงใ‡": " d e:",
"ใชใ": " n a:",
"ใซใƒ": " n i:",
"ใฌใ…": " n u:",
"ใฌใ‚ƒ": " ny a",
"ใฌใ‚…": " ny u",
"ใฌใ‚‡": " ny o",
"ใญใ‡": " n e:",
"ใฎใ‰": " n o:",
"ใฏใ": " h a:",
"ใฒใƒ": " h i:",
"ใตใ…": " f u:",
"ใตใ‚ƒ": " hy a",
"ใธใ‡": " h e:",
"ใปใ‰": " h o:",
"ใฐใ": " b a:",
"ใณใƒ": " b i:",
"ใถใ…": " b u:",
"ใถใ‚…": " by u",
"ในใ‡": " b e:",
"ใผใ‰": " b o:",
"ใฑใ": " p a:",
"ใดใƒ": " p i:",
"ใทใ…": " p u:",
"ใทใ‚ƒ": " py a",
"ใทใ‚…": " py u",
"ใทใ‚‡": " py o",
"ใบใ‡": " p e:",
"ใฝใ‰": " p o:",
"ใพใ": " m a:",
"ใฟใƒ": " m i:",
"ใ‚€ใ…": " m u:",
"ใ‚€ใ‚ƒ": " my a",
"ใ‚€ใ‚…": " my u",
"ใ‚€ใ‚‡": " my o",
"ใ‚ใ‡": " m e:",
"ใ‚‚ใ‰": " m o:",
"ใ‚„ใ": " y a:",
"ใ‚†ใ…": " y u:",
"ใ‚†ใ‚ƒ": " y a:",
"ใ‚†ใ‚…": " y u:",
"ใ‚†ใ‚‡": " y o:",
"ใ‚ˆใ‰": " y o:",
"ใ‚‰ใ": " r a:",
"ใ‚Šใƒ": " r i:",
"ใ‚‹ใ…": " r u:",
"ใ‚‹ใ‚ƒ": " ry a",
"ใ‚‹ใ‚…": " ry u",
"ใ‚‹ใ‚‡": " ry o",
"ใ‚Œใ‡": " r e:",
"ใ‚ใ‰": " r o:",
"ใ‚ใ": " w a:",
"ใ‚’ใ‰": " o:",
"ใ†ใ‚›": " b u",
"ใงใƒ": " d i",
"ใงใ‚ƒ": " dy a",
"ใงใ‚…": " dy u",
"ใงใ‚‡": " dy o",
"ใฆใƒ": " t i",
"ใฆใ‚ƒ": " ty a",
"ใฆใ‚…": " ty u",
"ใฆใ‚‡": " ty o",
"ใ™ใƒ": " s i",
"ใšใ": " z u",
"ใšใƒ": " z i",
"ใšใ‡": " z e",
"ใšใ‰": " z o",
"ใใ‚ƒ": " ky a",
"ใใ‚…": " ky u",
"ใใ‚‡": " ky o",
"ใ—ใ‚ƒ": " sh a",
"ใ—ใ‚…": " sh u",
"ใ—ใ‡": " sh e",
"ใ—ใ‚‡": " sh o",
"ใกใ‚ƒ": " ch a",
"ใกใ‚…": " ch u",
"ใกใ‡": " ch e",
"ใกใ‚‡": " ch o",
"ใจใ…": " t u",
"ใจใ‚ƒ": " ty a",
"ใจใ‚…": " ty u",
"ใจใ‚‡": " ty o",
"ใฉใ": " d o ",
"ใฉใ…": " d u",
"ใฉใ‚ƒ": " dy a",
"ใฉใ‚…": " dy u",
"ใฉใ‚‡": " dy o",
"ใฉใ‰": " d o:",
"ใซใ‚ƒ": " ny a",
"ใซใ‚…": " ny u",
"ใซใ‚‡": " ny o",
"ใฒใ‚ƒ": " hy a",
"ใฒใ‚…": " hy u",
"ใฒใ‚‡": " hy o",
"ใฟใ‚ƒ": " my a",
"ใฟใ‚…": " my u",
"ใฟใ‚‡": " my o",
"ใ‚Šใ‚ƒ": " ry a",
"ใ‚Šใ‚…": " ry u",
"ใ‚Šใ‚‡": " ry o",
"ใŽใ‚ƒ": " gy a",
"ใŽใ‚…": " gy u",
"ใŽใ‚‡": " gy o",
"ใขใ‡": " j e",
"ใขใ‚ƒ": " j a",
"ใขใ‚…": " j u",
"ใขใ‚‡": " j o",
"ใ˜ใ‡": " j e",
"ใ˜ใ‚ƒ": " j a",
"ใ˜ใ‚…": " j u",
"ใ˜ใ‚‡": " j o",
"ใณใ‚ƒ": " by a",
"ใณใ‚…": " by u",
"ใณใ‚‡": " by o",
"ใดใ‚ƒ": " py a",
"ใดใ‚…": " py u",
"ใดใ‚‡": " py o",
"ใ†ใ": " u a",
"ใ†ใƒ": " w i",
"ใ†ใ‡": " w e",
"ใ†ใ‰": " w o",
"ใตใ": " f a",
"ใตใƒ": " f i",
"ใตใ‚…": " hy u",
"ใตใ‚‡": " hy o",
"ใตใ‡": " f e",
"ใตใ‰": " f o",
    # Single-kana conversion rules
"ใ‚": " a",
"ใ„": " i",
"ใ†": " u",
"ใ‚”": " v u", # ใ‚”ใฎๅ‡ฆ็†ใ‚’่ฟฝๅŠ 
"ใˆ": " e",
"ใŠ": " o",
"ใ‹": " k a",
"ใ": " k i",
"ใ": " k u",
"ใ‘": " k e",
"ใ“": " k o",
"ใ•": " s a",
"ใ—": " sh i",
"ใ™": " s u",
"ใ›": " s e",
"ใ": " s o",
"ใŸ": " t a",
"ใก": " ch i",
"ใค": " ts u",
"ใฆ": " t e",
"ใจ": " t o",
"ใช": " n a",
"ใซ": " n i",
"ใฌ": " n u",
"ใญ": " n e",
"ใฎ": " n o",
"ใฏ": " h a",
"ใฒ": " h i",
"ใต": " f u",
"ใธ": " h e",
"ใป": " h o",
"ใพ": " m a",
"ใฟ": " m i",
"ใ‚€": " m u",
"ใ‚": " m e",
"ใ‚‚": " m o",
"ใ‚‰": " r a",
"ใ‚Š": " r i",
"ใ‚‹": " r u",
"ใ‚Œ": " r e",
"ใ‚": " r o",
"ใŒ": " g a",
"ใŽ": " g i",
"ใ": " g u",
"ใ’": " g e",
"ใ”": " g o",
"ใ–": " z a",
"ใ˜": " j i",
"ใš": " z u",
"ใœ": " z e",
"ใž": " z o",
"ใ ": " d a",
"ใข": " j i",
"ใฅ": " z u",
"ใง": " d e",
"ใฉ": " d o",
"ใฐ": " b a",
"ใณ": " b i",
"ใถ": " b u",
"ใน": " b e",
"ใผ": " b o",
"ใฑ": " p a",
"ใด": " p i",
"ใท": " p u",
"ใบ": " p e",
"ใฝ": " p o",
"ใ‚„": " y a",
"ใ‚†": " y u",
"ใ‚ˆ": " y o",
"ใ‚": " w a",
"ใ‚": " i",
"ใ‚‘": " e",
"ใ‚“": " N",
"ใฃ": " q",
# ใ“ใ“ใพใงใซๅ‡ฆ็†ใ•ใ‚Œใฆใชใ„ ใใƒใ…ใ‡ใ‰ ใฏใใฎใพใพๅคงๆ–‡ๅญ—ๆ‰ฑใ„
"ใ": " a",
"ใƒ": " i",
"ใ…": " u",
"ใ‡": " e",
"ใ‰": " o",
"ใ‚Ž": " w a",
# ้•ท้Ÿณใฎๅ‡ฆ็†
# for (pattern, replace_str) in JULIUS_LONG_VOWEL:
# text = pattern.sub(replace_str, text)
# text = text.replace("o u", "o:") # ใŠใ† -> ใŠใƒผใฎ้Ÿณไพฟ
"ใƒผ": ":",
"ใ€œ": ":",
"โˆ’": ":",
"-": ":",
# ใใฎไป–็‰นๅˆฅใชๅ‡ฆ็†
"ใ‚’": " o",
# ใ“ใ“ใพใงใซๅ‡ฆ็†ใ•ใ‚Œใฆใ„ใชใ„ใ‚…็ญ‰ใ‚‚ใใฎใพใพๅคงๆ–‡ๅญ—ๆ‰ฑใ„๏ผˆ่ฟฝๅŠ ๏ผ‰
"ใ‚ƒ": " y a",
"ใ‚…": " y u",
"ใ‚‡": " y o",
}
def hiragana2p(txt: str) -> str:
"""
Modification of `jaconv.hiragana2julius`.
- avoid using `:`, instead, `ใ‚ใƒผใƒผใƒผ` -> `a a a a`.
- avoid converting `o u` to `o o` (because the input is already actual `yomi`).
- avoid using `N` for `ใ‚“` (for compatibility)
- use `v` for `ใ‚”` related text.
- add bare `ใ‚ƒ` `ใ‚…` `ใ‚‡` to `y a` `y u` `y o` (for compatibility).
"""
result = []
skip = 0
for i in range(len(txt)):
if skip:
skip -= 1
continue
for length in range(3, 0, -1):
if txt[i : i + length] in hiragana_map:
result.append(hiragana_map[txt[i : i + length]])
skip = length - 1
break
txt = "".join(result)
txt = txt.strip()
txt = txt.replace(":+", ":")
# ใ“ใ“ใพใง`jaconv.hiragana2julius`ใจ้Ÿณไพฟๅ‡ฆ็†ใจ้•ท้Ÿณๅ‡ฆ็†ใ‚’ใฎใžใ„ใฆๅŒใ˜
# ใ“ใ“ใ‹ใ‚‰`k a:: k i:`โ†’`k a a a k i i`ใฎใ‚ˆใ†ใซ`:`ใฎๆ•ฐใ ใ‘็นฐใ‚Š่ฟ”ใ™ๅ‡ฆ็†
pattern = r"(\w)(:*)"
replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2))
txt = re.sub(pattern, replacement, txt)
txt = txt.replace("N", "n") # ไฟƒ้ŸณใฎNใ‚’nใซๅค‰ๆ›
return txt
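# Illustrative examples of the expansion above (traced by hand, not executed):
#   hiragana2p("ใใƒใ‚“")   -> "k i i n"   ("ใใƒ" maps to " k i:", the ":" then repeats the vowel)
#   hiragana2p("ใ‚ใƒผใƒผใƒผ") -> "a a a a"  (each "ใƒผ" becomes ":" and repeats the preceding vowel)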
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phonemes."""
text = text.strip()
if text == "ใƒผ":
return ["ใƒผ"]
elif text.startswith("ใƒผ"):
return ["ใƒผ"] + kata2phoneme(text[1:])
res = []
prev = None
while text:
if re.match(_MARKS, text):
res.append(text)
text = text[1:]
continue
if text.startswith("ใƒผ"):
if prev:
res.append(prev[-1])
text = text[1:]
continue
res += hiragana2p(jaconv.kata2hira(text)).split(" ")
break
# res = _COLON_RX.sub(":", res)
return res
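# Illustrative examples (traced by hand against the mapping above):
#   kata2phoneme("ใ‚ฝใƒผ") -> ["s", "o", "o"]
#   kata2phoneme("ใƒผใ‚ฝ") -> ["ใƒผ", "s", "o"]  (a leading "ใƒผ" is kept verbatim)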
_SYMBOL_TOKENS = set(list("ใƒปใ€ใ€‚๏ผŸ๏ผ"))
_NO_YOMI_TOKENS = set(list("ใ€Œใ€ใ€Žใ€โ€•๏ผˆ๏ผ‰๏ผป๏ผฝ[]"))
_MARKS = re.compile(
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
def text2sep_kata(text: str):
parsed = pyopenjtalk.run_frontend(text)
res = []
sep = []
for parts in parsed:
word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
"โ€™", ""
)
if yomi:
if re.match(_MARKS, yomi):
if len(word) > 1:
word = [replace_punctuation(i) for i in list(word)]
yomi = word
res += yomi
sep += word
continue
elif word not in rep_map.keys() and word not in rep_map.values():
word = ","
yomi = word
res.append(yomi)
else:
if word in _SYMBOL_TOKENS:
res.append(word)
elif word in ("ใฃ", "ใƒƒ"):
res.append("ใƒƒ")
elif word in _NO_YOMI_TOKENS:
pass
else:
res.append(word)
sep.append(word)
return sep, res, get_accent(parsed)
def get_accent(parsed):
labels = pyopenjtalk.make_label(parsed)
phonemes = []
accents = []
for n, label in enumerate(labels):
phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
if phoneme not in ["sil", "pau"]:
phonemes.append(phoneme.replace("cl", "q").lower())
else:
continue
a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
a2 = int(re.search(r"\+(\d+)\+", label).group(1))
if re.search(r"\-([^\+]*)\+", labels[n + 1]).group(1) in ["sil", "pau"]:
a2_next = -1
else:
a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
# Falling
if a1 == 0 and a2_next == a2 + 1:
accents.append(-1)
# Rising
elif a2 == 1 and a2_next == 2:
accents.append(1)
else:
accents.append(0)
return list(zip(phonemes, accents))
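# Note on the full-context label fields parsed above (standard HTS-style labels
# emitted by pyopenjtalk): "-x+" carries the current phoneme, and "/A:a1+a2+"
# carries a1 = mora position relative to the accent nucleus and a2 = mora index
# within the accent phrase. Hence a1 == 0 marks the nucleus (pitch falls on the
# next mora) and a2 == 1 followed by a2_next == 2 marks the initial rise.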
_ALPHASYMBOL_YOMI = {
"#": "ใ‚ทใƒฃใƒผใƒ—",
"%": "ใƒ‘ใƒผใ‚ปใƒณใƒˆ",
"&": "ใ‚ขใƒณใƒ‰",
"+": "ใƒ—ใƒฉใ‚น",
"-": "ใƒžใ‚คใƒŠใ‚น",
":": "ใ‚ณใƒญใƒณ",
";": "ใ‚ปใƒŸใ‚ณใƒญใƒณ",
"<": "ๅฐใชใ‚Š",
"=": "ใ‚คใ‚ณใƒผใƒซ",
">": "ๅคงใชใ‚Š",
"@": "ใ‚ขใƒƒใƒˆ",
"a": "ใ‚จใƒผ",
"b": "ใƒ“ใƒผ",
"c": "ใ‚ทใƒผ",
"d": "ใƒ‡ใ‚ฃใƒผ",
"e": "ใ‚คใƒผ",
"f": "ใ‚จใƒ•",
"g": "ใ‚ธใƒผ",
"h": "ใ‚จใ‚คใƒ",
"i": "ใ‚ขใ‚ค",
"j": "ใ‚ธใ‚งใƒผ",
"k": "ใ‚ฑใƒผ",
"l": "ใ‚จใƒซ",
"m": "ใ‚จใƒ ",
"n": "ใ‚จใƒŒ",
"o": "ใ‚ชใƒผ",
"p": "ใƒ”ใƒผ",
"q": "ใ‚ญใƒฅใƒผ",
"r": "ใ‚ขใƒผใƒซ",
"s": "ใ‚จใ‚น",
"t": "ใƒ†ใ‚ฃใƒผ",
"u": "ใƒฆใƒผ",
"v": "ใƒ–ใ‚ค",
"w": "ใƒ€ใƒ–ใƒชใƒฅใƒผ",
"x": "ใ‚จใƒƒใ‚ฏใ‚น",
"y": "ใƒฏใ‚ค",
"z": "ใ‚ผใƒƒใƒˆ",
"ฮฑ": "ใ‚ขใƒซใƒ•ใ‚ก",
"ฮฒ": "ใƒ™ใƒผใ‚ฟ",
"ฮณ": "ใ‚ฌใƒณใƒž",
"ฮด": "ใƒ‡ใƒซใ‚ฟ",
"ฮต": "ใ‚คใƒ—ใ‚ทใƒญใƒณ",
"ฮถ": "ใ‚ผใƒผใ‚ฟ",
"ฮท": "ใ‚คใƒผใ‚ฟ",
"ฮธ": "ใ‚ทใƒผใ‚ฟ",
"ฮน": "ใ‚คใ‚ชใ‚ฟ",
"ฮบ": "ใ‚ซใƒƒใƒ‘",
"ฮป": "ใƒฉใƒ ใƒ€",
"ฮผ": "ใƒŸใƒฅใƒผ",
"ฮฝ": "ใƒ‹ใƒฅใƒผ",
"ฮพ": "ใ‚ฏใ‚ตใ‚ค",
"ฮฟ": "ใ‚ชใƒŸใ‚ฏใƒญใƒณ",
"ฯ€": "ใƒ‘ใ‚ค",
"ฯ": "ใƒญใƒผ",
"ฯƒ": "ใ‚ทใ‚ฐใƒž",
"ฯ„": "ใ‚ฟใ‚ฆ",
"ฯ…": "ใ‚ฆใƒ—ใ‚ทใƒญใƒณ",
"ฯ†": "ใƒ•ใ‚กใ‚ค",
"ฯ‡": "ใ‚ซใ‚ค",
"ฯˆ": "ใƒ—ใ‚ตใ‚ค",
"ฯ‰": "ใ‚ชใƒกใ‚ฌ",
}
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ใƒ‰ใƒซ", "ยฅ": "ๅ††", "ยฃ": "ใƒใƒณใƒ‰", "โ‚ฌ": "ใƒฆใƒผใƒญ"}
_CURRENCY_RX = re.compile(r"([$ยฅยฃโ‚ฌ])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
def japanese_convert_numbers_to_words(text: str) -> str:
res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
return res
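# Illustrative examples (exact readings depend on the installed num2words version):
#   japanese_convert_numbers_to_words("1,000ๅ††") -> "ๅƒๅ††"   (separator stripped first)
#   japanese_convert_numbers_to_words("$100")    -> "็™พใƒ‰ใƒซ"  (currency symbol moved after the amount)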
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
def is_japanese_character(char):
    # Unicode ranges covering the Japanese writing system
    japanese_ranges = [
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x4E00, 0x9FFF),  # Kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        # Further kanji extension ranges can be added here as needed
    ]
    # Get the character's Unicode code point
    char_code = ord(char)
    # Check whether the code point falls in any Japanese range
for start, end in japanese_ranges:
if start <= char_code <= end:
return True
return False
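# e.g. is_japanese_character("ๆผข") -> True, is_japanese_character("A") -> False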
rep_map = {
"๏ผš": ",",
"๏ผ›": ",",
"๏ผŒ": ",",
"ใ€‚": ".",
"๏ผ": "!",
"๏ผŸ": "?",
"\n": ".",
"๏ผŽ": ".",
"โ€ฆ": "...",
"ยทยทยท": "...",
"ใƒปใƒปใƒป": "...",
"ยท": ",",
"ใƒป": ",",
"ใ€": ",",
"$": ".",
"โ€œ": "'",
"โ€": "'",
'"': "'",
"โ€˜": "'",
"โ€™": "'",
"๏ผˆ": "'",
"๏ผ‰": "'",
"(": "'",
")": "'",
"ใ€Š": "'",
"ใ€‹": "'",
"ใ€": "'",
"ใ€‘": "'",
"[": "'",
"]": "'",
"โ€”": "-",
"โˆ’": "-",
"๏ฝž": "-",
"~": "-",
"ใ€Œ": "'",
"ใ€": "'",
}
def replace_punctuation(text):
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
+ "".join(punctuation)
+ r"]+",
"",
replaced_text,
)
return replaced_text
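# Illustrative example, assuming the shared `punctuation` list includes "," and "!":
#   replace_punctuation("ใ“ใ‚“ใซใกใฏ๏ผŒไธ–็•Œ๏ผ") -> "ใ“ใ‚“ใซใกใฏ,ไธ–็•Œ!"
# Full-width marks are mapped via rep_map; anything outside the kept Unicode
# ranges and the punctuation list is dropped entirely.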
def text_normalize(text):
res = unicodedata.normalize("NFKC", text)
res = japanese_convert_numbers_to_words(res)
# res = "".join([i for i in res if is_japanese_character(i)])
res = replace_punctuation(res)
res = res.replace("ใ‚™", "")
return res
def distribute_phone(n_phone, n_word):
phones_per_word = [0] * n_word
for task in range(n_phone):
min_tasks = min(phones_per_word)
min_index = phones_per_word.index(min_tasks)
phones_per_word[min_index] += 1
return phones_per_word
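# e.g. distribute_phone(5, 2) -> [3, 2]: each phone goes to the word with the
# fewest phones so far, so per-word counts differ by at most one.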
def handle_long(sep_phonemes):
for i in range(len(sep_phonemes)):
if sep_phonemes[i][0] == "ใƒผ":
sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
if "ใƒผ" in sep_phonemes[i]:
for j in range(len(sep_phonemes[i])):
if sep_phonemes[i][j] == "ใƒผ":
sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
return sep_phonemes
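# e.g. handle_long([["k", "i"], ["ใƒผ", "ใƒผ"]]) -> [["k", "i"], ["i", "i"]]:
# every "ใƒผ" is replaced by the phoneme immediately before it, across word
# boundaries as well.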
tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese-char-wwm")
def align_tones(phones, tones):
res = []
for pho in phones:
temp = [0] * len(pho)
for idx, p in enumerate(pho):
if len(tones) == 0:
break
if p == tones[0][0]:
temp[idx] = tones[0][1]
if idx > 0:
temp[idx] += temp[idx - 1]
tones.pop(0)
temp = [0] + temp
temp = temp[:-1]
if -1 in temp:
temp = [i + 1 for i in temp]
res.append(temp)
res = [i for j in res for i in j]
assert not any([i < 0 for i in res]) and not any([i > 1 for i in res])
return res
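# align_tones consumes the (phoneme, accent) pairs from get_accent while
# walking the per-word phoneme lists: matching accents are accumulated along
# each word, shifted right by one phoneme, and any word containing a fall (-1)
# is lifted by one, so the flattened result contains only 0 (low) / 1 (high).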
def rearrange_tones(tones, phones):
res = [0] * len(tones)
for i in range(len(tones)):
if i == 0:
            if phones[i] not in punctuation:  # check the phone, not the tone, against punctuation
res[i] = 1
elif tones[i] == prev:
if phones[i] in punctuation:
res[i] = 0
else:
res[i] = 1
elif tones[i] > prev:
res[i] = 2
elif tones[i] < prev:
res[i - 1] = 3
res[i] = 1
prev = tones[i]
return res
def g2p(norm_text):
sep_text, sep_kata, acc = text2sep_kata(norm_text)
sep_tokenized = []
for i in sep_text:
if i not in punctuation:
sep_tokenized.append(tokenizer.tokenize(i))
else:
sep_tokenized.append([i])
sep_phonemes = handle_long([kata2phoneme(i) for i in sep_kata])
# ๅผ‚ๅธธๅค„็†๏ผŒMeCabไธ่ฎค่ฏ†็š„่ฏ็š„่ฏไผšไธ€่ทฏไผ ๅˆฐ่ฟ™้‡Œๆฅ๏ผŒ็„ถๅŽ็‚ธๆŽ‰ใ€‚็›ฎๅ‰ๆฅ็œ‹ๅชๆœ‰้‚ฃไบ›่ถ…็บง็จ€ๆœ‰็š„็”Ÿๅƒป่ฏไผšๅ‡บ็Žฐ่ฟ™็งๆƒ…ๅ†ต
for i in sep_phonemes:
for j in i:
assert j in symbols, (sep_text, sep_kata, sep_phonemes)
tones = align_tones(sep_phonemes, acc)
word2ph = []
for token, phoneme in zip(sep_tokenized, sep_phonemes):
phone_len = len(phoneme)
word_len = len(token)
        word2ph += distribute_phone(phone_len, word_len)
phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
# tones = [0] + rearrange_tones(tones, phones[1:-1]) + [0]
tones = [0] + tones + [0]
word2ph = [1] + word2ph + [1]
assert len(phones) == len(tones)
return phones, tones, word2ph
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese")
text = "hello,ใ“ใ‚“ใซใกใฏใ€ไธ–็•Œใƒผ๏ผโ€ฆโ€ฆ"
from text.japanese_bert import get_bert_feature
text = text_normalize(text)
print(text)
phones, tones, word2ph = g2p(text)
bert = get_bert_feature(text, word2ph)
print(phones, tones, word2ph, bert.shape)