OpenVoice-Srt

Running

App Files Files Community

OpenVoice-Srt / text /mandarin.py

kevinwang676

Upload folder using huggingface_hub

23d4b26 11 months ago

raw

history blame contribute delete

7.72 kB

	import os
	import sys
	import re
	from pypinyin import lazy_pinyin, BOPOMOFO
	import jieba
	import cn2an
	import logging


	# Ordered (compiled pattern, bopomofo spelling) pairs, one per Latin letter.
	# Each pattern is compiled case-insensitively, so upper- and lower-case
	# letters receive the same spelling.
	_latin_to_bopomofo = [
	    (re.compile(letter, re.IGNORECASE), spelling)
	    for letter, spelling in (
	        ('a', 'ㄟˉ'),
	        ('b', 'ㄅㄧˋ'),
	        ('c', 'ㄙㄧˉ'),
	        ('d', 'ㄉㄧˋ'),
	        ('e', 'ㄧˋ'),
	        ('f', 'ㄝˊㄈㄨˋ'),
	        ('g', 'ㄐㄧˋ'),
	        ('h', 'ㄝˇㄑㄩˋ'),
	        ('i', 'ㄞˋ'),
	        ('j', 'ㄐㄟˋ'),
	        ('k', 'ㄎㄟˋ'),
	        ('l', 'ㄝˊㄛˋ'),
	        ('m', 'ㄝˊㄇㄨˋ'),
	        ('n', 'ㄣˉ'),
	        ('o', 'ㄡˉ'),
	        ('p', 'ㄆㄧˉ'),
	        ('q', 'ㄎㄧㄡˉ'),
	        ('r', 'ㄚˋ'),
	        ('s', 'ㄝˊㄙˋ'),
	        ('t', 'ㄊㄧˋ'),
	        ('u', 'ㄧㄡˉ'),
	        ('v', 'ㄨㄧˉ'),
	        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
	        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
	        ('y', 'ㄨㄞˋ'),
	        ('z', 'ㄗㄟˋ'),
	    )
	]

	# Ordered (compiled pattern, romaji) pairs for bopomofo symbols, tone marks
	# and full-width punctuation.  Order matters: multi-symbol entries such as
	# 'ㄅㄛ' are listed before the single symbols they start with.
	_bopomofo_to_romaji = [
	    (re.compile(symbol), romaji)
	    for symbol, romaji in (
	        ('ㄅㄛ', 'p⁼wo'),
	        ('ㄆㄛ', 'pʰwo'),
	        ('ㄇㄛ', 'mwo'),
	        ('ㄈㄛ', 'fwo'),
	        ('ㄅ', 'p⁼'),
	        ('ㄆ', 'pʰ'),
	        ('ㄇ', 'm'),
	        ('ㄈ', 'f'),
	        ('ㄉ', 't⁼'),
	        ('ㄊ', 'tʰ'),
	        ('ㄋ', 'n'),
	        ('ㄌ', 'l'),
	        ('ㄍ', 'k⁼'),
	        ('ㄎ', 'kʰ'),
	        ('ㄏ', 'h'),
	        ('ㄐ', 'ʧ⁼'),
	        ('ㄑ', 'ʧʰ'),
	        ('ㄒ', 'ʃ'),
	        ('ㄓ', 'ʦ`⁼'),
	        ('ㄔ', 'ʦ`ʰ'),
	        ('ㄕ', 's`'),
	        ('ㄖ', 'ɹ`'),
	        ('ㄗ', 'ʦ⁼'),
	        ('ㄘ', 'ʦʰ'),
	        ('ㄙ', 's'),
	        ('ㄚ', 'a'),
	        ('ㄛ', 'o'),
	        ('ㄜ', 'ə'),
	        ('ㄝ', 'e'),
	        ('ㄞ', 'ai'),
	        ('ㄟ', 'ei'),
	        ('ㄠ', 'au'),
	        ('ㄡ', 'ou'),
	        ('ㄧㄢ', 'yeNN'),
	        ('ㄢ', 'aNN'),
	        ('ㄧㄣ', 'iNN'),
	        ('ㄣ', 'əNN'),
	        ('ㄤ', 'aNg'),
	        ('ㄧㄥ', 'iNg'),
	        ('ㄨㄥ', 'uNg'),
	        ('ㄩㄥ', 'yuNg'),
	        ('ㄥ', 'əNg'),
	        ('ㄦ', 'əɻ'),
	        ('ㄧ', 'i'),
	        ('ㄨ', 'u'),
	        ('ㄩ', 'ɥ'),
	        ('ˉ', '→'),
	        ('ˊ', '↑'),
	        ('ˇ', '↓↑'),
	        ('ˋ', '↓'),
	        ('˙', ''),
	        ('，', ','),
	        ('。', '.'),
	        ('！', '!'),
	        ('？', '?'),
	        ('—', '-'),
	    )
	]

	# Ordered (compiled pattern, IPA) pairs applied to romaji output.  Patterns
	# are compiled case-insensitively, and the longer sequences come first.
	_romaji_to_ipa = [
	    (re.compile(romaji, re.IGNORECASE), ipa)
	    for romaji, ipa in (
	        ('ʃy', 'ʃ'),
	        ('ʧʰy', 'ʧʰ'),
	        ('ʧ⁼y', 'ʧ⁼'),
	        ('NN', 'n'),
	        ('Ng', 'ŋ'),
	        ('y', 'j'),
	        ('h', 'x'),
	    )
	]

	# Ordered (compiled pattern, IPA) pairs for bopomofo symbols, tone marks and
	# full-width punctuation.  Order matters: multi-symbol entries such as
	# 'ㄧㄢ' are listed before the single symbols they contain.
	_bopomofo_to_ipa = [
	    (re.compile(symbol), ipa)
	    for symbol, ipa in (
	        ('ㄅㄛ', 'p⁼wo'),
	        ('ㄆㄛ', 'pʰwo'),
	        ('ㄇㄛ', 'mwo'),
	        ('ㄈㄛ', 'fwo'),
	        ('ㄅ', 'p⁼'),
	        ('ㄆ', 'pʰ'),
	        ('ㄇ', 'm'),
	        ('ㄈ', 'f'),
	        ('ㄉ', 't⁼'),
	        ('ㄊ', 'tʰ'),
	        ('ㄋ', 'n'),
	        ('ㄌ', 'l'),
	        ('ㄍ', 'k⁼'),
	        ('ㄎ', 'kʰ'),
	        ('ㄏ', 'x'),
	        ('ㄐ', 'tʃ⁼'),
	        ('ㄑ', 'tʃʰ'),
	        ('ㄒ', 'ʃ'),
	        ('ㄓ', 'ts`⁼'),
	        ('ㄔ', 'ts`ʰ'),
	        ('ㄕ', 's`'),
	        ('ㄖ', 'ɹ`'),
	        ('ㄗ', 'ts⁼'),
	        ('ㄘ', 'tsʰ'),
	        ('ㄙ', 's'),
	        ('ㄚ', 'a'),
	        ('ㄛ', 'o'),
	        ('ㄜ', 'ə'),
	        ('ㄝ', 'ɛ'),
	        ('ㄞ', 'aɪ'),
	        ('ㄟ', 'eɪ'),
	        ('ㄠ', 'ɑʊ'),
	        ('ㄡ', 'oʊ'),
	        ('ㄧㄢ', 'jɛn'),
	        ('ㄩㄢ', 'ɥæn'),
	        ('ㄢ', 'an'),
	        ('ㄧㄣ', 'in'),
	        ('ㄩㄣ', 'ɥn'),
	        ('ㄣ', 'ən'),
	        ('ㄤ', 'ɑŋ'),
	        ('ㄧㄥ', 'iŋ'),
	        ('ㄨㄥ', 'ʊŋ'),
	        ('ㄩㄥ', 'jʊŋ'),
	        ('ㄥ', 'əŋ'),
	        ('ㄦ', 'əɻ'),
	        ('ㄧ', 'i'),
	        ('ㄨ', 'u'),
	        ('ㄩ', 'ɥ'),
	        ('ˉ', '→'),
	        ('ˊ', '↑'),
	        ('ˇ', '↓↑'),
	        ('ˋ', '↓'),
	        ('˙', ''),
	        ('，', ','),
	        ('。', '.'),
	        ('！', '!'),
	        ('？', '?'),
	        ('—', '-'),
	    )
	]

	# Ordered (compiled pattern, IPA) pairs for the second IPA variant, which
	# writes tones with tone letters (˥, ˧˥, ˨˩˦, ˥˩) instead of arrows.
	# Order matters: multi-symbol entries precede the single symbols they contain.
	_bopomofo_to_ipa2 = [
	    (re.compile(symbol), ipa)
	    for symbol, ipa in (
	        ('ㄅㄛ', 'pwo'),
	        ('ㄆㄛ', 'pʰwo'),
	        ('ㄇㄛ', 'mwo'),
	        ('ㄈㄛ', 'fwo'),
	        ('ㄅ', 'p'),
	        ('ㄆ', 'pʰ'),
	        ('ㄇ', 'm'),
	        ('ㄈ', 'f'),
	        ('ㄉ', 't'),
	        ('ㄊ', 'tʰ'),
	        ('ㄋ', 'n'),
	        ('ㄌ', 'l'),
	        ('ㄍ', 'k'),
	        ('ㄎ', 'kʰ'),
	        ('ㄏ', 'h'),
	        ('ㄐ', 'tɕ'),
	        ('ㄑ', 'tɕʰ'),
	        ('ㄒ', 'ɕ'),
	        ('ㄓ', 'tʂ'),
	        ('ㄔ', 'tʂʰ'),
	        ('ㄕ', 'ʂ'),
	        ('ㄖ', 'ɻ'),
	        ('ㄗ', 'ts'),
	        ('ㄘ', 'tsʰ'),
	        ('ㄙ', 's'),
	        ('ㄚ', 'a'),
	        ('ㄛ', 'o'),
	        ('ㄜ', 'ɤ'),
	        ('ㄝ', 'ɛ'),
	        ('ㄞ', 'aɪ'),
	        ('ㄟ', 'eɪ'),
	        ('ㄠ', 'ɑʊ'),
	        ('ㄡ', 'oʊ'),
	        ('ㄧㄢ', 'jɛn'),
	        ('ㄩㄢ', 'yæn'),
	        ('ㄢ', 'an'),
	        ('ㄧㄣ', 'in'),
	        ('ㄩㄣ', 'yn'),
	        ('ㄣ', 'ən'),
	        ('ㄤ', 'ɑŋ'),
	        ('ㄧㄥ', 'iŋ'),
	        ('ㄨㄥ', 'ʊŋ'),
	        ('ㄩㄥ', 'jʊŋ'),
	        ('ㄥ', 'ɤŋ'),
	        ('ㄦ', 'əɻ'),
	        ('ㄧ', 'i'),
	        ('ㄨ', 'u'),
	        ('ㄩ', 'y'),
	        ('ˉ', '˥'),
	        ('ˊ', '˧˥'),
	        ('ˇ', '˨˩˦'),
	        ('ˋ', '˥˩'),
	        ('˙', ''),
	        ('，', ','),
	        ('。', '.'),
	        ('！', '!'),
	        ('？', '?'),
	        ('—', '-'),
	    )
	]


	def number_to_chinese(text):
	    """Return ``text`` with every Arabic numeral rewritten by ``cn2an.an2cn``.

	    A numeral is a run of digits, optionally followed by a decimal part.
	    Text that contains no digits is returned unchanged.
	    """
	    return re.sub(
	        r'\d+(?:\.?\d+)?',
	        lambda match: cn2an.an2cn(match.group(0)),
	        text,
	    )


	def chinese_to_bopomofo(text):
	    """Convert the Chinese words in ``text`` to bopomofo.

	    The marks '、', '；' and '：' are first replaced by '，'.  The text is
	    then segmented with ``jieba.lcut``.  Segments with no character in the
	    range U+4E00-U+9FFF are copied through unchanged; every other segment is
	    transcribed with ``lazy_pinyin`` in BOPOMOFO style and separated from
	    any preceding output by one space.
	    """
	    for mark in ('、', '；', '：'):
	        text = text.replace(mark, '，')

	    converted = ''
	    for segment in jieba.lcut(text, cut_all=False):
	        if re.search('[\u4e00-\u9fff]', segment) is None:
	            # Nothing to transcribe in this segment: keep it verbatim.
	            converted += segment
	            continue
	        # A syllable ending in a bopomofo letter (U+3105-U+3129) carries no
	        # trailing tone mark, so 'ˉ' is appended to it.
	        syllables = [
	            re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable)
	            for syllable in lazy_pinyin(segment, BOPOMOFO)
	        ]
	        if converted:
	            converted += ' '
	        converted += ''.join(syllables)
	    return converted


	def latin_to_bopomofo(text):
	    """Replace every Latin letter in ``text`` with its bopomofo spelling.

	    The rules in ``_latin_to_bopomofo`` are applied one after another, in
	    table order, each to the output of the previous one.
	    """
	    for pattern, spelling in _latin_to_bopomofo:
	        text = pattern.sub(spelling, text)
	    return text


	def bopomofo_to_romaji(text):
	    """Rewrite bopomofo symbols, tone marks and punctuation as romaji.

	    The rules in ``_bopomofo_to_romaji`` are applied one after another, in
	    table order, each to the output of the previous one.
	    """
	    for pattern, romaji in _bopomofo_to_romaji:
	        text = pattern.sub(romaji, text)
	    return text


	def bopomofo_to_ipa(text):
	    """Rewrite bopomofo symbols, tone marks and punctuation as IPA.

	    The rules in ``_bopomofo_to_ipa`` are applied one after another, in
	    table order, each to the output of the previous one.
	    """
	    for pattern, ipa in _bopomofo_to_ipa:
	        text = pattern.sub(ipa, text)
	    return text


	def bopomofo_to_ipa2(text):
	    """Rewrite bopomofo as the second IPA variant (tone-letter notation).

	    The rules in ``_bopomofo_to_ipa2`` are applied one after another, in
	    table order, each to the output of the previous one.
	    """
	    for pattern, ipa in _bopomofo_to_ipa2:
	        text = pattern.sub(ipa, text)
	    return text


	def chinese_to_romaji(text):
	    """Transcribe Chinese text into the romaji-style notation.

	    Pipeline: numerals -> Chinese numerals, Chinese -> bopomofo, Latin
	    letters -> bopomofo, bopomofo -> romaji, followed by regex clean-ups.

	    Args:
	        text: Input string; may mix Chinese, digits and Latin letters.

	    Returns:
	        The romaji transcription as a string.
	    """
	    text = number_to_chinese(text)
	    text = chinese_to_bopomofo(text)
	    text = latin_to_bopomofo(text)
	    text = bopomofo_to_romaji(text)
	    # Medial 'i'/'u' directly before another vowel is written as a glide.
	    text = re.sub(r'i([aoe])', r'y\1', text)
	    text = re.sub(r'u([aoəe])', r'w\1', text)
	    # A backtick-marked consonant followed directly by tone marks, or by the
	    # end of the string, has no vowel: insert 'ɹ`' after it.  The second
	    # group must be the alternation "tone marks | end of string"; an
	    # escaped pipe ('\|') would instead demand a literal '|' character and
	    # the rule would never apply.
	    text = re.sub(r'([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
	    text = text.replace('ɻ', 'ɹ`')
	    # Same rule for the plain 'ʦ'/'s' consonants, which receive 'ɹ'.
	    text = re.sub(r'([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
	    return text


	def chinese_to_lazy_ipa(text):
	    """Transcribe Chinese text to romaji, then rewrite it with ``_romaji_to_ipa``.

	    The rewrite rules are applied one after another, in table order, to the
	    output of ``chinese_to_romaji``.
	    """
	    romaji = chinese_to_romaji(text)
	    for pattern, ipa in _romaji_to_ipa:
	        romaji = pattern.sub(ipa, romaji)
	    return romaji


	def chinese_to_ipa(text):
	    """Transcribe Chinese text into IPA with arrow tone marks.

	    Pipeline: numerals -> Chinese numerals, Chinese -> bopomofo, Latin
	    letters -> bopomofo, bopomofo -> IPA, followed by regex clean-ups.

	    Args:
	        text: Input string; may mix Chinese, digits and Latin letters.

	    Returns:
	        The IPA transcription as a string.
	    """
	    text = number_to_chinese(text)
	    text = chinese_to_bopomofo(text)
	    text = latin_to_bopomofo(text)
	    text = bopomofo_to_ipa(text)
	    # Medial 'i'/'u' directly before another vowel is written as a glide.
	    text = re.sub(r'i([aoe])', r'j\1', text)
	    text = re.sub(r'u([aoəe])', r'w\1', text)
	    # A backtick-marked consonant followed directly by tone marks, or by the
	    # end of the string, has no vowel: insert 'ɹ`' after it.  The second
	    # group must be the alternation "tone marks | end of string"; an
	    # escaped pipe ('\|') would instead demand a literal '|' character and
	    # the rule would never apply.
	    text = re.sub(r'([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
	    text = text.replace('ɻ', 'ɹ`')
	    # Same rule for plain 's' (including the 's' of 'ts'), which receives 'ɹ'.
	    text = re.sub(r'([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
	    return text


	def chinese_to_ipa2(text):
	    """Transcribe Chinese text into IPA with tone-letter marks.

	    Pipeline: numerals -> Chinese numerals, Chinese -> bopomofo, Latin
	    letters -> bopomofo, bopomofo -> IPA (second variant), followed by
	    regex clean-ups.

	    Args:
	        text: Input string; may mix Chinese, digits and Latin letters.

	    Returns:
	        The IPA transcription as a string.
	    """
	    text = number_to_chinese(text)
	    text = chinese_to_bopomofo(text)
	    text = latin_to_bopomofo(text)
	    text = bopomofo_to_ipa2(text)
	    # Medial 'i'/'u' directly before another vowel is written as a glide.
	    text = re.sub(r'i([aoe])', r'j\1', text)
	    text = re.sub(r'u([aoəe])', r'w\1', text)
	    # 'ʂ'/'ɹ' (optionally aspirated) followed directly by tone letters, or by
	    # the end of the string, has no vowel: insert 'ʅ' after it.  The second
	    # group must be the alternation "tone letters | end of string"; an
	    # escaped pipe ('\|') would instead demand a literal '|' character and
	    # the rule would never apply.
	    text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
	    # Same rule for 's' (including the 's' of 'ts'), which receives 'ɿ'.
	    text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
	    return text