# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata

from transformers import AutoTokenizer

from text import punctuation, symbols

try:
    import MeCab
except ImportError as e:
    raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
from num2words import num2words

_CONVRULES = [
    # Conversion of 2 letters
    "アァ/ a a", "イィ/ i i", "イェ/ i e", "イャ/ y a", "ウゥ/ u:",
    "エェ/ e e", "オォ/ o:",
    "カァ/ k a:", "キィ/ k i:", "クゥ/ k u:", "クャ/ ky a", "クュ/ ky u", "クョ/ ky o", "ケェ/ k e:", "コォ/ k o:",
    "ガァ/ g a:", "ギィ/ g i:", "グゥ/ g u:", "グャ/ gy a", "グュ/ gy u", "グョ/ gy o", "ゲェ/ g e:", "ゴォ/ g o:",
    "サァ/ s a:", "シィ/ sh i:", "スゥ/ s u:", "スャ/ sh a", "スュ/ sh u", "スョ/ sh o", "セェ/ s e:", "ソォ/ s o:",
    "ザァ/ z a:", "ジィ/ j i:", "ズゥ/ z u:", "ズャ/ zy a", "ズュ/ zy u", "ズョ/ zy o", "ゼェ/ z e:", "ゾォ/ z o:",
    "タァ/ t a:", "チィ/ ch i:", "ツァ/ ts a", "ツィ/ ts i", "ツゥ/ ts u:", "ツャ/ ch a", "ツュ/ ch u", "ツョ/ ch o",
    "ツェ/ ts e", "ツォ/ ts o", "テェ/ t e:", "トォ/ t o:",
    "ダァ/ d a:", "ヂィ/ j i:", "ヅゥ/ d u:", "ヅャ/ zy a", "ヅュ/ zy u", "ヅョ/ zy o", "デェ/ d e:", "ドォ/ d o:",
    "ナァ/ n a:", "ニィ/ n i:", "ヌゥ/ n u:", "ヌャ/ ny a", "ヌュ/ ny u", "ヌョ/ ny o", "ネェ/ n e:", "ノォ/ n o:",
    "ハァ/ h a:", "ヒィ/ h i:", "フゥ/ f u:", "フャ/ hy a", "フュ/ hy u", "フョ/ hy o", "ヘェ/ h e:", "ホォ/ h o:",
    "バァ/ b a:", "ビィ/ b i:", "ブゥ/ b u:", "フャ/ hy a", "ブュ/ by u", "フョ/ hy o", "ベェ/ b e:", "ボォ/ b o:",
    "パァ/ p a:", "ピィ/ p i:", "プゥ/ p u:", "プャ/ py a", "プュ/ py u", "プョ/ py o", "ペェ/ p e:", "ポォ/ p o:",
    "マァ/ m a:", "ミィ/ m i:", "ムゥ/ m u:", "ムャ/ my a", "ムュ/ my u", "ムョ/ my o", "メェ/ m e:", "モォ/ m o:",
    "ヤァ/ y a:", "ユゥ/ y u:", "ユャ/ y a:", "ユュ/ y u:", "ユョ/ y o:", "ヨォ/ y o:",
    "ラァ/ r a:", "リィ/ r i:", "ルゥ/ r u:", "ルャ/ ry a", "ルュ/ ry u", "ルョ/ ry o", "レェ/ r e:", "ロォ/ r o:",
    "ワァ/ w a:", "ヲォ/ o:",
    "ディ/ d i", "デェ/ d e:", "デャ/ dy a", "デュ/ dy u", "デョ/ dy o",
    "ティ/ t i", "テェ/ t e:", "テャ/ ty a", "テュ/ ty u", "テョ/ ty o",
    "スィ/ s i",
    "ズァ/ z u a", "ズィ/ z i", "ズゥ/ z u", "ズャ/ zy a", "ズュ/ zy u", "ズョ/ zy o", "ズェ/ z e", "ズォ/ z o",
    "キャ/ ky a", "キュ/ ky u", "キョ/ ky o",
    "シャ/ sh a", "シュ/ sh u", "シェ/ sh e", "ショ/ sh o",
    "チャ/ ch a", "チュ/ ch u", "チェ/ ch e", "チョ/ ch o",
    "トゥ/ t u", "トャ/ ty a", "トュ/ ty u", "トョ/ ty o",
    "ドァ/ d o a", "ドゥ/ d u", "ドャ/ dy a", "ドュ/ dy u", "ドョ/ dy o", "ドォ/ d o:",
    "ニャ/ ny a", "ニュ/ ny u", "ニョ/ ny o",
    "ヒャ/ hy a", "ヒュ/ hy u", "ヒョ/ hy o",
    "ミャ/ my a", "ミュ/ my u", "ミョ/ my o",
    "リャ/ ry a", "リュ/ ry u", "リョ/ ry o",
    "ギャ/ gy a", "ギュ/ gy u", "ギョ/ gy o",
    "ヂェ/ j e", "ヂャ/ j a", "ヂュ/ j u", "ヂョ/ j o",
    "ジェ/ j e", "ジャ/ j a", "ジュ/ j u", "ジョ/ j o",
    "ビャ/ by a", "ビュ/ by u", "ビョ/ by o",
    "ピャ/ py a", "ピュ/ py u", "ピョ/ py o",
    "ウァ/ u a", "ウィ/ w i", "ウェ/ w e", "ウォ/ w o",
    "ファ/ f a", "フィ/ f i", "フゥ/ f u", "フャ/ hy a", "フュ/ hy u", "フョ/ hy o", "フェ/ f e", "フォ/ f o",
    "ヴァ/ b a", "ヴィ/ b i", "ヴェ/ b e", "ヴォ/ b o", "ヴュ/ by u",
    # Conversion of 1 letter
    "ア/ a", "イ/ i", "ウ/ u", "エ/ e", "オ/ o",
    "カ/ k a", "キ/ k i", "ク/ k u", "ケ/ k e", "コ/ k o",
    "サ/ s a", "シ/ sh i", "ス/ s u", "セ/ s e", "ソ/ s o",
    "タ/ t a", "チ/ ch i", "ツ/ ts u", "テ/ t e", "ト/ t o",
    "ナ/ n a", "ニ/ n i", "ヌ/ n u", "ネ/ n e", "ノ/ n o",
    "ハ/ h a", "ヒ/ h i", "フ/ f u", "ヘ/ h e", "ホ/ h o",
    "マ/ m a", "ミ/ m i", "ム/ m u", "メ/ m e", "モ/ m o",
    "ラ/ r a", "リ/ r i", "ル/ r u", "レ/ r e", "ロ/ r o",
    "ガ/ g a", "ギ/ g i", "グ/ g u", "ゲ/ g e", "ゴ/ g o",
    "ザ/ z a", "ジ/ j i", "ズ/ z u", "ゼ/ z e", "ゾ/ z o",
    "ダ/ d a", "ヂ/ j i", "ヅ/ z u", "デ/ d e", "ド/ d o",
    "バ/ b a", "ビ/ b i", "ブ/ b u", "ベ/ b e", "ボ/ b o",
    "パ/ p a", "ピ/ p i", "プ/ p u", "ペ/ p e", "ポ/ p o",
    "ヤ/ y a", "ユ/ y u", "ヨ/ y o",
    "ワ/ w a", "ヰ/ i", "ヱ/ e", "ヲ/ o", "ン/ N",
    "ッ/ q", "ヴ/ b u",
"ー/:", # Try converting broken text "ァ/ a", "ィ/ i", "ゥ/ u", "ェ/ e", "ォ/ o", "ヮ/ w a", "ォ/ o", # Symbols "、/ ,", "。/ .", "!/ !", "?/ ?", "・/ ,", ] _COLON_RX = re.compile(":+") _REJECT_RX = re.compile("[^ a-zA-Z:,.?]") def _makerulemap(): l = [tuple(x.split("/")) for x in _CONVRULES] return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2)) _RULEMAP1, _RULEMAP2 = _makerulemap() def kata2phoneme(text: str) -> str: """Convert katakana text to phonemes.""" text = text.strip() res = [] while text: if len(text) >= 2: x = _RULEMAP2.get(text[:2]) if x is not None: text = text[2:] res += x.split(" ")[1:] continue x = _RULEMAP1.get(text[0]) if x is not None: text = text[1:] res += x.split(" ")[1:] continue res.append(text[0]) text = text[1:] # res = _COLON_RX.sub(":", res) return res _KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1)) _HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1)) _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) def hira2kata(text: str) -> str: text = text.translate(_HIRA2KATATRANS) return text.replace("う゛", "ヴ") _SYMBOL_TOKENS = set(list("・、。?!")) _NO_YOMI_TOKENS = set(list("「」『』―()[][]")) _TAGGER = MeCab.Tagger() def text2kata(text: str) -> str: parsed = _TAGGER.parse(text) res = [] for line in parsed.split("\n"): if line == "EOS": break parts = line.split("\t") word, yomi = parts[0], parts[1] if yomi: res.append(yomi) else: if word in _SYMBOL_TOKENS: res.append(word) elif word in ("っ", "ッ"): res.append("ッ") elif word in _NO_YOMI_TOKENS: pass else: res.append(word) return hira2kata("".join(res)) _ALPHASYMBOL_YOMI = { "#": "シャープ", "%": "パーセント", "&": "アンド", "+": "プラス", "-": "マイナス", ":": "コロン", ";": "セミコロン", "<": "小なり", "=": "イコール", ">": "大なり", "@": "アット", "a": "エー", "b": "ビー", "c": "シー", "d": "ディー", "e": "イー", "f": "エフ", "g": "ジー", "h": "エイチ", "i": "アイ", "j": "ジェー", "k": "ケー", "l": "エル", "m": "エム", "n": "エヌ", "o": "オー", "p": "ピー", "q": "キュー", "r": "アール", "s": "エス", "t": "ティー", "u": "ユー", "v": "ブイ", "w": "ダブリュー", "x": "エックス", "y": "ワイ", "z": "ゼット", "α": "アルファ", "β": "ベータ", "γ": "ガンマ", "δ": "デルタ", "ε": "イプシロン", "ζ": "ゼータ", "η": "イータ", "θ": "シータ", "ι": "イオタ", "κ": "カッパ", "λ": "ラムダ", "μ": "ミュー", "ν": "ニュー", "ξ": "クサイ", "ο": "オミクロン", "π": "パイ", "ρ": "ロー", "σ": "シグマ", "τ": "タウ", "υ": "ウプシロン", "φ": "ファイ", "χ": "カイ", "ψ": "プサイ", "ω": "オメガ", } _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+") _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"} _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])") _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?") def japanese_convert_numbers_to_words(text: str) -> str: res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text) res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res) res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res) return res def japanese_convert_alpha_symbols_to_words(text: str) -> str: return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()]) def japanese_text_to_phonemes(text: str) -> str: """Convert Japanese text to phonemes.""" res = unicodedata.normalize("NFKC", text) res = japanese_convert_numbers_to_words(res) # res = japanese_convert_alpha_symbols_to_words(res) res = text2kata(res) res = kata2phoneme(res) return res def is_japanese_character(char): # 定义日语文字系统的 Unicode 范围 japanese_ranges = [ (0x3040, 0x309F), # 平假名 (0x30A0, 0x30FF), # 片假名 (0x4E00, 0x9FFF), # 汉字 (CJK Unified Ideographs) (0x3400, 0x4DBF), # 汉字扩展 A (0x20000, 0x2A6DF), # 汉字扩展 B # 可以根据需要添加其他汉字扩展范围 ] # 将字符的 Unicode 编码转换为整数 char_code = 
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
}


def replace_punctuation(text):
    # Map punctuation variants to a canonical set, then drop everything that is
    # neither Japanese script nor allowed punctuation.
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
        + "".join(punctuation)
        + r"]+",
        "",
        replaced_text,
    )

    return replaced_text


def text_normalize(text):
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    # res = "".join([i for i in res if is_japanese_character(i)])
    res = replace_punctuation(res)
    return res


def distribute_phone(n_phone, n_word):
    # Spread n_phone phonemes as evenly as possible over n_word tokens,
    # e.g. distribute_phone(5, 2) -> [3, 2].
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word


tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")


def g2p(norm_text):
    tokenized = tokenizer.tokenize(norm_text)
    phs = []
    ph_groups = []
    # Merge "##" subword continuations back into whole-word groups.
    for t in tokenized:
        if not t.startswith("#"):
            ph_groups.append([t])
        else:
            ph_groups[-1].append(t.replace("#", ""))
    word2ph = []
    for group in ph_groups:
        phonemes = kata2phoneme(text2kata("".join(group)))
        # phonemes = [i for i in phonemes if i in symbols]
        for i in phonemes:
            assert i in symbols, (group, norm_text, tokenized)
        phone_len = len(phonemes)
        word_len = len(group)

        aaa = distribute_phone(phone_len, word_len)
        word2ph += aaa

        phs += phonemes
    phones = ["_"] + phs + ["_"]
    tones = [0 for i in phones]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
    text = "hello,こんにちは、世界!……"

    from text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)
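# Note on the g2p() output contract (a sketch; it follows from the code above,
# not from an external spec): `word2ph` assigns to each BERT token, plus the
# leading and trailing "_" padding, the number of phonemes it covers, so the
# lengths are expected to satisfy
#
#     phones, tones, word2ph = g2p(text_normalize("こんにちは、世界。"))
#     assert sum(word2ph) == len(phones) == len(tones)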