# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
"""Japanese grapheme-to-phoneme helpers built on top of pyopenjtalk."""
import re
import sys

import pyopenjtalk

from . import symbols

# Regular expression matching Japanese without punctuation marks.
# NOTE(review): the full-width digit range starts at U+FF11, so the full-width
# zero (U+FF10) is classified as a mark -- confirm whether that is intended.
_japanese_characters = re.compile(
    r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)

# Regular expression matching non-Japanese characters or punctuation marks
# (the exact complement of the character class above).
_japanese_marks = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)

# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [
    (re.compile(pattern), reading) for pattern, reading in [("%", "パーセント")]
]

# List of (consonant, sokuon) pairs.
# Not referenced by the functions in this module; kept for importers.
_real_sokuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    ]
]

# List of (consonant, hatsuon) pairs.
# Not referenced by the functions in this module; kept for importers.
_real_hatsuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    ]
]

# Punctuation / phone substitutions applied by post_replace_ph.
# Built once at import time instead of on every call.
_REP_MAP = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "v": "V",
}


def post_replace_ph(ph: str) -> str:
    """Map a raw phone or mark onto the symbol inventory.

    The token is first translated through the punctuation table; if the
    result is not a member of ``symbols`` it is replaced by ``"UNK"``.

    Args:
        ph: A single phone or punctuation token.

    Returns:
        The (possibly substituted) token, or ``"UNK"`` when it is not in
        ``symbols``.
    """
    ph = _REP_MAP.get(ph, ph)
    return ph if ph in symbols else "UNK"


def symbols_to_japanese(text: str) -> str:
    """Replace special symbols in *text* with their Japanese reading."""
    for regex, replacement in _symbols_to_japanese:
        text = regex.sub(replacement, text)
    return text


def preprocess_jap(text: str) -> list:
    """Convert *text* into a flat list of phones and punctuation marks.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    The text is split on every character matched by ``_japanese_marks``.
    Each resulting segment that starts with a Japanese character is passed
    to ``pyopenjtalk.g2p`` and its space-separated output is appended; the
    mark that followed the segment is appended after it.

    Args:
        text: Input text.

    Returns:
        A list of phone / mark strings in reading order.
    """
    text = symbols_to_japanese(text)
    sentences = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)
    phones = []
    for i, sentence in enumerate(sentences):
        if _japanese_characters.match(sentence):
            phones += pyopenjtalk.g2p(sentence).split(" ")
        # There is one mark fewer than there are segments.
        if i < len(marks):
            mark = marks[i].replace(" ", "")
            # A plain space collapses to "", which would otherwise be emitted
            # as an empty token and later turned into "UNK"; drop it instead.
            if mark:
                phones.append(mark)
    return phones


def text_normalize(text: str) -> str:
    """Return *text* unchanged (normalization is not implemented yet)."""
    # todo: jap text normalize
    return text


def g2p(norm_text: str) -> tuple:
    """Convert normalized text to phones with placeholder tones and word2ph.

    Args:
        norm_text: Text as returned by ``text_normalize``.

    Returns:
        A ``(phones, tones, word2ph)`` tuple of equal-length lists. ``tones``
        is all zeros and ``word2ph`` is all ones.
    """
    phones = [post_replace_ph(ph) for ph in preprocess_jap(norm_text)]
    # todo: implement tones and word2ph
    tones = [0] * len(phones)
    word2ph = [1] * len(phones)
    return phones, tones, word2ph


if __name__ == "__main__":
    # Ad-hoc check: print the first transcript line whose phones contain "z".
    # The context manager closes the file even on the early sys.exit().
    with open("../../../Downloads/transcript_utf8.txt", encoding="utf-8") as f:
        for line in f:
            fields = line.split(":")
            if len(fields) < 2:
                # No ":"-separated text field on this line; nothing to convert.
                continue
            text = fields[1]
            phones, tones, word2ph = g2p(text)
            if "z" in phones:
                print(text, phones)
                sys.exit(0)