Spaces:
Runtime error
Runtime error
| # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py | |
| import re | |
| import sys | |
| import pyopenjtalk | |
| from . import symbols | |
# Character class shared by the two patterns below so that they remain exact
# complements of each other: ASCII letters, digits, the iteration mark
# (U+3005), hiragana and katakana, CJK unified ideographs, full-width digits
# and letters, and half-width katakana.  In a str pattern ``\d`` also matches
# full-width digits, so U+FF10 is covered even though the explicit range
# starts at U+FF11.
_JAPANESE_CHAR_CLASS = (
    r"A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff"
    r"\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d"
)

# Matches one character that counts as Japanese text (no punctuation marks).
_japanese_characters = re.compile("[" + _JAPANESE_CHAR_CLASS + "]")

# Matches one character that is NOT Japanese text, i.e. a punctuation mark or
# any other symbol outside the class above.
_japanese_marks = re.compile("[^" + _JAPANESE_CHAR_CLASS + "]")
# List of (symbol, Japanese) pairs for marks.  Each entry is a
# (compiled pattern, replacement text) tuple; the full-width percent sign
# (U+FF05) is replaced by its katakana spelling.
_symbols_to_japanese = [
    (re.compile(symbol), reading)
    for symbol, reading in [("％", "パーセント")]
]
# List of (consonant, sokuon) pairs.  Each entry is a
# (compiled pattern, replacement) tuple: the pattern matches a "Q" followed by
# any number of pitch arrows (U+2191 / U+2193) and one consonant, and the
# replacement substitutes the "Q" while keeping the captured arrows and
# consonant.
# NOTE(review): nothing in the visible part of this file applies this table.
_real_sokuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    ]
]
# List of (consonant, hatsuon) pairs.  Each entry is a
# (compiled pattern, replacement) tuple: the pattern matches an "N" followed
# by any number of pitch arrows (U+2191 / U+2193) and one consonant, and the
# replacement substitutes the "N" with the nasal shown while keeping the
# captured arrows and consonant.
# NOTE(review): nothing in the visible part of this file applies this table.
_real_hatsuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    ]
]
# Punctuation normalisation applied to each token before the symbol lookup.
# Built once at import time instead of on every call.
_REP_MAP = {
    "：": ",",  # U+FF1A full-width colon
    "；": ",",  # U+FF1B full-width semicolon
    "，": ",",  # U+FF0C full-width comma
    "。": ".",  # U+3002 ideographic full stop
    "！": "!",  # U+FF01 full-width exclamation mark
    "？": "?",  # U+FF1F full-width question mark
    "\n": ".",
    "·": ",",  # U+00B7 middle dot
    "、": ",",  # U+3001 ideographic comma
    "...": "…",  # three ASCII dots -> U+2026 horizontal ellipsis
    "v": "V",
}


def post_replace_ph(ph):
    """Normalise one phoneme/punctuation token to a member of ``symbols``.

    The token is first passed through the punctuation table above (tokens
    that are not in the table are left as they are).  If the result is
    contained in the imported ``symbols`` collection it is returned,
    otherwise the placeholder ``"UNK"`` is returned.

    Args:
        ph: A single token, e.g. one element of ``preprocess_jap``'s output.

    Returns:
        The normalised token, or ``"UNK"`` when it is not a known symbol.
    """
    ph = _REP_MAP.get(ph, ph)
    return ph if ph in symbols else "UNK"
def symbols_to_japanese(text):
    """Return *text* with every ``_symbols_to_japanese`` substitution applied.

    The (pattern, replacement) pairs are applied one after another, in table
    order, each one operating on the result of the previous substitution.
    """
    converted = text
    for pattern, reading in _symbols_to_japanese:
        converted = pattern.sub(reading, converted)
    return converted
def preprocess_jap(text):
    """Convert Japanese text to a flat list of phoneme and mark tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    Symbols are first spelled out via ``symbols_to_japanese``.  The text is
    then cut at every character matched by ``_japanese_marks``; each piece
    that starts with a Japanese character is sent to ``pyopenjtalk.g2p`` and
    its space-separated output is added token by token.  The mark that
    followed the piece is added after it, with spaces removed (so a mark that
    is itself a space contributes an empty string).

    Returns:
        A list of strings in reading order.
    """
    text = symbols_to_japanese(text)
    chunks = _japanese_marks.split(text)
    separators = _japanese_marks.findall(text)
    separator_count = len(separators)

    tokens = []
    for index, chunk in enumerate(chunks):
        # Pieces that are empty or do not start with Japanese text are skipped.
        if _japanese_characters.match(chunk):
            tokens.extend(pyopenjtalk.g2p(chunk).split(" "))
        # There is one separator fewer than there are chunks.
        if index < separator_count:
            tokens.append(separators[index].replace(" ", ""))
    return tokens
def text_normalize(text):
    """Return *text* unchanged; placeholder for Japanese text normalisation."""
    # TODO: Japanese text normalisation is not implemented yet.
    return text
def g2p(norm_text):
    """Turn normalised text into ``(phones, tones, word2ph)``.

    ``phones`` is the output of ``preprocess_jap`` with every token passed
    through ``post_replace_ph``.  ``tones`` and ``word2ph`` are placeholders
    of the same length as ``phones``, filled with 0 and 1 respectively.
    """
    phones = list(map(post_replace_ph, preprocess_jap(norm_text)))
    count = len(phones)
    # TODO: implement tones and word2ph
    tones = [0] * count
    word2ph = [1] * count
    return phones, tones, word2ph
if __name__ == "__main__":
    # Ad-hoc check: scan a transcript and stop at the first line whose
    # phoneme sequence contains the token "z", printing that line's text and
    # phonemes.
    # The file is opened as UTF-8 (as its name suggests) rather than with the
    # locale default, and the context manager closes it even on sys.exit().
    with open("../../../Downloads/transcript_utf8.txt", encoding="utf-8") as transcript:
        for line in transcript:
            # The text is the field after the first ":" on the line.
            text = line.split(":")[1]
            phones, _tones, _word2ph = g2p(text)
            if "z" in phones:
                print(text, phones)
                sys.exit(0)