Spaces:
Running
Running
File size: 2,779 Bytes
26be912 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re
import sys
import pyopenjtalk
from text import symbols
# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("οΌ
", "γγΌγ»γ³γ")]]
# List of (consonant, sokuon) pairs:
_real_sokuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"Q([ββ]*[kg])", r"k#\1"),
(r"Q([ββ]*[tdjΚ§])", r"t#\1"),
(r"Q([ββ]*[sΚ])", r"s\1"),
(r"Q([ββ]*[pb])", r"p#\1"),
]
]
# List of (consonant, hatsuon) pairs:
_real_hatsuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"N([ββ]*[pbm])", r"m\1"),
(r"N([ββ]*[Κ§Κ₯j])", r"n^\1"),
(r"N([ββ]*[tdn])", r"n\1"),
(r"N([ββ]*[kg])", r"Ε\1"),
]
]
def post_replace_ph(ph):
rep_map = {
"οΌ": ",",
"οΌ": ",",
"οΌ": ",",
"γ": ".",
"οΌ": "!",
"οΌ": "?",
"\n": ".",
"Β·": ",",
"γ": ",",
"...": "β¦",
}
if ph in rep_map.keys():
ph = rep_map[ph]
if ph in symbols:
return ph
if ph not in symbols:
ph = "UNK"
return ph
def symbols_to_japanese(text):
for regex, replacement in _symbols_to_japanese:
text = re.sub(regex, replacement, text)
return text
def preprocess_jap(text):
"""Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
text = symbols_to_japanese(text)
sentences = re.split(_japanese_marks, text)
marks = re.findall(_japanese_marks, text)
text = []
for i, sentence in enumerate(sentences):
if re.match(_japanese_characters, sentence):
p = pyopenjtalk.g2p(sentence)
text += p.split(" ")
if i < len(marks):
text += [marks[i].replace(" ", "")]
return text
def text_normalize(text):
# todo: jap text normalize
return text
def g2p(norm_text):
phones = preprocess_jap(norm_text)
phones = [post_replace_ph(i) for i in phones]
# todo: implement tones and word2ph
return phones
if __name__ == "__main__":
for line in open("../../../Downloads/transcript_utf8.txt").readlines():
text = line.split(":")[1]
phones = g2p(text)
print(phones)
|