# MyGO_VIts-bert / text/japanese.py
# Convert Japanese text to phonemes that are compatible with Julius
# https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
from transformers import AutoTokenizer
from text import punctuation, symbols
from num2words import num2words
import pyopenjtalk
import jaconv
# Mapping of hiragana to phonetic representation
hiragana_map = {
"ใ†ใ‚›ใ": " v a",
"ใ†ใ‚›ใƒ": " v i",
"ใ†ใ‚›ใ‡": " v e",
"ใ†ใ‚›ใ‰": " v o",
"ใ†ใ‚›ใ‚…": " by u",
"ใ…ใ‚›": " v u",
    # Added handling for ใ‚” and related forms
"ใ‚”ใ": " v a",
"ใ‚”ใƒ": " v i",
"ใ‚”ใ‡": " v e",
"ใ‚”ใ‰": " v o",
"ใ‚”ใ‚…": " by u",
    # Two-character conversion rules
"ใ‚ใ": " a a",
"ใ„ใƒ": " i i",
"ใ„ใ‡": " i e",
"ใ„ใ‚ƒ": " y a",
"ใ†ใ…": " u:",
"ใˆใ‡": " e e",
"ใŠใ‰": " o:",
"ใ‹ใ": " k a:",
"ใใƒ": " k i:",
"ใใ…": " k u:",
"ใใ‚ƒ": " ky a",
"ใใ‚…": " ky u",
"ใใ‚‡": " ky o",
"ใ‘ใ‡": " k e:",
"ใ“ใ‰": " k o:",
"ใŒใ": " g a:",
"ใŽใƒ": " g i:",
"ใใ…": " g u:",
"ใใ‚ƒ": " gy a",
"ใใ‚…": " gy u",
"ใใ‚‡": " gy o",
"ใ’ใ‡": " g e:",
"ใ”ใ‰": " g o:",
"ใ•ใ": " s a:",
"ใ—ใƒ": " sh i",
"ใ™ใ…": " s u:",
"ใ™ใ‚ƒ": " sh a",
"ใ™ใ‚…": " sh u",
"ใ™ใ‚‡": " sh o",
"ใ›ใ‡": " s e:",
"ใใ‰": " s o:",
"ใ–ใ": " z a:",
"ใ˜ใƒ": " j i:",
"ใšใ…": " z u:",
"ใšใ‚ƒ": " zy a",
"ใšใ‚…": " zy u",
"ใšใ‚‡": " zy o",
"ใœใ‡": " z e:",
"ใžใ‰": " z o:",
"ใŸใ": " t a:",
"ใกใƒ": " ch i",
"ใคใ": " ts a",
"ใคใƒ": " ts i",
"ใคใ…": " ts u",
"ใคใ‚ƒ": " ch a",
"ใคใ‚…": " ch u",
"ใคใ‚‡": " ch o",
"ใคใ‡": " ts e",
"ใคใ‰": " ts o",
"ใฆใ‡": " t e:",
"ใจใ‰": " t o:",
"ใ ใ": " d a:",
"ใขใƒ": " j i:",
"ใฅใ…": " d u:",
"ใฅใ‚ƒ": " zy a",
"ใฅใ‚…": " zy u",
"ใฅใ‚‡": " zy o",
"ใงใ‡": " d e:",
"ใชใ": " n a:",
"ใซใƒ": " n i:",
"ใฌใ…": " n u:",
"ใฌใ‚ƒ": " ny a",
"ใฌใ‚…": " ny u",
"ใฌใ‚‡": " ny o",
"ใญใ‡": " n e:",
"ใฎใ‰": " n o:",
"ใฏใ": " h a:",
"ใฒใƒ": " h i:",
"ใตใ…": " f u:",
"ใตใ‚ƒ": " hy a",
"ใธใ‡": " h e:",
"ใปใ‰": " h o:",
"ใฐใ": " b a:",
"ใณใƒ": " b i:",
"ใถใ…": " b u:",
"ใถใ‚…": " by u",
"ในใ‡": " b e:",
"ใผใ‰": " b o:",
"ใฑใ": " p a:",
"ใดใƒ": " p i:",
"ใทใ…": " p u:",
"ใทใ‚ƒ": " py a",
"ใทใ‚…": " py u",
"ใทใ‚‡": " py o",
"ใบใ‡": " p e:",
"ใฝใ‰": " p o:",
"ใพใ": " m a:",
"ใฟใƒ": " m i:",
"ใ‚€ใ…": " m u:",
"ใ‚€ใ‚ƒ": " my a",
"ใ‚€ใ‚…": " my u",
"ใ‚€ใ‚‡": " my o",
"ใ‚ใ‡": " m e:",
"ใ‚‚ใ‰": " m o:",
"ใ‚„ใ": " y a:",
"ใ‚†ใ…": " y u:",
"ใ‚†ใ‚ƒ": " y a:",
"ใ‚†ใ‚…": " y u:",
"ใ‚†ใ‚‡": " y o:",
"ใ‚ˆใ‰": " y o:",
"ใ‚‰ใ": " r a:",
"ใ‚Šใƒ": " r i:",
"ใ‚‹ใ…": " r u:",
"ใ‚‹ใ‚ƒ": " ry a",
"ใ‚‹ใ‚…": " ry u",
"ใ‚‹ใ‚‡": " ry o",
"ใ‚Œใ‡": " r e:",
"ใ‚ใ‰": " r o:",
"ใ‚ใ": " w a:",
"ใ‚’ใ‰": " o:",
"ใ†ใ‚›": " b u",
"ใงใƒ": " d i",
"ใงใ‚ƒ": " dy a",
"ใงใ‚…": " dy u",
"ใงใ‚‡": " dy o",
"ใฆใƒ": " t i",
"ใฆใ‚ƒ": " ty a",
"ใฆใ‚…": " ty u",
"ใฆใ‚‡": " ty o",
"ใ™ใƒ": " s i",
"ใšใ": " z u",
"ใšใƒ": " z i",
"ใšใ‡": " z e",
"ใšใ‰": " z o",
"ใใ‚ƒ": " ky a",
"ใใ‚…": " ky u",
"ใใ‚‡": " ky o",
"ใ—ใ‚ƒ": " sh a",
"ใ—ใ‚…": " sh u",
"ใ—ใ‡": " sh e",
"ใ—ใ‚‡": " sh o",
"ใกใ‚ƒ": " ch a",
"ใกใ‚…": " ch u",
"ใกใ‡": " ch e",
"ใกใ‚‡": " ch o",
"ใจใ…": " t u",
"ใจใ‚ƒ": " ty a",
"ใจใ‚…": " ty u",
"ใจใ‚‡": " ty o",
"ใฉใ": " d o ",
"ใฉใ…": " d u",
"ใฉใ‚ƒ": " dy a",
"ใฉใ‚…": " dy u",
"ใฉใ‚‡": " dy o",
"ใฉใ‰": " d o:",
"ใซใ‚ƒ": " ny a",
"ใซใ‚…": " ny u",
"ใซใ‚‡": " ny o",
"ใฒใ‚ƒ": " hy a",
"ใฒใ‚…": " hy u",
"ใฒใ‚‡": " hy o",
"ใฟใ‚ƒ": " my a",
"ใฟใ‚…": " my u",
"ใฟใ‚‡": " my o",
"ใ‚Šใ‚ƒ": " ry a",
"ใ‚Šใ‚…": " ry u",
"ใ‚Šใ‚‡": " ry o",
"ใŽใ‚ƒ": " gy a",
"ใŽใ‚…": " gy u",
"ใŽใ‚‡": " gy o",
"ใขใ‡": " j e",
"ใขใ‚ƒ": " j a",
"ใขใ‚…": " j u",
"ใขใ‚‡": " j o",
"ใ˜ใ‡": " j e",
"ใ˜ใ‚ƒ": " j a",
"ใ˜ใ‚…": " j u",
"ใ˜ใ‚‡": " j o",
"ใณใ‚ƒ": " by a",
"ใณใ‚…": " by u",
"ใณใ‚‡": " by o",
"ใดใ‚ƒ": " py a",
"ใดใ‚…": " py u",
"ใดใ‚‡": " py o",
"ใ†ใ": " u a",
"ใ†ใƒ": " w i",
"ใ†ใ‡": " w e",
"ใ†ใ‰": " w o",
"ใตใ": " f a",
"ใตใƒ": " f i",
"ใตใ‚…": " hy u",
"ใตใ‚‡": " hy o",
"ใตใ‡": " f e",
"ใตใ‰": " f o",
    # Single-kana conversion rules
"ใ‚": " a",
"ใ„": " i",
"ใ†": " u",
"ใ‚”": " v u", # ใ‚”ใฎๅ‡ฆ็†ใ‚’่ฟฝๅŠ 
"ใˆ": " e",
"ใŠ": " o",
"ใ‹": " k a",
"ใ": " k i",
"ใ": " k u",
"ใ‘": " k e",
"ใ“": " k o",
"ใ•": " s a",
"ใ—": " sh i",
"ใ™": " s u",
"ใ›": " s e",
"ใ": " s o",
"ใŸ": " t a",
"ใก": " ch i",
"ใค": " ts u",
"ใฆ": " t e",
"ใจ": " t o",
"ใช": " n a",
"ใซ": " n i",
"ใฌ": " n u",
"ใญ": " n e",
"ใฎ": " n o",
"ใฏ": " h a",
"ใฒ": " h i",
"ใต": " f u",
"ใธ": " h e",
"ใป": " h o",
"ใพ": " m a",
"ใฟ": " m i",
"ใ‚€": " m u",
"ใ‚": " m e",
"ใ‚‚": " m o",
"ใ‚‰": " r a",
"ใ‚Š": " r i",
"ใ‚‹": " r u",
"ใ‚Œ": " r e",
"ใ‚": " r o",
"ใŒ": " g a",
"ใŽ": " g i",
"ใ": " g u",
"ใ’": " g e",
"ใ”": " g o",
"ใ–": " z a",
"ใ˜": " j i",
"ใš": " z u",
"ใœ": " z e",
"ใž": " z o",
"ใ ": " d a",
"ใข": " j i",
"ใฅ": " z u",
"ใง": " d e",
"ใฉ": " d o",
"ใฐ": " b a",
"ใณ": " b i",
"ใถ": " b u",
"ใน": " b e",
"ใผ": " b o",
"ใฑ": " p a",
"ใด": " p i",
"ใท": " p u",
"ใบ": " p e",
"ใฝ": " p o",
"ใ‚„": " y a",
"ใ‚†": " y u",
"ใ‚ˆ": " y o",
"ใ‚": " w a",
"ใ‚": " i",
"ใ‚‘": " e",
"ใ‚“": " N",
"ใฃ": " q",
# ใ“ใ“ใพใงใซๅ‡ฆ็†ใ•ใ‚Œใฆใชใ„ ใใƒใ…ใ‡ใ‰ ใฏใใฎใพใพๅคงๆ–‡ๅญ—ๆ‰ฑใ„
"ใ": " a",
"ใƒ": " i",
"ใ…": " u",
"ใ‡": " e",
"ใ‰": " o",
"ใ‚Ž": " w a",
# ้•ท้Ÿณใฎๅ‡ฆ็†
# for (pattern, replace_str) in JULIUS_LONG_VOWEL:
# text = pattern.sub(replace_str, text)
# text = text.replace("o u", "o:") # ใŠใ† -> ใŠใƒผใฎ้Ÿณไพฟ
"ใƒผ": ":",
"ใ€œ": ":",
"โˆ’": ":",
"-": ":",
# ใใฎไป–็‰นๅˆฅใชๅ‡ฆ็†
"ใ‚’": " o",
# ใ“ใ“ใพใงใซๅ‡ฆ็†ใ•ใ‚Œใฆใ„ใชใ„ใ‚…็ญ‰ใ‚‚ใใฎใพใพๅคงๆ–‡ๅญ—ๆ‰ฑใ„๏ผˆ่ฟฝๅŠ ๏ผ‰
"ใ‚ƒ": " y a",
"ใ‚…": " y u",
"ใ‚‡": " y o",
}
def hiragana2p(txt: str) -> str:
"""
Modification of `jaconv.hiragana2julius`.
- avoid using `:`, instead, `ใ‚ใƒผใƒผใƒผ` -> `a a a a`.
- avoid converting `o u` to `o o` (because the input is already actual `yomi`).
- avoid using `N` for `ใ‚“` (for compatibility)
- use `v` for `ใ‚”` related text.
- add bare `ใ‚ƒ` `ใ‚…` `ใ‚‡` to `y a` `y u` `y o` (for compatibility).
"""
result = []
skip = 0
for i in range(len(txt)):
if skip:
skip -= 1
continue
for length in range(3, 0, -1):
if txt[i : i + length] in hiragana_map:
result.append(hiragana_map[txt[i : i + length]])
skip = length - 1
break
txt = "".join(result)
txt = txt.strip()
txt = txt.replace(":+", ":")
# ใ“ใ“ใพใง`jaconv.hiragana2julius`ใจ้Ÿณไพฟๅ‡ฆ็†ใจ้•ท้Ÿณๅ‡ฆ็†ใ‚’ใฎใžใ„ใฆๅŒใ˜
# ใ“ใ“ใ‹ใ‚‰`k a:: k i:`โ†’`k a a a k i i`ใฎใ‚ˆใ†ใซ`:`ใฎๆ•ฐใ ใ‘็นฐใ‚Š่ฟ”ใ™ๅ‡ฆ็†
pattern = r"(\w)(:*)"
replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2))
txt = re.sub(pattern, replacement, txt)
txt = txt.replace("N", "n") # ไฟƒ้ŸณใฎNใ‚’nใซๅค‰ๆ›
return txt
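# Illustrative examples of the expansion above (traced by hand, not executed):
#   hiragana2p("ใใƒใ‚“")   -> "k i i n"   ("ใใƒ" maps to " k i:", the ":" then repeats the vowel)
#   hiragana2p("ใ‚ใƒผใƒผใƒผ") -> "a a a a"  (each "ใƒผ" becomes ":" and repeats the preceding vowel)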
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phonemes."""
text = text.strip()
if text == "ใƒผ":
return ["ใƒผ"]
elif text.startswith("ใƒผ"):
return ["ใƒผ"] + kata2phoneme(text[1:])
res = []
prev = None
while text:
if re.match(_MARKS, text):
res.append(text)
text = text[1:]
continue
if text.startswith("ใƒผ"):
if prev:
res.append(prev[-1])
text = text[1:]
continue
res += hiragana2p(jaconv.kata2hira(text)).split(" ")
break
# res = _COLON_RX.sub(":", res)
return res
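# Illustrative examples (traced by hand against the mapping above):
#   kata2phoneme("ใ‚ฝใƒผ") -> ["s", "o", "o"]
#   kata2phoneme("ใƒผใ‚ฝ") -> ["ใƒผ", "s", "o"]  (a leading "ใƒผ" is kept verbatim)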
_SYMBOL_TOKENS = set(list("ใƒปใ€ใ€‚๏ผŸ๏ผ"))
_NO_YOMI_TOKENS = set(list("ใ€Œใ€ใ€Žใ€โ€•๏ผˆ๏ผ‰๏ผป๏ผฝ[]"))
_MARKS = re.compile(
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
def text2sep_kata(text: str):
parsed = pyopenjtalk.run_frontend(text)
res = []
sep = []
for parts in parsed:
word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
"โ€™", ""
)
if yomi:
if re.match(_MARKS, yomi):
if len(word) > 1:
word = [replace_punctuation(i) for i in list(word)]
yomi = word
res += yomi
sep += word
continue
elif word not in rep_map.keys() and word not in rep_map.values():
word = ","
yomi = word
res.append(yomi)
else:
if word in _SYMBOL_TOKENS:
res.append(word)
elif word in ("ใฃ", "ใƒƒ"):
res.append("ใƒƒ")
elif word in _NO_YOMI_TOKENS:
pass
else:
res.append(word)
sep.append(word)
return sep, res, get_accent(parsed)
def get_accent(parsed):
labels = pyopenjtalk.make_label(parsed)
phonemes = []
accents = []
for n, label in enumerate(labels):
phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
if phoneme not in ["sil", "pau"]:
phonemes.append(phoneme.replace("cl", "q").lower())
else:
continue
a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
a2 = int(re.search(r"\+(\d+)\+", label).group(1))
if re.search(r"\-([^\+]*)\+", labels[n + 1]).group(1) in ["sil", "pau"]:
a2_next = -1
else:
a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
# Falling
if a1 == 0 and a2_next == a2 + 1:
accents.append(-1)
# Rising
elif a2 == 1 and a2_next == 2:
accents.append(1)
else:
accents.append(0)
return list(zip(phonemes, accents))
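# Note on the full-context label fields parsed above (standard HTS-style labels
# emitted by pyopenjtalk): "-x+" carries the current phoneme, and "/A:a1+a2+"
# carries a1 = mora position relative to the accent nucleus and a2 = mora index
# within the accent phrase. Hence a1 == 0 marks the nucleus (pitch falls on the
# next mora) and a2 == 1 followed by a2_next == 2 marks the initial rise.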
_ALPHASYMBOL_YOMI = {
"#": "ใ‚ทใƒฃใƒผใƒ—",
"%": "ใƒ‘ใƒผใ‚ปใƒณใƒˆ",
"&": "ใ‚ขใƒณใƒ‰",
"+": "ใƒ—ใƒฉใ‚น",
"-": "ใƒžใ‚คใƒŠใ‚น",
":": "ใ‚ณใƒญใƒณ",
";": "ใ‚ปใƒŸใ‚ณใƒญใƒณ",
"<": "ๅฐใชใ‚Š",
"=": "ใ‚คใ‚ณใƒผใƒซ",
">": "ๅคงใชใ‚Š",
"@": "ใ‚ขใƒƒใƒˆ",
"a": "ใ‚จใƒผ",
"b": "ใƒ“ใƒผ",
"c": "ใ‚ทใƒผ",
"d": "ใƒ‡ใ‚ฃใƒผ",
"e": "ใ‚คใƒผ",
"f": "ใ‚จใƒ•",
"g": "ใ‚ธใƒผ",
"h": "ใ‚จใ‚คใƒ",
"i": "ใ‚ขใ‚ค",
"j": "ใ‚ธใ‚งใƒผ",
"k": "ใ‚ฑใƒผ",
"l": "ใ‚จใƒซ",
"m": "ใ‚จใƒ ",
"n": "ใ‚จใƒŒ",
"o": "ใ‚ชใƒผ",
"p": "ใƒ”ใƒผ",
"q": "ใ‚ญใƒฅใƒผ",
"r": "ใ‚ขใƒผใƒซ",
"s": "ใ‚จใ‚น",
"t": "ใƒ†ใ‚ฃใƒผ",
"u": "ใƒฆใƒผ",
"v": "ใƒ–ใ‚ค",
"w": "ใƒ€ใƒ–ใƒชใƒฅใƒผ",
"x": "ใ‚จใƒƒใ‚ฏใ‚น",
"y": "ใƒฏใ‚ค",
"z": "ใ‚ผใƒƒใƒˆ",
"ฮฑ": "ใ‚ขใƒซใƒ•ใ‚ก",
"ฮฒ": "ใƒ™ใƒผใ‚ฟ",
"ฮณ": "ใ‚ฌใƒณใƒž",
"ฮด": "ใƒ‡ใƒซใ‚ฟ",
"ฮต": "ใ‚คใƒ—ใ‚ทใƒญใƒณ",
"ฮถ": "ใ‚ผใƒผใ‚ฟ",
"ฮท": "ใ‚คใƒผใ‚ฟ",
"ฮธ": "ใ‚ทใƒผใ‚ฟ",
"ฮน": "ใ‚คใ‚ชใ‚ฟ",
"ฮบ": "ใ‚ซใƒƒใƒ‘",
"ฮป": "ใƒฉใƒ ใƒ€",
"ฮผ": "ใƒŸใƒฅใƒผ",
"ฮฝ": "ใƒ‹ใƒฅใƒผ",
"ฮพ": "ใ‚ฏใ‚ตใ‚ค",
"ฮฟ": "ใ‚ชใƒŸใ‚ฏใƒญใƒณ",
"ฯ€": "ใƒ‘ใ‚ค",
"ฯ": "ใƒญใƒผ",
"ฯƒ": "ใ‚ทใ‚ฐใƒž",
"ฯ„": "ใ‚ฟใ‚ฆ",
"ฯ…": "ใ‚ฆใƒ—ใ‚ทใƒญใƒณ",
"ฯ†": "ใƒ•ใ‚กใ‚ค",
"ฯ‡": "ใ‚ซใ‚ค",
"ฯˆ": "ใƒ—ใ‚ตใ‚ค",
"ฯ‰": "ใ‚ชใƒกใ‚ฌ",
}
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ใƒ‰ใƒซ", "ยฅ": "ๅ††", "ยฃ": "ใƒใƒณใƒ‰", "โ‚ฌ": "ใƒฆใƒผใƒญ"}
_CURRENCY_RX = re.compile(r"([$ยฅยฃโ‚ฌ])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
def japanese_convert_numbers_to_words(text: str) -> str:
res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
return res
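# Illustrative examples (exact readings depend on the installed num2words version):
#   japanese_convert_numbers_to_words("1,000ๅ††") -> "ๅƒๅ††"   (separator stripped first)
#   japanese_convert_numbers_to_words("$100")    -> "็™พใƒ‰ใƒซ"  (currency symbol moved after the amount)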
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
def is_japanese_character(char):
    # Unicode ranges covering the Japanese writing system
    japanese_ranges = [
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x4E00, 0x9FFF),  # Kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        # Further kanji extension ranges can be added here as needed
    ]
    # Get the character's Unicode code point
    char_code = ord(char)
    # Check whether the code point falls in any Japanese range
for start, end in japanese_ranges:
if start <= char_code <= end:
return True
return False
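# e.g. is_japanese_character("ๆผข") -> True, is_japanese_character("A") -> False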
rep_map = {
"๏ผš": ",",
"๏ผ›": ",",
"๏ผŒ": ",",
"ใ€‚": ".",
"๏ผ": "!",
"๏ผŸ": "?",
"\n": ".",
"๏ผŽ": ".",
"โ€ฆ": "...",
"ยทยทยท": "...",
"ใƒปใƒปใƒป": "...",
"ยท": ",",
"ใƒป": ",",
"ใ€": ",",
"$": ".",
"โ€œ": "'",
"โ€": "'",
'"': "'",
"โ€˜": "'",
"โ€™": "'",
"๏ผˆ": "'",
"๏ผ‰": "'",
"(": "'",
")": "'",
"ใ€Š": "'",
"ใ€‹": "'",
"ใ€": "'",
"ใ€‘": "'",
"[": "'",
"]": "'",
"โ€”": "-",
"โˆ’": "-",
"๏ฝž": "-",
"~": "-",
"ใ€Œ": "'",
"ใ€": "'",
}
def replace_punctuation(text):
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
+ "".join(punctuation)
+ r"]+",
"",
replaced_text,
)
return replaced_text
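# Illustrative example, assuming the shared `punctuation` list includes "," and "!":
#   replace_punctuation("ใ“ใ‚“ใซใกใฏ๏ผŒไธ–็•Œ๏ผ") -> "ใ“ใ‚“ใซใกใฏ,ไธ–็•Œ!"
# Full-width marks are mapped via rep_map; anything outside the kept Unicode
# ranges and the punctuation list is dropped entirely.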
def text_normalize(text):
res = unicodedata.normalize("NFKC", text)
res = japanese_convert_numbers_to_words(res)
# res = "".join([i for i in res if is_japanese_character(i)])
res = replace_punctuation(res)
res = res.replace("ใ‚™", "")
return res
def distribute_phone(n_phone, n_word):
phones_per_word = [0] * n_word
for task in range(n_phone):
min_tasks = min(phones_per_word)
min_index = phones_per_word.index(min_tasks)
phones_per_word[min_index] += 1
return phones_per_word
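# e.g. distribute_phone(5, 2) -> [3, 2]: each phone goes to the word with the
# fewest phones so far, so per-word counts differ by at most one.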
def handle_long(sep_phonemes):
for i in range(len(sep_phonemes)):
if sep_phonemes[i][0] == "ใƒผ":
sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
if "ใƒผ" in sep_phonemes[i]:
for j in range(len(sep_phonemes[i])):
if sep_phonemes[i][j] == "ใƒผ":
sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
return sep_phonemes
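# e.g. handle_long([["k", "i"], ["ใƒผ", "ใƒผ"]]) -> [["k", "i"], ["i", "i"]]:
# every "ใƒผ" is replaced by the phoneme immediately before it, across word
# boundaries as well.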
tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese-char-wwm")
def align_tones(phones, tones):
res = []
for pho in phones:
temp = [0] * len(pho)
for idx, p in enumerate(pho):
if len(tones) == 0:
break
if p == tones[0][0]:
temp[idx] = tones[0][1]
if idx > 0:
temp[idx] += temp[idx - 1]
tones.pop(0)
temp = [0] + temp
temp = temp[:-1]
if -1 in temp:
temp = [i + 1 for i in temp]
res.append(temp)
res = [i for j in res for i in j]
assert not any([i < 0 for i in res]) and not any([i > 1 for i in res])
return res
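# align_tones consumes the (phoneme, accent) pairs from get_accent while
# walking the per-word phoneme lists: matching accents are accumulated along
# each word, shifted right by one phoneme, and any word containing a fall (-1)
# is lifted by one, so the flattened result contains only 0 (low) / 1 (high).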
def rearrange_tones(tones, phones):
res = [0] * len(tones)
for i in range(len(tones)):
if i == 0:
            if phones[i] not in punctuation:  # check the phone, not the tone, against punctuation
res[i] = 1
elif tones[i] == prev:
if phones[i] in punctuation:
res[i] = 0
else:
res[i] = 1
elif tones[i] > prev:
res[i] = 2
elif tones[i] < prev:
res[i - 1] = 3
res[i] = 1
prev = tones[i]
return res
def g2p(norm_text):
sep_text, sep_kata, acc = text2sep_kata(norm_text)
sep_tokenized = []
for i in sep_text:
if i not in punctuation:
sep_tokenized.append(tokenizer.tokenize(i))
else:
sep_tokenized.append([i])
sep_phonemes = handle_long([kata2phoneme(i) for i in sep_kata])
# ๅผ‚ๅธธๅค„็†๏ผŒMeCabไธ่ฎค่ฏ†็š„่ฏ็š„่ฏไผšไธ€่ทฏไผ ๅˆฐ่ฟ™้‡Œๆฅ๏ผŒ็„ถๅŽ็‚ธๆŽ‰ใ€‚็›ฎๅ‰ๆฅ็œ‹ๅชๆœ‰้‚ฃไบ›่ถ…็บง็จ€ๆœ‰็š„็”Ÿๅƒป่ฏไผšๅ‡บ็Žฐ่ฟ™็งๆƒ…ๅ†ต
for i in sep_phonemes:
for j in i:
assert j in symbols, (sep_text, sep_kata, sep_phonemes)
tones = align_tones(sep_phonemes, acc)
word2ph = []
for token, phoneme in zip(sep_tokenized, sep_phonemes):
phone_len = len(phoneme)
word_len = len(token)
        word2ph += distribute_phone(phone_len, word_len)
phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
# tones = [0] + rearrange_tones(tones, phones[1:-1]) + [0]
tones = [0] + tones + [0]
word2ph = [1] + word2ph + [1]
assert len(phones) == len(tones)
return phones, tones, word2ph
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese")
text = "hello,ใ“ใ‚“ใซใกใฏใ€ไธ–็•Œใƒผ๏ผโ€ฆโ€ฆ"
from text.japanese_bert import get_bert_feature
text = text_normalize(text)
print(text)
phones, tones, word2ph = g2p(text)
bert = get_bert_feature(text, word2ph)
print(phones, tones, word2ph, bert.shape)