ru / tokenizer /g2p /g2p.py
TeraSpace's picture
Upload 13 files
dfc143a
# Letters that, when they directly follow a consonant from ``softhard_cons``,
# cause that consonant to be emitted with a "j" suffix (see ``pallatize``).
softletters = set(u"яёюиье")

# Symbols after which one of the letters я/ю/е/ё is emitted with a leading
# "j" phone (see ``convert_vowels``).
startsyl = set(u"#ъьаяоёуюэеиы-")

# Symbols that are removed from the final phone sequence (see ``convert``).
others = {"#", "+", "-", u"ь", u"ъ"}

# Consonants that have both a plain form and a "j"-suffixed form.
softhard_cons = {
    u"б": u"b",
    u"в": u"v",
    u"г": u"g",
    u"Г": u"g",
    u"д": u"d",
    u"з": u"z",
    u"к": u"k",
    u"л": u"l",
    u"м": u"m",
    u"н": u"n",
    u"п": u"p",
    u"р": u"r",
    u"с": u"s",
    u"т": u"t",
    u"ф": u"f",
    u"х": u"h",
}

# Consonants that always map to a single fixed phone, whatever follows them.
other_cons = {
    u"ж": u"zh",
    u"ц": u"c",
    u"ч": u"ch",
    u"ш": u"sh",
    u"щ": u"sch",
    u"й": u"j",
}

# Vowel letter -> base vowel phone; a stress digit is appended by
# ``convert_vowels``.  Paired letters (а/я, у/ю, о/ё, э/е) share one phone.
vowels = {
    u"а": u"a",
    u"я": u"a",
    u"у": u"u",
    u"ю": u"u",
    u"о": u"o",
    u"ё": u"o",
    u"э": u"e",
    u"е": u"e",
    u"и": u"i",
    u"ы": u"y",
}
def pallatize(phones):
    """Replace consonant letters in *phones* with phone symbols, in place.

    *phones* is a mutable sequence of ``(symbol, stress)`` pairs.  Every
    entry except the last one is examined:

    * a letter found in ``softhard_cons`` becomes its phone, with "j"
      appended when the symbol of the next entry is in ``softletters``;
    * a letter found in ``other_cons`` becomes its fixed phone.

    Rewritten entries always carry stress ``0``.  Entries that match neither
    table are left untouched.  Nothing is returned.
    """
    # The last entry is never rewritten because it has no successor to
    # inspect; range() is empty for sequences shorter than two items.
    for idx in range(len(phones) - 1):
        letter = phones[idx][0]
        if letter in softhard_cons:
            # phones[idx + 1] has not been rewritten yet, so its symbol is
            # still the original letter.
            suffix = "j" if phones[idx + 1][0] in softletters else ""
            phones[idx] = (softhard_cons[letter] + suffix, 0)
        if letter in other_cons:
            phones[idx] = (other_cons[letter], 0)
def convert_vowels(phones):
    """Turn ``(symbol, stress)`` pairs into a flat list of phone strings.

    For each entry of *phones*:

    * if the previous entry's symbol is in ``startsyl`` and the current
      symbol is one of я/ю/е/ё, a separate ``"j"`` phone is emitted first;
    * a symbol found in ``vowels`` is emitted as its vowel phone followed by
      the entry's stress value converted with ``str`` (e.g. ``"a1"``);
    * any other symbol is emitted unchanged.

    Returns the new list; *phones* itself is not modified.
    """
    # Letters that get a leading "j" phone after a symbol from ``startsyl``.
    iotated = set(u"яюеё")

    result = []
    previous = ""  # no predecessor yet; "" is not a member of ``startsyl``
    for entry in phones:
        symbol = entry[0]
        if previous in startsyl and symbol in iotated:
            result.append("j")
        result.append(
            vowels[symbol] + str(entry[1]) if symbol in vowels else symbol
        )
        previous = symbol
    return result
def convert(stressword):
    """Convert a stress-marked word into a space-separated phone string.

    *stressword* is a string in which a ``"+"`` marks the character that
    immediately follows it as stressed.  The word is wrapped in ``"#"``
    boundary symbols, consonants are mapped by ``pallatize``, vowels by
    ``convert_vowels``, and every symbol listed in ``others`` is dropped
    from the result.
    """
    # Pair every character with a stress flag; "+" itself is consumed and
    # sets the flag for the next character only.
    marked = []
    pending_stress = 0
    for char in "#" + stressword + "#":
        if char == "+":
            pending_stress = 1
            continue
        marked.append((char, pending_stress))
        pending_stress = 0

    # Consonants are rewritten in place before vowels are handled, because
    # the vowel pass emits already-converted entries as-is.
    pallatize(marked)

    # Map vowels, then strip boundary and sign symbols from the output.
    return " ".join(
        phone for phone in convert_vowels(marked) if phone not in others
    )