Spaces:
Running
Running
File size: 2,496 Bytes
e82212c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
from text import (
    japanese,
    cleaned_text_to_sequence,
    english,
    korean,
    cantonese,
)

# Debug aid: print the file the `japanese` module was loaded from.
print(japanese.__file__)
import os

# The "version" environment variable (default "v1") selects which Chinese
# module and which symbol table are bound to `chinese` and `symbols`.
if os.environ.get("version", "v1") != "v1":
    from text import chinese2 as chinese
    from text.symbols2 import symbols
else:
    from text import chinese
    from text.symbols import symbols

print("THIS IS IN CLEANER.py")
# Language code -> module used by clean_text / clean_special for that language.
language_module_map = {
    "zh": chinese,
    "ja": japanese,
    "en": english,
    "ko": korean,
    "yue": cantonese,
}

# (marker character, language code, replacement phone) triples. When the
# marker occurs in text of the given language, clean_text delegates to
# clean_special with that marker and replacement phone.
special = [
    # ("%", "zh", "SP"),
    ("¥", "zh", "SP2"),
    ("^", "zh", "SP3"),
    # ('@', 'zh', "SP4")  # dropped; kept consistent with the second version
]
def clean_text(text, language):
    """Normalize *text* and convert it to phones for *language*.

    A language code that is not a key of ``language_module_map`` falls back
    to "en" with the text replaced by a single space. If the text contains a
    marker from ``special`` registered for this language, the work is handed
    to ``clean_special`` and its result is returned directly.

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple. ``word2ph`` is ``None``
        for every language except "zh" and "yue".

    Raises:
        AssertionError: if the zh/yue length checks fail, or if a produced
            phone is not in ``symbols`` (the offending phone is the message).
    """
    print('this is clean_text')

    # Unknown language: degrade to English with blank text.
    if language not in language_module_map:
        language, text = "en", " "

    for marker, marker_lang, sp_symbol in special:
        if marker in text and language == marker_lang:
            return clean_special(text, language, marker, sp_symbol)

    module = language_module_map[language]
    # Normalization is optional; modules without text_normalize get the
    # text passed through unchanged.
    norm_text = module.text_normalize(text) if hasattr(module, "text_normalize") else text

    if language in ("zh", "yue"):
        # For these languages g2p also returns word2ph; the checks require
        # sum(word2ph) == len(phones) and one word2ph entry per character
        # of the normalized text.
        phones, word2ph = module.g2p(norm_text)
        assert len(phones) == sum(word2ph)
        assert len(norm_text) == len(word2ph)
    else:
        phones = module.g2p(norm_text)
        word2ph = None
        if language == "en" and len(phones) < 4:
            # Left-pad English output with "," up to four phones.
            phones = [','] * (4 - len(phones)) + phones

    for phone in phones:
        assert phone in symbols, phone
    return phones, word2ph, norm_text
def clean_special(text, language, special_s, target_symbol):
    """Handle the special silence-segment (SP) marker symbols.

    Every occurrence of *special_s* in *text* is replaced by "," before
    normalization and g2p. Afterwards every "," phone in the g2p output is
    emitted as *target_symbol*, regardless of where that "," came from.

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple, where ``word2ph`` is the
        second element of the g2p result.

    Raises:
        AssertionError: if a phone produced by g2p is not in ``symbols``.
    """
    replaced = text.replace(special_s, ",")
    module = language_module_map[language]
    norm_text = module.text_normalize(replaced)
    g2p_result = module.g2p(norm_text)

    mapped = []
    for phone in g2p_result[0]:
        assert phone in symbols
        mapped.append(target_symbol if phone == "," else phone)
    return mapped, g2p_result[1], norm_text
def text_to_sequence(text, language):
    """Clean *text* for *language* and convert the resulting phones.

    Args:
        text: Raw input text.
        language: Language code, forwarded to ``clean_text``.

    Returns:
        Whatever ``cleaned_text_to_sequence`` returns for the phone list.

    Raises:
        AssertionError: propagated from ``clean_text`` when its checks fail.
    """
    # clean_text takes two required arguments and returns a
    # (phones, word2ph, norm_text) tuple; the earlier one-argument call
    # raised TypeError on every invocation.
    phones, _word2ph, _norm_text = clean_text(text, language)
    # NOTE(review): assumes cleaned_text_to_sequence expects only the phone
    # list, not the whole tuple -- confirm against the `text` package.
    return cleaned_text_to_sequence(phones)
if __name__ == "__main__":
    # Manual smoke run: print the cleaner output for a sample Chinese
    # sentence.
    demo_output = clean_text("你好%啊啊啊额、还是到付红四方。", "zh")
    print(demo_output)
|