# Spaces: Running on T4
# Punctuation marks kept as symbols. The inverted marks "¿" and "¡" are
# included in addition to the base set of seven marks.
punctuation = ["!", "?", "…", ",", ".", "'", "-", "¿", "¡"]
# Punctuation plus the two special tokens "SP" and "UNK".
pu_symbols = punctuation + ["SP", "UNK"]
# Padding symbol; placed first in the combined `symbols` list.
pad = "_"
# Chinese symbol inventory.
zh_symbols = [
    "E",
    "En",
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "b",
    "c",
    "ch",
    "d",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "f",
    "g",
    "h",
    "i",
    "i0",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "in",
    "ing",
    "iong",
    "ir",
    "iu",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "ong",
    "ou",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "ui",
    "un",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
    "w",
    "x",
    "y",
    "z",
    "zh",
    "AA",
    "EE",
    "OO",
]
# Number of tone slots for Chinese; also the tone offset at which the next
# language starts in `language_tone_start_map`.
num_zh_tones = 6
# Japanese symbol inventory (entries ending in ":" are separate symbols
# from their bare counterparts, e.g. "a" vs "a:").
ja_symbols = [
    "N",
    "a",
    "a:",
    "b",
    "by",
    "ch",
    "d",
    "dy",
    "e",
    "e:",
    "f",
    "g",
    "gy",
    "h",
    "hy",
    "i",
    "i:",
    "j",
    "k",
    "ky",
    "m",
    "my",
    "n",
    "ny",
    "o",
    "o:",
    "p",
    "py",
    "q",
    "r",
    "ry",
    "s",
    "sh",
    "t",
    "ts",
    "ty",
    "u",
    "u:",
    "w",
    "y",
    "z",
    "zy",
]
# Number of tone slots for Japanese.
num_ja_tones = 1
# English symbol inventory.
en_symbols = [
    "aa",
    "ae",
    "ah",
    "ao",
    "aw",
    "ay",
    "b",
    "ch",
    "d",
    "dh",
    "eh",
    "er",
    "ey",
    "f",
    "g",
    "hh",
    "ih",
    "iy",
    "jh",
    "k",
    "l",
    "m",
    "n",
    "ng",
    "ow",
    "oy",
    "p",
    "r",
    "s",
    "sh",
    "t",
    "th",
    "uh",
    "uw",
    # NOTE(review): "V" is the only uppercase entry and there is no
    # lowercase "v" in this list -- confirm this is intentional.
    "V",
    "w",
    "y",
    "z",
    "zh",
]
# Number of tone slots for English.
num_en_tones = 4
# Korean symbol inventory: Hangul jamo plus a handful of ASCII marks.
kr_symbols = [
    'ᄌ', 'ᅥ', 'ᆫ', 'ᅦ', 'ᄋ', 'ᅵ', 'ᄅ', 'ᅴ', 'ᄀ', 'ᅡ',
    'ᄎ', 'ᅪ', 'ᄑ', 'ᅩ', 'ᄐ', 'ᄃ', 'ᅢ', 'ᅮ', 'ᆼ', 'ᅳ',
    'ᄒ', 'ᄆ', 'ᆯ', 'ᆷ', 'ᄂ', 'ᄇ', 'ᄉ', 'ᆮ', 'ᄁ', 'ᅬ',
    'ᅣ', 'ᄄ', 'ᆨ', 'ᄍ', 'ᅧ', 'ᄏ', 'ᆸ', 'ᅭ', '(', 'ᄊ',
    ')', 'ᅲ', 'ᅨ', 'ᄈ', 'ᅱ', 'ᅯ', 'ᅫ', 'ᅰ', 'ᅤ', '~',
    '\\', '[', ']', '/', '^', ':', 'ㄸ', '*',
]
# Number of tone slots for Korean.
num_kr_tones = 1
# Spanish symbol inventory: ASCII letters followed by IPA characters and
# the marks "ˈ", "ˌ" and "ː".
es_symbols = [
    "N",
    "Q",
    "a",
    "b",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
    "ɑ",
    "æ",
    "ʃ",
    "ʑ",
    "ç",
    "ɯ",
    "ɪ",
    "ɔ",
    "ɛ",
    "ɹ",
    "ð",
    "ə",
    "ɫ",
    "ɥ",
    "ɸ",
    "ʊ",
    "ɾ",
    "ʒ",
    "θ",
    "β",
    "ŋ",
    "ɦ",
    "ɡ",
    "r",
    "ɲ",
    "ʝ",
    "ɣ",
    "ʎ",
    "ˈ",
    "ˌ",
    "ː",
]
# Number of tone slots for Spanish.
num_es_tones = 1
# French symbol inventory. The first entry is written as the escape
# "\u0303" rather than as a literal character.
fr_symbols = [
    "\u0303",
    "œ",
    "ø",
    "ʁ",
    "ɒ",
    "ʌ",
    "ɜ",
    "ɐ",
]
# Number of tone slots for French.
num_fr_tones = 1
# German symbol inventory.
# NOTE(review): the second entry appears to be a lone combining diacritic
# written as a literal character (it renders attached to the quote) --
# confirm the exact code point before editing this line.
de_symbols = [
    "ʏ",
    "̩",
]
# Number of tone slots for German.
num_de_tones = 1
# Russian symbol inventory. "^" also appears in kr_symbols; the combined
# symbol list is built from a set, so the duplicate is harmless.
ru_symbols = [
    "ɭ",
    "ʲ",
    "ɕ",
    "\"",
    "ɵ",
    "^",
    "ɬ",
]
# Number of tone slots for Russian.
num_ru_tones = 1
# Combine all per-language inventories into one de-duplicated, sorted list.
normal_symbols = sorted(
    set(
        zh_symbols
        + ja_symbols
        + en_symbols
        + kr_symbols
        + es_symbols
        + fr_symbols
        + de_symbols
        + ru_symbols
    )
)
# Final symbol table: pad first, then the sorted symbols, then punctuation
# and the special tokens. A symbol's position in this list is its id.
symbols = [pad] + normal_symbols + pu_symbols
# Ids (positions in `symbols`) of every punctuation/special token.
sil_phonemes_ids = [symbols.index(symbol) for symbol in pu_symbols]
# Total number of tone slots: the sum of every language's tone count.
num_tones = (
    num_zh_tones
    + num_ja_tones
    + num_en_tones
    + num_kr_tones
    + num_es_tones
    + num_fr_tones
    + num_de_tones
    + num_ru_tones
)
# Language code -> language id. "SP" shares id 5 with "ES"; "ZH_MIX_EN" has
# its own id. There are no entries for German or Russian here.
language_id_map = {
    "ZH": 0,
    "JP": 1,
    "EN": 2,
    "ZH_MIX_EN": 3,
    "KR": 4,
    "ES": 5,
    "SP": 5,
    "FR": 6,
}
# NOTE(review): this counts dictionary keys (8), not distinct ids (7),
# because "ES" and "SP" share an id. Left unchanged on purpose -- anything
# sized from this value would change if it were "corrected".
num_languages = len(language_id_map.keys())

# Language code -> first tone index for that language. Each language starts
# after the tone slots of the languages before it; "ZH_MIX_EN" shares the
# Chinese start and "SP" shares the Spanish start.
language_tone_start_map = {
    "ZH": 0,
    "ZH_MIX_EN": 0,
    "JP": num_zh_tones,
    "EN": num_zh_tones + num_ja_tones,
    "KR": num_zh_tones + num_ja_tones + num_en_tones,
    "ES": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones,
    "SP": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones,
    "FR": num_zh_tones + num_ja_tones + num_en_tones + num_kr_tones + num_es_tones,
}
if __name__ == "__main__":
    # Debug aid: print the symbols shared by the Chinese and English
    # inventories, in sorted order.
    zh_set = set(zh_symbols)
    en_set = set(en_symbols)
    print(sorted(zh_set & en_set))