File size: 1,171 Bytes
b8b70ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Padding symbol and the punctuation inventory shared by every language.
_pad = '_'
_punc = ";:,.!?¡¿—-…«»'“”~() "

# Strings of consecutive code points taken from the Unicode Hangul Jamo
# block; the upper bound of each range is exclusive.
_jamo_leads = "".join(map(chr, range(0x1100, 0x1113)))
_jamo_vowels = "".join(map(chr, range(0x1161, 0x1176)))
_jamo_tails = "".join(map(chr, range(0x11A8, 0x11C3)))
_kor_characters = "".join((_jamo_leads, _jamo_vowels, _jamo_tails))

# Multi-letter symbols registered for 'en_US'.
# NOTE(review): presumably CMUdict/ARPAbet phoneme labels, going by the
# variable name -- confirm.
_cmu_characters = [
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY',
    'B', 'CH', 'D', 'DH',
    'EH', 'ER', 'EY',
    'F', 'G', 'HH',
    'IH', 'IY',
    'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OY',
    'P', 'R', 'S', 'SH', 'T', 'TH',
    'UH', 'UW',
    'V', 'W', 'Y', 'Z', 'ZH',
]


# Symbol lists keyed by language code.  'common' holds the pad symbol
# followed by each punctuation character; the helper functions in this
# module prepend it to the list of the requested language.
lang_to_symbols = dict(
    common=[_pad, *_punc],
    ko_KR=[*_kor_characters],
    en_US=_cmu_characters,
)

def lang_to_dict(lang):
    """Return a symbol -> index mapping for ``lang``.

    The index of a symbol is its position in the 'common' symbol list
    followed by the list registered under ``lang`` in ``lang_to_symbols``.
    If a symbol occurs more than once, its last position is kept.

    Raises:
        KeyError: if ``lang`` is not a key of ``lang_to_symbols``.
    """
    shared = lang_to_symbols['common']
    specific = lang_to_symbols[lang]
    vocabulary = shared + specific
    return dict(zip(vocabulary, range(len(vocabulary))))

def lang_to_dict_inverse(lang):
    """Return an index -> symbol mapping for ``lang``.

    Indices enumerate the 'common' symbol list followed by the list
    registered under ``lang`` in ``lang_to_symbols``, starting at 0.

    Raises:
        KeyError: if ``lang`` is not a key of ``lang_to_symbols``.
    """
    shared = lang_to_symbols['common']
    specific = lang_to_symbols[lang]
    return dict(enumerate(shared + specific))

def symbol_len(lang):
    """Return how many symbols ``lang`` uses, 'common' symbols included.

    Raises:
        KeyError: if ``lang`` is not a key of ``lang_to_symbols``.
    """
    shared = lang_to_symbols['common']
    specific = lang_to_symbols[lang]
    # The length of the concatenation equals the sum of both lengths.
    return len(shared) + len(specific)