File size: 1,091 Bytes
e0cfda2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be80077
e0cfda2
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import logging
import re

import opencc


# Two-letter dialect code -> lexicon name. The lexicon name is appended to
# "chinese_dialect_lexicons/" when the OpenCC converters are created.
dialects = dict(
    SZ='suzhou',
    WX='wuxi',
    CZ='changzhou',
    HZ='hangzhou',
    SX='shaoxing',
    NB='ningbo',
    JJ='jingjiang',
    YX='yixing',
    JD='jiading',
    ZR='zhenru',
    PH='pinghu',
    TX='tongxiang',
    JS='jiashan',
    HN='xiashi',
    LP='linping',
    XS='xiaoshan',
    FY='fuyang',
    RA='ruao',
    CX='cixi',
    SM='sanmen',
    TT='tiantai',
    WZ='wenzhou',
    SC='suichang',
    YB='youbu',
)

# Lexicon name -> OpenCC converter, populated once at import time.
converters = {}

for dialect in dialects.values():
    try:
        converters[dialect] = opencc.OpenCC("chinese_dialect_lexicons/"+dialect)
    except Exception as exc:
        # Best effort: a lexicon that cannot be loaded is simply left out of
        # `converters`. Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt/SystemExit propagate, and the debug log keeps the
        # failure reason discoverable without adding noise by default.
        logging.getLogger(__name__).debug(
            "OpenCC lexicon %r could not be loaded: %s", dialect, exc)


def ngu_dialect_to_ipa(text, dialect):
    """Convert *text* with the OpenCC lexicon for *dialect* and tidy punctuation.

    Args:
        text: Input string to convert.
        dialect: Dialect code used as a key of the module-level ``dialects``
            mapping (for example ``'SZ'``).

    Returns:
        The converted string with ``-`` removed, ``$`` turned into a space,
        punctuation rewritten to ASCII followed by a single space, and
        trailing whitespace stripped.

    Raises:
        KeyError: If *dialect* is not a key of ``dialects``, or if no
            converter was loaded for the corresponding lexicon.
    """
    lexicon = dialects[dialect]
    text = converters[lexicon].convert(text).replace('-', '').replace('$', ' ')
    # Enumeration comma, semicolon and colon (ASCII or full-width) all
    # become a plain comma first, so the next step can space them uniformly.
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    # The question mark sits in a character class so it is matched literally;
    # a bare `?` after `\s*` is a lazy quantifier and matches everywhere.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    # Drop the trailing space left behind by a final punctuation mark.
    return text.rstrip()