File size: 2,179 Bytes
c5ed230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5854014
 
 
 
 
 
 
 
 
c5ed230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
import cn2an
import opencc
import config

# Module-level OpenCC converter, built once at import time from the 'jyutjyu_2'
# configuration found under config.ABS_PATH; cantonese_to_ipa() passes text
# through converter.convert().
# NOTE(review): presumably this lexicon maps Chinese characters to Cantonese IPA
# and emits '-' / '$' as separators (cantonese_to_ipa removes '-' and turns '$'
# into a space) — confirm against the lexicon files.
converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/jyutjyu_2')

# Lookup table for spelling out capital Latin letters: one
# (compiled single-letter regex, IPA string with tone letters) tuple per
# letter, in A-Z order. Consumed by latin_to_ipa().
_latin_to_ipa = [
    (re.compile(letter), ipa)
    for letter, ipa in (
        ('A', 'ei˥'),
        ('B', 'biː˥'),
        ('C', 'siː˥'),
        ('D', 'tiː˥'),
        ('E', 'iː˥'),
        ('F', 'e˥fuː˨˩'),
        ('G', 'tsiː˥'),
        ('H', 'ɪk̚˥tsʰyː˨˩'),
        ('I', 'ɐi˥'),
        ('J', 'tsei˥'),
        ('K', 'kʰei˥'),
        ('L', 'e˥llou˨˩'),
        ('M', 'ɛːm˥'),
        ('N', 'ɛːn˥'),
        ('O', 'ou˥'),
        ('P', 'pʰiː˥'),
        ('Q', 'kʰiːu˥'),
        ('R', 'aː˥lou˨˩'),
        ('S', 'ɛː˥siː˨˩'),
        ('T', 'tʰiː˥'),
        ('U', 'juː˥'),
        ('V', 'wiː˥'),
        ('W', 'tʊk̚˥piː˥juː˥'),
        ('X', 'ɪk̚˥siː˨˩'),
        ('Y', 'waːi˥'),
        ('Z', 'iː˨sɛːt̚˥'),
    )
]

# Ordered (compiled regex, replacement) rules that spell out arithmetic symbols
# in Chinese. Patterns are raw strings so that regex escapes such as `\.`, `\+`
# and `\*` are not parsed as (invalid) Python string escapes.
# '-', 'x' and '*' are only rewritten when they sit between two digit runs;
# '+', '×', '÷', '=' and '≠' are rewritten wherever they occur.
_symbols_to_chinese = [(re.compile(pattern), replacement) for pattern, replacement in [
    (r'([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
    (r'([0-9]+)/([0-9]+)', r'\2分之\1'),
    (r'\+', r'加'),
    (r'([0-9]+)-([0-9]+)', r'\1减\2'),
    (r'×', r'乘以'),
    (r'([0-9]+)x([0-9]+)', r'\1乘以\2'),
    (r'([0-9]+)\*([0-9]+)', r'\1乘以\2'),
    (r'÷', r'除以'),
    (r'=', r'等于'),
    (r'≠', r'不等于'),
]]


def symbols_to_chinese(text):
    """Replace arithmetic symbols in *text* with their Chinese wording.

    Every rule in ``_symbols_to_chinese`` is applied to the whole string, one
    after another in list order, so later rules see the output of earlier ones.

    Args:
        text: Input string; digits are left as digits (only the symbols and
            their position relative to the numbers change).

    Returns:
        The string with percentages, fractions and arithmetic operators
        spelled out, e.g. ``'50%'`` -> ``'百分之50'`` and ``'1/2'`` -> ``'2分之1'``.
    """
    for regex, replacement in _symbols_to_chinese:
        text = regex.sub(replacement, text)
    return text


def number_to_cantonese(text):
    """Rewrite every number found in *text* using ``cn2an.an2cn``.

    A number is a run of digits, optionally followed by a '.' and more digits
    (for example ``12`` or ``3.14``). Each matched number string is handed to
    ``cn2an.an2cn`` and replaced by whatever that call returns; all other
    characters are left untouched.
    """
    number_pattern = re.compile(r'\d+(?:\.?\d+)?')

    def _spell_out(match):
        # The whole matched numeral (integer or decimal) is converted at once.
        return cn2an.an2cn(match.group())

    return number_pattern.sub(_spell_out, text)


def latin_to_ipa(text):
    """Substitute each capital Latin letter in *text* with its IPA spelling.

    The (pattern, ipa) pairs of ``_latin_to_ipa`` are applied in table order,
    each one across the whole string. Characters that no pattern matches are
    returned unchanged.
    """
    result = text
    for letter_pattern, ipa in _latin_to_ipa:
        result = letter_pattern.sub(ipa, result)
    return result


def cantonese_to_ipa(text):
    """Convert Cantonese text to a cleaned-up IPA string.

    Pipeline, in order:
      1. Spell out arithmetic symbols (``symbols_to_chinese``).
      2. Upper-case the text and spell out numbers (``number_to_cantonese``).
      3. Run the OpenCC ``converter``; drop '-' and turn '$' into a space in
         its output.
      4. Replace every remaining capital Latin letter with its IPA spelling
         followed by a space (``latin_to_ipa``).
      5. Normalise punctuation to ASCII marks followed by a single space and
         strip trailing whitespace.

    Args:
        text: Input string (Chinese characters, digits, Latin letters,
            punctuation).

    Returns:
        The converted string with ', ', '. ', '? ' and '! ' as the only
        punctuation spacing and no trailing whitespace.
    """
    text = symbols_to_chinese(text)
    text = number_to_cantonese(text.upper())
    text = converter.convert(text).replace('-', '').replace('$', ' ')
    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)

    # Punctuation normalisation. Full-width forms are written as \uXXXX
    # escapes (understood by the re module) so they survive tools that fold
    # full-width characters to ASCII:
    #   U+FF1B ';'  U+FF1A ':'  U+FF0C ','  U+FF1F '?'  U+FF01 '!'
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    # The question mark must sit inside a character class: a bare `\s*?\s*`
    # is a lazy quantifier that matches the empty string at every position
    # and would insert '? ' between all characters.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    # Drop trailing whitespace, including the space added after a final mark.
    text = re.sub(r'\s*$', '', text)
    return text