ORI-Muchim committed on
Commit
f3e1830
1 Parent(s): 4954e6e

Delete text/korean.py

Browse files
Files changed (1) hide show
  1. text/korean.py +0 -203
text/korean.py DELETED
@@ -1,203 +0,0 @@
1
- import re
2
- from jamo import h2j, j2hcj
3
- import ko_pron
4
-
5
-
6
# Space-separated Korean classifiers (counter words) that are preceded by
# pure (native) Korean numerals rather than Sino-Korean ones.
_korean_classifiers = (
    '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 '
    '발 발짝 방 번 벌 보루 살 수 술 시 쌈 '
    '움큼 정 짝 채 척 첩 축 켤레 톨 통'
)
8
-
9
# (compiled pattern, replacement) pairs. Each pattern matches one compound
# jamo; the replacement is that jamo written as a sequence of simpler jamo.
_hangul_divided = [
    (re.compile(compound), parts)
    for compound, parts in (
        ('ㄳ', 'ㄱㅅ'),
        ('ㄵ', 'ㄴㅈ'),
        ('ㄶ', 'ㄴㅎ'),
        ('ㄺ', 'ㄹㄱ'),
        ('ㄻ', 'ㄹㅁ'),
        ('ㄼ', 'ㄹㅂ'),
        ('ㄽ', 'ㄹㅅ'),
        ('ㄾ', 'ㄹㅌ'),
        ('ㄿ', 'ㄹㅍ'),
        ('ㅀ', 'ㄹㅎ'),
        ('ㅄ', 'ㅂㅅ'),
        ('ㅘ', 'ㅗㅏ'),
        ('ㅙ', 'ㅗㅐ'),
        ('ㅚ', 'ㅗㅣ'),
        ('ㅝ', 'ㅜㅓ'),
        ('ㅞ', 'ㅜㅔ'),
        ('ㅟ', 'ㅜㅣ'),
        ('ㅢ', 'ㅡㅣ'),
        ('ㅑ', 'ㅣㅏ'),
        ('ㅒ', 'ㅣㅐ'),
        ('ㅕ', 'ㅣㅓ'),
        ('ㅖ', 'ㅣㅔ'),
        ('ㅛ', 'ㅣㅗ'),
        ('ㅠ', 'ㅣㅜ'),
    )
]
36
-
37
# (compiled pattern, replacement) pairs. Each pattern matches one Latin
# letter in either case; the replacement is the Hangul spelling used for
# that letter.
_latin_to_hangul = [
    (re.compile(letter, re.IGNORECASE), hangul)
    for letter, hangul in (
        ('a', '에이'),
        ('b', '비'),
        ('c', '시'),
        ('d', '디'),
        ('e', '이'),
        ('f', '에프'),
        ('g', '지'),
        ('h', '에이치'),
        ('i', '아이'),
        ('j', '제이'),
        ('k', '케이'),
        ('l', '엘'),
        ('m', '엠'),
        ('n', '엔'),
        ('o', '오'),
        ('p', '피'),
        ('q', '큐'),
        ('r', '아르'),
        ('s', '에스'),
        ('t', '티'),
        ('u', '유'),
        ('v', '브이'),
        ('w', '더블유'),
        ('x', '엑스'),
        ('y', '와이'),
        ('z', '제트'),
    )
]
66
-
67
# (compiled pattern, replacement) pairs applied in order to rewrite IPA
# symbols into the simplified "lazy IPA" notation. The last entries map
# combining diacritics to '#', '=' or nothing.
_ipa_to_lazy_ipa = [
    (re.compile(ipa, re.IGNORECASE), lazy)
    for ipa, lazy in (
        ('t͡ɕ', 'ʧ'),
        ('d͡ʑ', 'ʥ'),
        ('ɲ', 'n^'),
        ('ɕ', 'ʃ'),
        ('ʷ', 'w'),
        ('ɭ', 'l`'),
        ('ʎ', 'ɾ'),
        ('ɣ', 'ŋ'),
        ('ɰ', 'ɯ'),
        ('ʝ', 'j'),
        ('ʌ', 'ə'),
        ('ɡ', 'g'),
        ('\u031a', '#'),
        ('\u0348', '='),
        ('\u031e', ''),
        ('\u0320', ''),
        ('\u0339', ''),
    )
]
87
-
88
-
89
def latin_to_hangul(text):
    '''Replace every Latin letter in ``text`` with its Hangul spelling.

    The (pattern, replacement) pairs of ``_latin_to_hangul`` are applied one
    after another, in table order, and the rewritten string is returned.
    '''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
93
-
94
-
95
def hangul_number(num, sino=True):
    '''Spell out a string of decimal digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
        num: The number as a string of digits; thousands separators (',')
            are ignored.
        sino: When true, use Sino-Korean numerals throughout (일, 이, 삼 ...).
            When false, the ones and tens places use native Korean forms
            (한, 두 ... / 열, 스물 ...); hundreds and above are always
            Sino-Korean.

    Returns:
        The Hangul reading. '0' reads as '영', and '20' with ``sino=False``
        reads as the modifier form '스무'. Numbers too long for the unit
        table (more than 24 digits) are read digit by digit.
    '''
    num = num.replace(',', '')

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit inside a four-digit group, indexed by (power of ten) % 4.
    small_units = ('', '십', '백', '천')
    # Unit closing a four-digit group, indexed by (power of ten) // 4.
    large_units = ('', '만', '억', '조', '경', '해')

    if len(num) > 4 * len(large_units):
        # No unit name available for the leading digits: fall back to
        # reading the digits one by one instead of failing.
        return ''.join('영' if d == '0' else digit2name.get(d, '') for d in num)

    spelledout = []
    for index, digit in enumerate(num):
        power = len(num) - index - 1
        place, group = power % 4, power // 4

        if digit == '0':
            # A zero is silent, but the unit closing its group (만, 억, ...)
            # is still spoken when one of the three higher digits of the
            # same group was spoken.
            if place == 0 and group and ''.join(spelledout[-3:]):
                spelledout.append(large_units[group])
            else:
                spelledout.append('')
            continue

        if not sino and power == 0:
            name = digit2mod.get(digit, '')
        elif not sino and power == 1:
            name = digit2dec.get(digit, '')
        else:
            name = digit2name.get(digit, '') + small_units[place]
            if digit == '1' and place:
                # 십 / 백 / 천 are said without a leading 일, in every group.
                name = small_units[place]

        if place == 0 and group:
            # 10000 alone is '만', but a 1 in the 만 place that follows
            # higher digits keeps its 일 (110000 -> 십일만).
            if digit == '1' and group == 1 and not any(spelledout):
                name = ''
            name += large_units[group]

        spelledout.append(name)
    return ''.join(spelledout)
172
-
173
-
174
def number_to_hangul(text):
    '''Replace the digits in ``text`` with their Hangul reading.

    Reference https://github.com/Kyubyong/g2pK

    A run of digits (commas allowed) that is directly followed by Hangul is
    read as a number: with native Korean numerals when the following word
    starts with one of the classifiers in ``_korean_classifiers``, and with
    Sino-Korean numerals otherwise. Any digit left over afterwards is read
    on its own, digit by digit.

    Args:
        text: The input string.

    Returns:
        ``text`` with every digit spelled out in Hangul.
    '''
    # Compare against whole classifier words; a substring test on the
    # space-separated string would also accept fragments such as '루' or '리'.
    classifiers = set(_korean_classifiers.split())

    def spell(match):
        num, classifier = match.group(1), match.group(2)
        native = classifier[:2] in classifiers or classifier[:1] in classifiers
        return hangul_number(num, sino=not native) + classifier

    # Rewriting each match in place keeps the result independent of the
    # order in which tokens are visited ('1개' can no longer be replaced
    # inside '11개'). The range covers all precomposed syllables up to U+D7A3.
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud7a3]+)', spell, text)

    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
190
-
191
-
192
def korean_to_lazy_ipa(text):
    '''Convert Korean text to the simplified "lazy IPA" notation.

    Latin letters and digits are first spelled out in Hangul. Each run of
    Hangul syllables is then passed to ``ko_pron.romanise(..., 'ipa')``, and
    the resulting string is rewritten with the ``_ipa_to_lazy_ipa`` table.
    '''
    def first_reading(match):
        # Keep only the part before the first '] ~ [' separator
        # (presumably ko_pron joins alternative readings that way -- confirm).
        return ko_pron.romanise(match.group(0), 'ipa').split('] ~ [')[0]

    hangul = number_to_hangul(latin_to_hangul(text))
    ipa = re.sub('[\uac00-\ud7af]+', first_reading, hangul)
    for pattern, lazy in _ipa_to_lazy_ipa:
        ipa = pattern.sub(lazy, ipa)
    return ipa
199
-
200
-
201
def korean_to_ipa(text):
    '''Convert Korean text to IPA.

    Runs ``korean_to_lazy_ipa`` and then expands the single-character
    affricate symbols 'ʧ' and 'ʥ' into 'tʃ' and 'dʑ'.
    '''
    ipa = korean_to_lazy_ipa(text)
    for shorthand, expanded in (('ʧ', 'tʃ'), ('ʥ', 'dʑ')):
        ipa = ipa.replace(shorthand, expanded)
    return ipa