Dantra1 committed on
Commit
1314848
1 Parent(s): c9ac7aa

Upload 4 files

Browse files
Files changed (4) hide show
  1. text/LICENSE.txt +19 -0
  2. text/__init__.py +57 -0
  3. text/cleaners.py +475 -0
  4. text/symbols.py +39 -0
text/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2017 Keith Ito
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+ from text import cleaners
3
+ from text.symbols import symbols
4
+
5
+
6
# Lookup tables between symbols and their integer IDs; a symbol's ID is its
# position in `symbols`:
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {sym: idx for idx, sym in _id_to_symbol.items()}
9
+
10
+
11
def text_to_sequence(text, symbols, cleaner_names):
    '''Cleans a string of text and converts it to a sequence of symbol IDs.

    Characters of the cleaned text that do not appear in `symbols` are
    silently dropped from the sequence.

    Args:
      text: string to convert to a sequence
      symbols: ordered collection of symbols; a symbol's ID is its index
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      A `(sequence, clean_text)` tuple: the list of integer IDs and the
      cleaned text the IDs were derived from
    '''
    # The mapping is built from the caller-supplied symbols, not from the
    # module-level table, so a distinct local name is used.
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    clean_text = _clean_text(text, cleaner_names)
    sequence = [symbol_to_id[ch] for ch in clean_text if ch in symbol_to_id]
    return sequence, clean_text
29
+
30
+
31
def cleaned_text_to_sequence(cleaned_text):
    '''Maps already-cleaned text to symbol IDs using the module-level table.

    Characters without an entry in `_symbol_to_id` are skipped.

    Args:
      cleaned_text: string to convert to a sequence

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    sequence = []
    for ch in cleaned_text:
        if ch in _symbol_to_id:
            sequence.append(_symbol_to_id[ch])
    return sequence
40
+
41
+
42
def sequence_to_text(sequence):
    '''Joins the symbols looked up for each ID in `sequence` into one string.'''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
49
+
50
+
51
def _clean_text(text, cleaner_names):
    '''Runs `text` through each named cleaner function, in order.

    Args:
      text: string to clean
      cleaner_names: names of functions defined in the `cleaners` module

    Returns:
      The text after every cleaner has been applied

    Raises:
      Exception: if a name does not resolve to an attribute of `cleaners`
    '''
    for name in cleaner_names:
        # Default to None so an unknown name reaches the explicit error below
        # instead of escaping from getattr as an AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
text/cleaners.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Cleaners are transformations that run over the input text at both training and eval time.
5
+
6
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
+ 1. "english_cleaners" for English text
9
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
+ the symbols in symbols.py to match your data).
13
+ '''
14
+
15
+ import re
16
+ from unidecode import unidecode
17
+ import pyopenjtalk
18
+ from jamo import h2j, j2hcj
19
+ from pypinyin import lazy_pinyin, BOPOMOFO
20
+ import jieba, cn2an
21
+
22
+
23
# Space-separated Korean classifiers (counter words) that are preceded by pure
# Korean numerals. number_to_hangul tests membership with `in` on this string,
# i.e. as a substring check.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'

# Regular expression matching one or more whitespace characters:
_whitespace_re = re.compile(r'\s+')

# Regular expression matching a single Japanese character (ASCII letters and
# digits included) that is not a punctuation mark:
_japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')

# Regular expression matching a single non-Japanese character or punctuation
# mark (the exact complement of the class above):
_japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
34
+
35
# (compiled pattern, expansion) pairs; each pattern matches the abbreviation
# at a word boundary followed by a literal '.', case-insensitively:
_abbreviations = [
    (re.compile(r'\b%s\.' % abbr, re.IGNORECASE), expansion)
    for abbr, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
56
+
57
# (compiled pattern, replacement) pairs that split a compound jamo into its
# component jamo:
_hangul_divided = [
    (re.compile(compound), parts)
    for compound, parts in (
        ('ㄳ', 'ㄱㅅ'),
        ('ㄵ', 'ㄴㅈ'),
        ('ㄶ', 'ㄴㅎ'),
        ('ㄺ', 'ㄹㄱ'),
        ('ㄻ', 'ㄹㅁ'),
        ('ㄼ', 'ㄹㅂ'),
        ('ㄽ', 'ㄹㅅ'),
        ('ㄾ', 'ㄹㅌ'),
        ('ㄿ', 'ㄹㅍ'),
        ('ㅀ', 'ㄹㅎ'),
        ('ㅄ', 'ㅂㅅ'),
        ('ㅘ', 'ㅗㅏ'),
        ('ㅙ', 'ㅗㅐ'),
        ('ㅚ', 'ㅗㅣ'),
        ('ㅝ', 'ㅜㅓ'),
        ('ㅞ', 'ㅜㅔ'),
        ('ㅟ', 'ㅜㅣ'),
        ('ㅢ', 'ㅡㅣ'),
        ('ㅑ', 'ㅣㅏ'),
        ('ㅒ', 'ㅣㅐ'),
        ('ㅕ', 'ㅣㅓ'),
        ('ㅖ', 'ㅣㅔ'),
        ('ㅛ', 'ㅣㅗ'),
        ('ㅠ', 'ㅣㅜ'),
    )
]
84
+
85
# (compiled pattern, hangul) pairs spelling out each Latin letter,
# matched case-insensitively:
_latin_to_hangul = [
    (re.compile(letter, re.IGNORECASE), hangul)
    for letter, hangul in (
        ('a', '에이'),
        ('b', '비'),
        ('c', '시'),
        ('d', '디'),
        ('e', '이'),
        ('f', '에프'),
        ('g', '지'),
        ('h', '에이치'),
        ('i', '아이'),
        ('j', '제이'),
        ('k', '케이'),
        ('l', '엘'),
        ('m', '엠'),
        ('n', '엔'),
        ('o', '오'),
        ('p', '피'),
        ('q', '큐'),
        ('r', '아르'),
        ('s', '에스'),
        ('t', '티'),
        ('u', '유'),
        ('v', '브이'),
        ('w', '더블유'),
        ('x', '엑스'),
        ('y', '와이'),
        ('z', '제트'),
    )
]
114
+
115
# (compiled pattern, bopomofo) pairs spelling out each Latin letter,
# matched case-insensitively:
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), bopomofo)
    for letter, bopomofo in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]
144
+
145
+
146
# List of (bopomofo, romaji) pairs, applied in order by bopomofo_to_romaji.
# Order matters: multi-symbol keys (e.g. 'ㄅㄛ', 'ㄧㄢ') must precede the
# single symbols they contain.
# Keys are passed through re.escape so that punctuation is matched literally;
# a bare '?' is not a valid regular expression and would raise re.error when
# this module is imported. The punctuation keys are the full-width forms
# (written as escapes to survive re-encoding) mapped to their ASCII equivalents.
_bopomofo_to_romaji = [(re.compile(re.escape(x[0]), re.IGNORECASE), x[1]) for x in [
    ('ㄅㄛ', 'p⁼wo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p⁼'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't⁼'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k⁼'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'ʧ⁼'),
    ('ㄑ', 'ʧʰ'),
    ('ㄒ', 'ʃ'),
    ('ㄓ', 'ʦ`⁼'),
    ('ㄔ', 'ʦ`ʰ'),
    ('ㄕ', 's`'),
    ('ㄖ', 'ɹ`'),
    ('ㄗ', 'ʦ⁼'),
    ('ㄘ', 'ʦʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ə'),
    ('ㄝ', 'e'),
    ('ㄞ', 'ai'),
    ('ㄟ', 'ei'),
    ('ㄠ', 'au'),
    ('ㄡ', 'ou'),
    ('ㄧㄢ', 'yeNN'),
    ('ㄢ', 'aNN'),
    ('ㄧㄣ', 'iNN'),
    ('ㄣ', 'əNN'),
    ('ㄤ', 'aNg'),
    ('ㄧㄥ', 'iNg'),
    ('ㄨㄥ', 'uNg'),
    ('ㄩㄥ', 'yuNg'),
    ('ㄥ', 'əNg'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'ɥ'),
    ('ˉ', '→'),
    ('ˊ', '↑'),
    ('ˇ', '↓↑'),
    ('ˋ', '↓'),
    ('˙', ''),
    ('\uff0c', ','),  # full-width comma
    ('。', '.'),
    ('\uff01', '!'),  # full-width exclamation mark
    ('\uff1f', '?'),  # full-width question mark
    ('—', '-')
]]
205
+
206
+
207
def expand_abbreviations(text):
    '''Replaces every abbreviation listed in `_abbreviations` with its expansion.'''
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
211
+
212
+
213
def lowercase(text):
    '''Returns `text` converted to lower case.'''
    lowered = text.lower()
    return lowered
215
+
216
+
217
def collapse_whitespace(text):
    '''Replaces each run of whitespace in `text` with a single space.'''
    return _whitespace_re.sub(' ', text)
219
+
220
+
221
def convert_to_ascii(text):
    '''Transliterates `text` with the Unidecode library and returns the result.'''
    ascii_text = unidecode(text)
    return ascii_text
223
+
224
+
225
def japanese_to_romaji_with_accent(text):
    '''Converts Japanese text to romanized phonemes annotated with accent marks.

    The input is split on every character matched by `_japanese_marks`. Each
    chunk that begins with a `_japanese_characters` match is converted via
    pyopenjtalk full-context labels; the separator characters are re-inserted
    after their chunk, transliterated with unidecode and with spaces removed.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    Args:
      text: string to convert

    Returns:
      The converted string. '↑' and '↓' follow phonemes where a rise or fall
      is detected, and ' ' is inserted at accent phrase boundaries and between
      converted chunks.
    '''
    # sentences[i] is followed in the original text by marks[i]
    sentences = re.split(_japanese_marks, text)
    marks = re.findall(_japanese_marks, text)
    text = ''
    for i, sentence in enumerate(sentences):
        # Empty chunks (e.g. between adjacent marks) do not match and are skipped
        if re.match(_japanese_characters, sentence):
            if text!='':
                text+=' '
            labels = pyopenjtalk.extract_fullcontext(sentence)
            for n, label in enumerate(labels):
                # The current phoneme is the field between '-' and '+'
                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
                if phoneme not in ['sil','pau']:
                    text += phoneme.replace('ch','ʧ').replace('sh','ʃ').replace('cl','Q')
                else:
                    # Silence/pause labels emit nothing and carry no accent info
                    continue
                # NOTE(review): field meanings below follow the referenced recipe
                # (F: mora count of the accent phrase, A: accent-related positions)
                # — confirm against the Open JTalk label format.
                n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
                a3 = int(re.search(r"\+(\d+)/", label).group(1))
                # Looks one label ahead; presumably the label list always ends with
                # a 'sil' entry (skipped above) so n + 1 stays in range — confirm.
                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil','pau']:
                    a2_next=-1
                else:
                    a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
                # Accent phrase boundary
                if a3 == 1 and a2_next == 1:
                    text += ' '
                # Falling
                elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
                    text += '↓'
                # Rising
                elif a2 == 1 and a2_next == 2:
                    text += '↑'
        # Re-attach the separator that followed this chunk (the last chunk has none)
        if i<len(marks):
            text += unidecode(marks[i]).replace(' ','')
    return text
261
+
262
+
263
def latin_to_hangul(text):
    '''Applies every (pattern, hangul) substitution in `_latin_to_hangul` to `text`.'''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
267
+
268
+
269
def divide_hangul(text):
    '''Applies every (pattern, replacement) substitution in `_hangul_divided` to `text`.'''
    for pattern, parts in _hangul_divided:
        text = pattern.sub(parts, text)
    return text
273
+
274
+
275
def hangul_number(num, sino=True):
    '''Spells out a string of decimal digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
      num: string of digits; commas are removed before processing
      sino: when True the ones and tens places use the `names` table
        (일, 이, ...); when False they use the `modifiers` / `decimals`
        tables (한, 두, ... / 열, 스물, ...). Places from the hundreds up
        always use the `names` table.

    Returns:
      The spelled-out number as a single string
    '''
    num = re.sub(',', '', num)

    # Special cases handled before the per-digit loop
    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    names = '일이삼사오육칠팔구'
    digit2name = {d: n for d, n in zip(digits, names)}

    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}

    spelledout = []
    for i, digit in enumerate(num):
        # Re-bind i to the digit's place: 0 for the last digit, 1 for the tens, ...
        i = len(num) - i - 1
        if sino:
            if i == 0:
                name = digit2name.get(digit, '')
            elif i == 1:
                name = digit2name.get(digit, '') + '십'
                # A leading 일 is dropped before the unit word
                name = name.replace('일십', '십')
        else:
            if i == 0:
                name = digit2mod.get(digit, '')
            elif i == 1:
                name = digit2dec.get(digit, '')
        if digit == '0':
            if i % 4 == 0:
                # At a four-digit group boundary: emit nothing only if the
                # previous (up to) three places were all empty; otherwise fall
                # through so the group word (만/억/조) is still appended below.
                last_three = spelledout[-min(3, len(spelledout)):]
                if ''.join(last_three) == '':
                    spelledout.append('')
                    continue
            else:
                spelledout.append('')
                continue
        if i == 2:
            name = digit2name.get(digit, '') + '백'
            name = name.replace('일백', '백')
        elif i == 3:
            name = digit2name.get(digit, '') + '천'
            name = name.replace('일천', '천')
        elif i == 4:
            name = digit2name.get(digit, '') + '만'
            name = name.replace('일만', '만')
        elif i == 5:
            name = digit2name.get(digit, '') + '십'
            name = name.replace('일십', '십')
        elif i == 6:
            name = digit2name.get(digit, '') + '백'
            name = name.replace('일백', '백')
        elif i == 7:
            name = digit2name.get(digit, '') + '천'
            name = name.replace('일천', '천')
        elif i == 8:
            name = digit2name.get(digit, '') + '억'
        elif i == 9:
            name = digit2name.get(digit, '') + '십'
        elif i == 10:
            name = digit2name.get(digit, '') + '백'
        elif i == 11:
            name = digit2name.get(digit, '') + '천'
        elif i == 12:
            name = digit2name.get(digit, '') + '조'
        elif i == 13:
            name = digit2name.get(digit, '') + '십'
        elif i == 14:
            name = digit2name.get(digit, '') + '백'
        elif i == 15:
            name = digit2name.get(digit, '') + '천'
        # NOTE(review): no branch assigns `name` for places >= 16, so inputs longer
        # than 16 digits reuse a stale value or hit an unbound local — confirm
        # whether such inputs can occur.
        spelledout.append(name)
    return ''.join(elem for elem in spelledout)
352
+
353
+
354
def number_to_hangul(text):
    '''Replaces the digits in `text` with their Hangul spelling.

    A number immediately followed by Hangul is spelled out as a whole by
    hangul_number, with sino=False when the first one or two characters of
    the following word occur in `_korean_classifiers` and sino=True
    otherwise. Any digit left afterwards is replaced individually.

    Reference https://github.com/Kyubyong/g2pK
    '''
    matches = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
    for num, classifier in matches:
        takes_native = (classifier[:2] in _korean_classifiers
                        or classifier[0] in _korean_classifiers)
        spelledout = hangul_number(num, sino=not takes_native)
        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
    # digit by digit for remaining digits
    for digit, name in zip('0123456789', '영일이삼사오육칠팔구'):
        text = text.replace(digit, name)
    return text
370
+
371
+
372
def number_to_chinese(text):
    '''Replaces each number found in `text` with the output of cn2an.an2cn.

    Numbers are located up front; each one then replaces its first remaining
    occurrence in the progressively rewritten text.
    '''
    for number in re.findall(r'\d+(?:\.?\d+)?', text):
        converted = cn2an.an2cn(number)
        text = text.replace(number, converted, 1)
    return text
377
+
378
+
379
def chinese_to_bopomofo(text):
    '''Converts the Chinese words in `text` to bopomofo.

    '、', ';' and ':' are first replaced with ','. The text is segmented with
    jieba; words containing no character in U+4E00-U+9FFF are copied through
    unchanged, while the others are converted with pypinyin, get 'ˉ' appended
    to any syllable that ends in a bopomofo letter (i.e. carries no tone
    mark), and are separated from preceding output by a space.
    '''
    for mark in ('、', ';', ':'):
        text = text.replace(mark, ',')
    pieces = jieba.lcut(text, cut_all=False)
    text = ''
    for word in pieces:
        bopomofos = lazy_pinyin(word, BOPOMOFO)
        if re.search('[\u4e00-\u9fff]', word) is None:
            text += word
            continue
        for idx, syllable in enumerate(bopomofos):
            if re.match('[\u3105-\u3129]', syllable[-1]):
                bopomofos[idx] = syllable + 'ˉ'
        if text != '':
            text += ' '
        text += ''.join(bopomofos)
    return text
395
+
396
+
397
def latin_to_bopomofo(text):
    '''Applies every (pattern, bopomofo) substitution in `_latin_to_bopomofo` to `text`.'''
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
401
+
402
+
403
def bopomofo_to_romaji(text):
    '''Applies every (pattern, romaji) substitution in `_bopomofo_to_romaji`, in order.'''
    for pattern, romaji in _bopomofo_to_romaji:
        text = pattern.sub(romaji, text)
    return text
407
+
408
+
409
def basic_cleaners(text):
    '''Basic pipeline: lowercase, then collapse whitespace; no transliteration.'''
    return collapse_whitespace(lowercase(text))
414
+
415
+
416
def transliteration_cleaners(text):
    '''Pipeline for non-English text: transliterate to ASCII, lowercase, collapse whitespace.'''
    for step in (convert_to_ascii, lowercase, collapse_whitespace):
        text = step(text)
    return text
422
+
423
+
424
def japanese_cleaners(text):
    '''Pipeline for Japanese text.

    Converts the text with japanese_to_romaji_with_accent and appends '.'
    when the result ends in an ASCII letter.

    Args:
      text: string to clean

    Returns:
      The cleaned string; an empty conversion result is returned as is
    '''
    text = japanese_to_romaji_with_accent(text)
    # Check for emptiness first: text[-1] raises IndexError on ''
    if text and re.match('[A-Za-z]', text[-1]):
        text += '.'
    return text
429
+
430
+
431
def japanese_cleaners2(text):
    '''Runs japanese_cleaners, then rewrites 'ts' as 'ʦ' and '...' as '…'.'''
    cleaned = japanese_cleaners(text)
    cleaned = cleaned.replace('ts', 'ʦ')
    return cleaned.replace('...', '…')
433
+
434
+
435
def korean_cleaners(text):
    '''Pipeline for Korean text.

    Spells out Latin letters and numbers in Hangul, decomposes the result
    with j2hcj(h2j(...)), splits compound jamo, and appends '.' when the
    result ends in a character in U+3131-U+3163.

    Args:
      text: string to clean

    Returns:
      The cleaned string; an empty result is returned as is
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    text = j2hcj(h2j(text))
    text = divide_hangul(text)
    # Check for emptiness first: text[-1] raises IndexError on ''
    if text and re.match('[\u3131-\u3163]', text[-1]):
        text += '.'
    return text
444
+
445
+
446
def chinese_cleaners(text):
    '''Pipeline for Chinese text.

    Converts numbers to Chinese numerals, Chinese words to bopomofo and
    Latin letters to bopomofo, then appends '。' when the result ends in a
    tone mark.

    Args:
      text: string to clean

    Returns:
      The cleaned string; an empty result is returned as is
    '''
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    # Check for emptiness first: text[-1] raises IndexError on ''
    if text and re.match('[ˉˊˇˋ˙]', text[-1]):
        text += '。'
    return text
454
+
455
+
456
def zh_ja_mixture_cleaners(text):
    '''Pipeline for text mixing Chinese and Japanese segments.

    Segments are delimited as `[ZH]...[ZH]` and `[JA]...[JA]`. Each segment
    (tags included) is replaced in place by its romanized form followed by a
    space; text outside the tags is left untouched.

    Args:
      text: string containing tagged segments

    Returns:
      The converted string, with '.' appended when it ends in one of the
      characters in the final character class below
    '''
    # Non-greedy matches, so each opening tag pairs with the nearest closing tag
    chinese_texts=re.findall(r'\[ZH\].*?\[ZH\]',text)
    japanese_texts=re.findall(r'\[JA\].*?\[JA\]',text)
    for chinese_text in chinese_texts:
        # [4:-4] strips the four-character tag from both ends
        cleaned_text=number_to_chinese(chinese_text[4:-4])
        cleaned_text=chinese_to_bopomofo(cleaned_text)
        cleaned_text=latin_to_bopomofo(cleaned_text)
        cleaned_text=bopomofo_to_romaji(cleaned_text)
        # 'i'/'u' directly before another vowel become the glides 'y'/'w'
        cleaned_text=re.sub('i[aoe]',lambda x:'y'+x.group(0)[1:],cleaned_text)
        cleaned_text=re.sub('u[aoəe]',lambda x:'w'+x.group(0)[1:],cleaned_text)
        # A backtick-marked consonant followed directly by tone arrows gets 'ɹ`'
        # inserted before the arrows; 'ɻ' is then rewritten to 'ɹ`'
        cleaned_text=re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑]+)',lambda x:x.group(1)+'ɹ`'+x.group(2),cleaned_text).replace('ɻ','ɹ`')
        # Likewise 'ʦ'/'s' without a backtick get a plain 'ɹ' before the arrows
        cleaned_text=re.sub('([ʦs][⁼ʰ]?)([→↓↑]+)',lambda x:x.group(1)+'ɹ'+x.group(2),cleaned_text)
        text = text.replace(chinese_text,cleaned_text+' ',1)
    for japanese_text in japanese_texts:
        cleaned_text=japanese_to_romaji_with_accent(japanese_text[4:-4]).replace('ts','ʦ').replace('u','ɯ').replace('...','…')
        text = text.replace(japanese_text,cleaned_text+' ',1)
    # Drops the final character unconditionally; presumably meant to remove the
    # space appended after the last segment — confirm for inputs that do not end
    # with a tagged segment.
    text=text[:-1]
    if len(text) and re.match('[A-Za-zɯɹəɥ→↓↑]',text[-1]):
        text += '.'
    return text
text/symbols.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Defines the set of symbols used in text input to the model.
3
+ '''
4
+
5
+ '''# japanese_cleaners
6
+ _pad = '_'
7
+ _punctuation = ',.!?-'
8
+ _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
9
+ '''
10
+
11
+ '''# japanese_cleaners2
12
+ _pad = '_'
13
+ _punctuation = ',.!?-~…'
14
+ _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
15
+ '''
16
+
17
+ '''# korean_cleaners
18
+ _pad = '_'
19
+ _punctuation = ',.!?…~'
20
+ _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
21
+ '''
22
+
23
+ '''# chinese_cleaners
24
+ _pad = '_'
25
+ _punctuation = ',。!?—…'
26
+ _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
27
+ '''
28
+
29
# zh_ja_mixture_cleaners
_pad = '_'
_punctuation = ',.!?-~…'
_letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '


# Export all symbols (padding first, then punctuation, then letters):
symbols = [_pad, *_punctuation, *_letters]

# Special symbol ids
SPACE_ID = symbols.index(' ')