Mahiruoshi committed on
Commit
6c035be
1 Parent(s): f0ca36a

Upload 25 files

Browse files
text/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.89 kB). View file
 
text/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/__init__.cpython-38.pyc and b/text/__pycache__/__init__.cpython-38.pyc differ
 
text/__pycache__/cleaners.cpython-311.pyc ADDED
Binary file (13.1 kB). View file
 
text/__pycache__/cleaners.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/cleaners.cpython-38.pyc and b/text/__pycache__/cleaners.cpython-38.pyc differ
 
text/__pycache__/japanese.cpython-311.pyc ADDED
Binary file (8.36 kB). View file
 
text/__pycache__/japanese.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/japanese.cpython-38.pyc and b/text/__pycache__/japanese.cpython-38.pyc differ
 
text/__pycache__/mandarin.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/mandarin.cpython-38.pyc and b/text/__pycache__/mandarin.cpython-38.pyc differ
 
text/__pycache__/symbols.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/symbols.cpython-38.pyc and b/text/__pycache__/symbols.cpython-38.pyc differ
 
text/cantonese.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import cn2an
3
+ import opencc
4
+
5
+
6
+ converter = opencc.OpenCC('jyutjyu')
7
+
8
+ # List of (Latin alphabet, ipa) pairs:
9
+ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
10
+ ('A', 'ei˥'),
11
+ ('B', 'biː˥'),
12
+ ('C', 'siː˥'),
13
+ ('D', 'tiː˥'),
14
+ ('E', 'iː˥'),
15
+ ('F', 'e˥fuː˨˩'),
16
+ ('G', 'tsiː˥'),
17
+ ('H', 'ɪk̚˥tsʰyː˨˩'),
18
+ ('I', 'ɐi˥'),
19
+ ('J', 'tsei˥'),
20
+ ('K', 'kʰei˥'),
21
+ ('L', 'e˥llou˨˩'),
22
+ ('M', 'ɛːm˥'),
23
+ ('N', 'ɛːn˥'),
24
+ ('O', 'ou˥'),
25
+ ('P', 'pʰiː˥'),
26
+ ('Q', 'kʰiːu˥'),
27
+ ('R', 'aː˥lou˨˩'),
28
+ ('S', 'ɛː˥siː˨˩'),
29
+ ('T', 'tʰiː˥'),
30
+ ('U', 'juː˥'),
31
+ ('V', 'wiː˥'),
32
+ ('W', 'tʊk̚˥piː˥juː˥'),
33
+ ('X', 'ɪk̚˥siː˨˩'),
34
+ ('Y', 'waːi˥'),
35
+ ('Z', 'iː˨sɛːt̚˥')
36
+ ]]
37
+
38
+
39
+ def number_to_cantonese(text):
40
+ return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
41
+
42
+
43
+ def latin_to_ipa(text):
44
+ for regex, replacement in _latin_to_ipa:
45
+ text = re.sub(regex, replacement, text)
46
+ return text
47
+
48
+
49
+ def cantonese_to_ipa(text):
50
+ text = number_to_cantonese(text.upper())
51
+ text = converter.convert(text).replace('-','').replace('$',' ')
52
+ text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
53
+ text = re.sub(r'[、;:]', ',', text)
54
+ text = re.sub(r'\s*,\s*', ', ', text)
55
+ text = re.sub(r'\s*。\s*', '. ', text)
56
+ text = re.sub(r'\s*?\s*', '? ', text)
57
+ text = re.sub(r'\s*!\s*', '! ', text)
58
+ text = re.sub(r'\s*$', '', text)
59
+ return text
text/cleaners.py CHANGED
@@ -1,19 +1,18 @@
1
  import re
2
- from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
3
  from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
 
4
  from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
5
- # from text.sanskrit import devanagari_to_ipa
6
- # from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
7
- # from text.thai import num_to_thai, latin_to_thai
8
- # from text.shanghainese import shanghainese_to_ipa
9
- # from text.cantonese import cantonese_to_ipa
10
- # from text.ngu_dialect import ngu_dialect_to_ipa
11
 
12
 
13
  def japanese_cleaners(text):
14
  text = japanese_to_romaji_with_accent(text)
15
- if re.match('[A-Za-z]', text[-1]):
16
- text += '.'
17
  return text
18
 
19
 
@@ -26,8 +25,7 @@ def korean_cleaners(text):
26
  text = latin_to_hangul(text)
27
  text = number_to_hangul(text)
28
  text = divide_hangul(text)
29
- if re.match('[\u3131-\u3163]', text[-1]):
30
- text += '.'
31
  return text
32
 
33
 
@@ -36,110 +34,67 @@ def chinese_cleaners(text):
36
  text = number_to_chinese(text)
37
  text = chinese_to_bopomofo(text)
38
  text = latin_to_bopomofo(text)
39
- if re.match('[ˉˊˇˋ˙]', text[-1]):
40
- text += '。'
41
  return text
42
 
43
 
44
  def zh_ja_mixture_cleaners(text):
45
- chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
46
- japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
47
- for chinese_text in chinese_texts:
48
- cleaned_text = chinese_to_romaji(chinese_text[4:-4])
49
- text = text.replace(chinese_text, cleaned_text+' ', 1)
50
- for japanese_text in japanese_texts:
51
- cleaned_text = japanese_to_romaji_with_accent(
52
- japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
53
- text = text.replace(japanese_text, cleaned_text+' ', 1)
54
- text = text[:-1]
55
- if re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
56
- text += '.'
57
  return text
58
 
59
 
60
  def sanskrit_cleaners(text):
61
  text = text.replace('॥', '।').replace('ॐ', 'ओम्')
62
- if text[-1] != '।':
63
- text += ' ।'
64
  return text
65
 
66
 
67
  def cjks_cleaners(text):
68
- chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
69
- japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
70
- korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
71
- sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
72
- english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
73
- for chinese_text in chinese_texts:
74
- cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
75
- text = text.replace(chinese_text, cleaned_text+' ', 1)
76
- for japanese_text in japanese_texts:
77
- cleaned_text = japanese_to_ipa(japanese_text[4:-4])
78
- text = text.replace(japanese_text, cleaned_text+' ', 1)
79
- for korean_text in korean_texts:
80
- cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
81
- text = text.replace(korean_text, cleaned_text+' ', 1)
82
- for sanskrit_text in sanskrit_texts:
83
- cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
84
- text = text.replace(sanskrit_text, cleaned_text+' ', 1)
85
- for english_text in english_texts:
86
- cleaned_text = english_to_lazy_ipa(english_text[4:-4])
87
- text = text.replace(english_text, cleaned_text+' ', 1)
88
- text = text[:-1]
89
- if re.match(r'[^\.,!\?\-…~]', text[-1]):
90
- text += '.'
91
  return text
92
 
93
 
94
  def cjke_cleaners(text):
95
- chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
96
- japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
97
- korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
98
- english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
99
- for chinese_text in chinese_texts:
100
- cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
101
- cleaned_text = cleaned_text.replace(
102
- 'ʧ', '').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
103
- text = text.replace(chinese_text, cleaned_text+' ', 1)
104
- for japanese_text in japanese_texts:
105
- cleaned_text = japanese_to_ipa(japanese_text[4:-4])
106
- cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
107
- 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
108
- text = text.replace(japanese_text, cleaned_text+' ', 1)
109
- for korean_text in korean_texts:
110
- cleaned_text = korean_to_ipa(korean_text[4:-4])
111
- text = text.replace(korean_text, cleaned_text+' ', 1)
112
- for english_text in english_texts:
113
- cleaned_text = english_to_ipa2(english_text[4:-4])
114
- cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
115
- 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
116
- text = text.replace(english_text, cleaned_text+' ', 1)
117
- text = text[:-1]
118
- if re.match(r'[^\.,!\?\-…~]', text[-1]):
119
- text += '.'
120
  return text
121
 
122
 
123
  def cjke_cleaners2(text):
124
- chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
125
- japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
126
- korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
127
- english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
128
- for chinese_text in chinese_texts:
129
- cleaned_text = chinese_to_ipa(chinese_text[4:-4])
130
- text = text.replace(chinese_text, cleaned_text+' ', 1)
131
- for japanese_text in japanese_texts:
132
- cleaned_text = japanese_to_ipa2(japanese_text[4:-4])
133
- text = text.replace(japanese_text, cleaned_text+' ', 1)
134
- for korean_text in korean_texts:
135
- cleaned_text = korean_to_ipa(korean_text[4:-4])
136
- text = text.replace(korean_text, cleaned_text+' ', 1)
137
- for english_text in english_texts:
138
- cleaned_text = english_to_ipa2(english_text[4:-4])
139
- text = text.replace(english_text, cleaned_text+' ', 1)
140
- text = text[:-1]
141
- if re.match(r'[^\.,!\?\-…~]', text[-1]):
142
- text += '.'
143
  return text
144
 
145
 
@@ -151,16 +106,13 @@ def thai_cleaners(text):
151
 
152
  def shanghainese_cleaners(text):
153
  text = shanghainese_to_ipa(text)
154
- if re.match(r'[^\.,!\?\-…~]', text[-1]):
155
- text += '.'
156
  return text
157
 
158
 
159
  def chinese_dialect_cleaners(text):
160
- text = re.sub(r'\[MD\](.*?)\[MD\]',
161
  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
162
- text = re.sub(r'\[TW\](.*?)\[TW\]',
163
- lambda x: chinese_to_ipa2(x.group(1), True)+' ', text)
164
  text = re.sub(r'\[JA\](.*?)\[JA\]',
165
  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
166
  text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
 
1
  import re
 
2
  from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
3
+ from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
4
  from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
5
+ from text.sanskrit import devanagari_to_ipa
6
+ from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
7
+ from text.thai import num_to_thai, latin_to_thai
8
+ from text.shanghainese import shanghainese_to_ipa
9
+ from text.cantonese import cantonese_to_ipa
10
+ from text.ngu_dialect import ngu_dialect_to_ipa
11
 
12
 
13
  def japanese_cleaners(text):
14
  text = japanese_to_romaji_with_accent(text)
15
+ text = re.sub(r'([A-Za-z])$', r'\1.', text)
 
16
  return text
17
 
18
 
 
25
  text = latin_to_hangul(text)
26
  text = number_to_hangul(text)
27
  text = divide_hangul(text)
28
+ text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
 
29
  return text
30
 
31
 
 
34
  text = number_to_chinese(text)
35
  text = chinese_to_bopomofo(text)
36
  text = latin_to_bopomofo(text)
37
+ text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
 
38
  return text
39
 
40
 
41
  def zh_ja_mixture_cleaners(text):
42
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]',
43
+ lambda x: chinese_to_romaji(x.group(1))+' ', text)
44
+ text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
45
+ x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
46
+ text = re.sub(r'\s+$', '', text)
47
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
 
 
 
 
 
48
  return text
49
 
50
 
51
  def sanskrit_cleaners(text):
52
  text = text.replace('॥', '।').replace('ॐ', 'ओम्')
53
+ text = re.sub(r'([^।])$', r'\1।', text)
 
54
  return text
55
 
56
 
57
  def cjks_cleaners(text):
58
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]',
59
+ lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
60
+ text = re.sub(r'\[JA\](.*?)\[JA\]',
61
+ lambda x: japanese_to_ipa(x.group(1))+' ', text)
62
+ text = re.sub(r'\[KO\](.*?)\[KO\]',
63
+ lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
64
+ text = re.sub(r'\[SA\](.*?)\[SA\]',
65
+ lambda x: devanagari_to_ipa(x.group(1))+' ', text)
66
+ text = re.sub(r'\[EN\](.*?)\[EN\]',
67
+ lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
68
+ text = re.sub(r'\s+$', '', text)
69
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
 
 
 
 
 
 
 
 
 
 
70
  return text
71
 
72
 
73
  def cjke_cleaners(text):
74
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
75
+ 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
76
+ text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
77
+ 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
78
+ text = re.sub(r'\[KO\](.*?)\[KO\]',
79
+ lambda x: korean_to_ipa(x.group(1))+' ', text)
80
+ text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
81
+ 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
82
+ text = re.sub(r'\s+$', '', text)
83
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  return text
85
 
86
 
87
  def cjke_cleaners2(text):
88
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]',
89
+ lambda x: chinese_to_ipa(x.group(1))+' ', text)
90
+ text = re.sub(r'\[JA\](.*?)\[JA\]',
91
+ lambda x: japanese_to_ipa2(x.group(1))+' ', text)
92
+ text = re.sub(r'\[KO\](.*?)\[KO\]',
93
+ lambda x: korean_to_ipa(x.group(1))+' ', text)
94
+ text = re.sub(r'\[EN\](.*?)\[EN\]',
95
+ lambda x: english_to_ipa2(x.group(1))+' ', text)
96
+ text = re.sub(r'\s+$', '', text)
97
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
 
 
 
 
 
 
 
 
98
  return text
99
 
100
 
 
106
 
107
  def shanghainese_cleaners(text):
108
  text = shanghainese_to_ipa(text)
109
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
110
  return text
111
 
112
 
113
  def chinese_dialect_cleaners(text):
114
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]',
115
  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
 
 
116
  text = re.sub(r'\[JA\](.*?)\[JA\]',
117
  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
118
  text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
text/korean.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from jamo import h2j, j2hcj
3
+ import ko_pron
4
+
5
+
6
+ # This is a list of Korean classifiers preceded by pure Korean numerals.
7
+ _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
8
+
9
+ # List of (hangul, hangul divided) pairs:
10
+ _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
11
+ ('ㄳ', 'ㄱㅅ'),
12
+ ('ㄵ', 'ㄴㅈ'),
13
+ ('ㄶ', 'ㄴㅎ'),
14
+ ('ㄺ', 'ㄹㄱ'),
15
+ ('ㄻ', 'ㄹㅁ'),
16
+ ('ㄼ', 'ㄹㅂ'),
17
+ ('ㄽ', 'ㄹㅅ'),
18
+ ('ㄾ', 'ㄹㅌ'),
19
+ ('ㄿ', 'ㄹㅍ'),
20
+ ('ㅀ', 'ㄹㅎ'),
21
+ ('ㅄ', 'ㅂㅅ'),
22
+ ('ㅘ', 'ㅗㅏ'),
23
+ ('ㅙ', 'ㅗㅐ'),
24
+ ('ㅚ', 'ㅗㅣ'),
25
+ ('ㅝ', 'ㅜㅓ'),
26
+ ('ㅞ', 'ㅜㅔ'),
27
+ ('ㅟ', 'ㅜㅣ'),
28
+ ('ㅢ', 'ㅡㅣ'),
29
+ ('ㅑ', 'ㅣㅏ'),
30
+ ('ㅒ', 'ㅣㅐ'),
31
+ ('ㅕ', 'ㅣㅓ'),
32
+ ('ㅖ', 'ㅣㅔ'),
33
+ ('ㅛ', 'ㅣㅗ'),
34
+ ('ㅠ', 'ㅣㅜ')
35
+ ]]
36
+
37
+ # List of (Latin alphabet, hangul) pairs:
38
+ _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
39
+ ('a', '에이'),
40
+ ('b', '비'),
41
+ ('c', '시'),
42
+ ('d', '디'),
43
+ ('e', '이'),
44
+ ('f', '에프'),
45
+ ('g', '지'),
46
+ ('h', '에이치'),
47
+ ('i', '아이'),
48
+ ('j', '제이'),
49
+ ('k', '케이'),
50
+ ('l', '엘'),
51
+ ('m', '엠'),
52
+ ('n', '엔'),
53
+ ('o', '오'),
54
+ ('p', '피'),
55
+ ('q', '큐'),
56
+ ('r', '아르'),
57
+ ('s', '에스'),
58
+ ('t', '티'),
59
+ ('u', '유'),
60
+ ('v', '브이'),
61
+ ('w', '더블유'),
62
+ ('x', '엑스'),
63
+ ('y', '와이'),
64
+ ('z', '제트')
65
+ ]]
66
+
67
+ # List of (ipa, lazy ipa) pairs:
68
+ _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
69
+ ('t͡ɕ','ʧ'),
70
+ ('d͡ʑ','ʥ'),
71
+ ('ɲ','n^'),
72
+ ('ɕ','ʃ'),
73
+ ('ʷ','w'),
74
+ ('ɭ','l`'),
75
+ ('ʎ','ɾ'),
76
+ ('ɣ','ŋ'),
77
+ ('ɰ','ɯ'),
78
+ ('ʝ','j'),
79
+ ('ʌ','ə'),
80
+ ('ɡ','g'),
81
+ ('\u031a','#'),
82
+ ('\u0348','='),
83
+ ('\u031e',''),
84
+ ('\u0320',''),
85
+ ('\u0339','')
86
+ ]]
87
+
88
+
89
+ def latin_to_hangul(text):
90
+ for regex, replacement in _latin_to_hangul:
91
+ text = re.sub(regex, replacement, text)
92
+ return text
93
+
94
+
95
+ def divide_hangul(text):
96
+ text = j2hcj(h2j(text))
97
+ for regex, replacement in _hangul_divided:
98
+ text = re.sub(regex, replacement, text)
99
+ return text
100
+
101
+
102
+ def hangul_number(num, sino=True):
103
+ '''Reference https://github.com/Kyubyong/g2pK'''
104
+ num = re.sub(',', '', num)
105
+
106
+ if num == '0':
107
+ return '영'
108
+ if not sino and num == '20':
109
+ return '스무'
110
+
111
+ digits = '123456789'
112
+ names = '일이삼사오육칠팔구'
113
+ digit2name = {d: n for d, n in zip(digits, names)}
114
+
115
+ modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
116
+ decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
117
+ digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
118
+ digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
119
+
120
+ spelledout = []
121
+ for i, digit in enumerate(num):
122
+ i = len(num) - i - 1
123
+ if sino:
124
+ if i == 0:
125
+ name = digit2name.get(digit, '')
126
+ elif i == 1:
127
+ name = digit2name.get(digit, '') + '십'
128
+ name = name.replace('일십', '십')
129
+ else:
130
+ if i == 0:
131
+ name = digit2mod.get(digit, '')
132
+ elif i == 1:
133
+ name = digit2dec.get(digit, '')
134
+ if digit == '0':
135
+ if i % 4 == 0:
136
+ last_three = spelledout[-min(3, len(spelledout)):]
137
+ if ''.join(last_three) == '':
138
+ spelledout.append('')
139
+ continue
140
+ else:
141
+ spelledout.append('')
142
+ continue
143
+ if i == 2:
144
+ name = digit2name.get(digit, '') + '백'
145
+ name = name.replace('일백', '백')
146
+ elif i == 3:
147
+ name = digit2name.get(digit, '') + '천'
148
+ name = name.replace('일천', '천')
149
+ elif i == 4:
150
+ name = digit2name.get(digit, '') + '만'
151
+ name = name.replace('일만', '만')
152
+ elif i == 5:
153
+ name = digit2name.get(digit, '') + '십'
154
+ name = name.replace('일십', '십')
155
+ elif i == 6:
156
+ name = digit2name.get(digit, '') + '백'
157
+ name = name.replace('일백', '백')
158
+ elif i == 7:
159
+ name = digit2name.get(digit, '') + '천'
160
+ name = name.replace('일천', '천')
161
+ elif i == 8:
162
+ name = digit2name.get(digit, '') + '억'
163
+ elif i == 9:
164
+ name = digit2name.get(digit, '') + '십'
165
+ elif i == 10:
166
+ name = digit2name.get(digit, '') + '백'
167
+ elif i == 11:
168
+ name = digit2name.get(digit, '') + '천'
169
+ elif i == 12:
170
+ name = digit2name.get(digit, '') + '조'
171
+ elif i == 13:
172
+ name = digit2name.get(digit, '') + '십'
173
+ elif i == 14:
174
+ name = digit2name.get(digit, '') + '백'
175
+ elif i == 15:
176
+ name = digit2name.get(digit, '') + '천'
177
+ spelledout.append(name)
178
+ return ''.join(elem for elem in spelledout)
179
+
180
+
181
+ def number_to_hangul(text):
182
+ '''Reference https://github.com/Kyubyong/g2pK'''
183
+ tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
184
+ for token in tokens:
185
+ num, classifier = token
186
+ if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
187
+ spelledout = hangul_number(num, sino=False)
188
+ else:
189
+ spelledout = hangul_number(num, sino=True)
190
+ text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
191
+ # digit by digit for remaining digits
192
+ digits = '0123456789'
193
+ names = '영일이삼사오육칠팔구'
194
+ for d, n in zip(digits, names):
195
+ text = text.replace(d, n)
196
+ return text
197
+
198
+
199
+ def korean_to_lazy_ipa(text):
200
+ text = latin_to_hangul(text)
201
+ text = number_to_hangul(text)
202
+ text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text)
203
+ for regex, replacement in _ipa_to_lazy_ipa:
204
+ text = re.sub(regex, replacement, text)
205
+ return text
206
+
207
+
208
+ def korean_to_ipa(text):
209
+ text = korean_to_lazy_ipa(text)
210
+ return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
text/mandarin.py CHANGED
@@ -4,6 +4,7 @@ import re
4
  from pypinyin import lazy_pinyin, BOPOMOFO
5
  import jieba
6
  import cn2an
 
7
 
8
 
9
  # List of (Latin alphabet, bopomofo) pairs:
@@ -239,7 +240,7 @@ def number_to_chinese(text):
239
  return text
240
 
241
 
242
- def chinese_to_bopomofo(text, taiwanese=False):
243
  text = text.replace('、', ',').replace(';', ',').replace(':', ',')
244
  words = jieba.lcut(text, cut_all=False)
245
  text = ''
@@ -252,10 +253,7 @@ def chinese_to_bopomofo(text, taiwanese=False):
252
  bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
253
  if text != '':
254
  text += ' '
255
- if taiwanese:
256
- text += '#'+'#'.join(bopomofos)
257
- else:
258
- text += ''.join(bopomofos)
259
  return text
260
 
261
 
@@ -316,9 +314,9 @@ def chinese_to_ipa(text):
316
  return text
317
 
318
 
319
- def chinese_to_ipa2(text, taiwanese=False):
320
  text = number_to_chinese(text)
321
- text = chinese_to_bopomofo(text, taiwanese)
322
  text = latin_to_bopomofo(text)
323
  text = bopomofo_to_ipa2(text)
324
  text = re.sub(r'i([aoe])', r'j\1', text)
 
4
  from pypinyin import lazy_pinyin, BOPOMOFO
5
  import jieba
6
  import cn2an
7
+ import logging
8
 
9
 
10
  # List of (Latin alphabet, bopomofo) pairs:
 
240
  return text
241
 
242
 
243
+ def chinese_to_bopomofo(text):
244
  text = text.replace('、', ',').replace(';', ',').replace(':', ',')
245
  words = jieba.lcut(text, cut_all=False)
246
  text = ''
 
253
  bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
254
  if text != '':
255
  text += ' '
256
+ text += ''.join(bopomofos)
 
 
 
257
  return text
258
 
259
 
 
314
  return text
315
 
316
 
317
+ def chinese_to_ipa2(text):
318
  text = number_to_chinese(text)
319
+ text = chinese_to_bopomofo(text)
320
  text = latin_to_bopomofo(text)
321
  text = bopomofo_to_ipa2(text)
322
  text = re.sub(r'i([aoe])', r'j\1', text)
text/ngu_dialect.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import opencc
3
+
4
+
5
+ dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
6
+ 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
7
+ 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
8
+ 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
9
+ 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
10
+ 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'}
11
+
12
+ converters = {}
13
+
14
+ for dialect in dialects.values():
15
+ try:
16
+ converters[dialect] = opencc.OpenCC(dialect)
17
+ except:
18
+ pass
19
+
20
+
21
+ def ngu_dialect_to_ipa(text, dialect):
22
+ dialect = dialects[dialect]
23
+ text = converters[dialect].convert(text).replace('-','').replace('$',' ')
24
+ text = re.sub(r'[、;:]', ',', text)
25
+ text = re.sub(r'\s*,\s*', ', ', text)
26
+ text = re.sub(r'\s*。\s*', '. ', text)
27
+ text = re.sub(r'\s*?\s*', '? ', text)
28
+ text = re.sub(r'\s*!\s*', '! ', text)
29
+ text = re.sub(r'\s*$', '', text)
30
+ return text
text/shanghainese.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import cn2an
3
+ import opencc
4
+
5
+
6
+ converter = opencc.OpenCC('zaonhe')
7
+
8
+ # List of (Latin alphabet, ipa) pairs:
9
+ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
10
+ ('A', 'ᴇ'),
11
+ ('B', 'bi'),
12
+ ('C', 'si'),
13
+ ('D', 'di'),
14
+ ('E', 'i'),
15
+ ('F', 'ᴇf'),
16
+ ('G', 'dʑi'),
17
+ ('H', 'ᴇtɕʰ'),
18
+ ('I', 'ᴀi'),
19
+ ('J', 'dʑᴇ'),
20
+ ('K', 'kʰᴇ'),
21
+ ('L', 'ᴇl'),
22
+ ('M', 'ᴇm'),
23
+ ('N', 'ᴇn'),
24
+ ('O', 'o'),
25
+ ('P', 'pʰi'),
26
+ ('Q', 'kʰiu'),
27
+ ('R', 'ᴀl'),
28
+ ('S', 'ᴇs'),
29
+ ('T', 'tʰi'),
30
+ ('U', 'ɦiu'),
31
+ ('V', 'vi'),
32
+ ('W', 'dᴀbɤliu'),
33
+ ('X', 'ᴇks'),
34
+ ('Y', 'uᴀi'),
35
+ ('Z', 'zᴇ')
36
+ ]]
37
+
38
+
39
+ def _number_to_shanghainese(num):
40
+ num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
41
+ return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
42
+
43
+
44
+ def number_to_shanghainese(text):
45
+ return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text)
46
+
47
+
48
+ def latin_to_ipa(text):
49
+ for regex, replacement in _latin_to_ipa:
50
+ text = re.sub(regex, replacement, text)
51
+ return text
52
+
53
+
54
+ def shanghainese_to_ipa(text):
55
+ text = number_to_shanghainese(text.upper())
56
+ text = converter.convert(text).replace('-','').replace('$',' ')
57
+ text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
58
+ text = re.sub(r'[、;:]', ',', text)
59
+ text = re.sub(r'\s*,\s*', ', ', text)
60
+ text = re.sub(r'\s*。\s*', '. ', text)
61
+ text = re.sub(r'\s*?\s*', '? ', text)
62
+ text = re.sub(r'\s*!\s*', '! ', text)
63
+ text = re.sub(r'\s*$', '', text)
64
+ return text
text/symbols.py CHANGED
@@ -1,15 +1,18 @@
1
  '''
2
  Defines the set of symbols used in text input to the model.
3
  '''
 
 
4
  _pad = '_'
5
- _punctuation = ',.!?-~…'
6
- _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
7
  '''
 
8
  # japanese_cleaners2
9
  _pad = '_'
10
  _punctuation = ',.!?-~…'
11
  _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
12
- '''
13
 
14
  '''# korean_cleaners
15
  _pad = '_'
@@ -23,6 +26,11 @@ _punctuation = ',。!?—…'
23
  _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
24
  '''
25
 
 
 
 
 
 
26
 
27
  '''# sanskrit_cleaners
28
  _pad = '_'
@@ -57,7 +65,7 @@ _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
57
  '''# chinese_dialect_cleaners
58
  _pad = '_'
59
  _punctuation = ',.!?~…─'
60
- _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚αᴀᴇ↑↓∅ⱼ '
61
  '''
62
 
63
  # Export all symbols:
 
1
  '''
2
  Defines the set of symbols used in text input to the model.
3
  '''
4
+
5
+ '''# japanese_cleaners
6
  _pad = '_'
7
+ _punctuation = ',.!?-'
8
+ _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
9
  '''
10
+
11
  # japanese_cleaners2
12
  _pad = '_'
13
  _punctuation = ',.!?-~…'
14
  _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
15
+
16
 
17
  '''# korean_cleaners
18
  _pad = '_'
 
26
  _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
27
  '''
28
 
29
+ '''# zh_ja_mixture_cleaners
30
+ _pad = '_'
31
+ _punctuation = ',.!?-~…'
32
+ _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
33
+ '''
34
 
35
  '''# sanskrit_cleaners
36
  _pad = '_'
 
65
  '''# chinese_dialect_cleaners
66
  _pad = '_'
67
  _punctuation = ',.!?~…─'
68
+ _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
69
  '''
70
 
71
  # Export all symbols:
text/thai.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from num_thai.thainumbers import NumThai
3
+
4
+
5
+ num = NumThai()
6
+
7
+ # List of (Latin alphabet, Thai) pairs:
8
+ _latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
9
+ ('a', 'เอ'),
10
+ ('b','บี'),
11
+ ('c','ซี'),
12
+ ('d','ดี'),
13
+ ('e','อี'),
14
+ ('f','เอฟ'),
15
+ ('g','จี'),
16
+ ('h','เอช'),
17
+ ('i','ไอ'),
18
+ ('j','เจ'),
19
+ ('k','เค'),
20
+ ('l','แอล'),
21
+ ('m','เอ็ม'),
22
+ ('n','เอ็น'),
23
+ ('o','โอ'),
24
+ ('p','พี'),
25
+ ('q','คิว'),
26
+ ('r','แอร์'),
27
+ ('s','เอส'),
28
+ ('t','ที'),
29
+ ('u','ยู'),
30
+ ('v','วี'),
31
+ ('w','ดับเบิลยู'),
32
+ ('x','เอ็กซ์'),
33
+ ('y','วาย'),
34
+ ('z','ซี')
35
+ ]]
36
+
37
+
38
+ def num_to_thai(text):
39
+ return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text)
40
+
41
+ def latin_to_thai(text):
42
+ for regex, replacement in _latin_to_thai:
43
+ text = re.sub(regex, replacement, text)
44
+ return text