skytnt committed on
Commit
1a40474
1 Parent(s): 8505133

add models

Browse files
app.py CHANGED
@@ -73,6 +73,12 @@ def create_vc_fn(model, hps, speaker_ids):
73
  return vc_fn
74
 
75
 
 
 
 
 
 
 
76
  css = """
77
  #advanced-btn {
78
  color: white;
@@ -93,9 +99,12 @@ css = """
93
 
94
  if __name__ == '__main__':
95
  models = []
96
- with open("saved_model/names.json", "r", encoding="utf-8") as f:
97
- models_names = json.load(f)
98
- for i, models_name in models_names.items():
 
 
 
99
  config_path = f"saved_model/{i}/config.json"
100
  model_path = f"saved_model/{i}/model.pth"
101
  cover_path = f"saved_model/{i}/cover.jpg"
@@ -111,8 +120,9 @@ if __name__ == '__main__':
111
  speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
112
  speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
113
 
114
- models.append((models_name, cover_path, speakers, hps.symbols,
115
- create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids)))
 
116
 
117
  app = gr.Blocks(css=css)
118
 
@@ -126,12 +136,14 @@ if __name__ == '__main__':
126
  with gr.Tabs():
127
  with gr.TabItem("TTS"):
128
  with gr.Tabs():
129
- for i, (model_name, cover_path, speakers, symbols, tts_fn, vc_fn) in enumerate(models):
 
130
  with gr.TabItem(f"model{i}"):
131
  with gr.Column():
132
- gr.Markdown(f"## {model_name}\n\n"
133
- f"![cover](file/{cover_path})")
134
- tts_input1 = gr.TextArea(label="Text (60 words limitation)", value="こんにちは。")
 
135
  tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
136
  type="index", value=speakers[0])
137
  tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
@@ -157,16 +169,16 @@ if __name__ == '__main__':
157
  }""")
158
  tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
159
  [tts_output1, tts_output2])
160
- to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
161
- [tts_input1], [tts_input1])
162
  phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
163
  _js="(i,phonemes, text) => text + phonemes[i]")
164
 
165
  with gr.TabItem("Voice Conversion"):
166
  with gr.Tabs():
167
- for i, (model_name, cover_path, speakers, symbols, tts_fn, vc_fn) in enumerate(models):
 
168
  with gr.TabItem(f"model{i}"):
169
- gr.Markdown(f"## {model_name}\n\n"
170
  f"![cover](file/{cover_path})")
171
  vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
172
  value=speakers[0])
73
  return vc_fn
74
 
75
 
76
def create_to_phoneme_fn(hps):
    """Build a text-to-phoneme callback bound to one model's hyperparameters.

    The returned function runs its input through `_clean_text` using
    `hps.data.text_cleaners`; an empty string is returned unchanged without
    invoking any cleaner.
    """
    def to_phoneme_fn(text):
        if text == "":
            return ""
        return _clean_text(text, hps.data.text_cleaners)

    return to_phoneme_fn
80
+
81
+
82
  css = """
83
  #advanced-btn {
84
  color: white;
99
 
100
  if __name__ == '__main__':
101
  models = []
102
+ with open("saved_model/info.json", "r", encoding="utf-8") as f:
103
+ models_info = json.load(f)
104
+ for i, info in models_info.items():
105
+ name = info["title"]
106
+ lang = info["lang"]
107
+ example = info["example"]
108
  config_path = f"saved_model/{i}/config.json"
109
  model_path = f"saved_model/{i}/model.pth"
110
  cover_path = f"saved_model/{i}/cover.jpg"
120
  speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
121
  speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
122
 
123
+ models.append((name, lang, example, cover_path, speakers, hps.symbols,
124
+ create_tts_fn(model, hps, speaker_ids), create_vc_fn(model, hps, speaker_ids),
125
+ create_to_phoneme_fn(hps)))
126
 
127
  app = gr.Blocks(css=css)
128
 
136
  with gr.Tabs():
137
  with gr.TabItem("TTS"):
138
  with gr.Tabs():
139
+ for i, (name, lang, example, cover_path, speakers,
140
+ symbols, tts_fn, vc_fn, to_phoneme_fn) in enumerate(models):
141
  with gr.TabItem(f"model{i}"):
142
  with gr.Column():
143
+ gr.Markdown(f"## {name}\n\n"
144
+ f"![cover](file/{cover_path})\n\n"
145
+ f"lang: {lang}")
146
+ tts_input1 = gr.TextArea(label="Text (60 words limitation)", value=example)
147
  tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
148
  type="index", value=speakers[0])
149
  tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
169
  }""")
170
  tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
171
  [tts_output1, tts_output2])
172
+ to_phoneme_btn.click(to_phoneme_fn, [tts_input1], [tts_input1])
 
173
  phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
174
  _js="(i,phonemes, text) => text + phonemes[i]")
175
 
176
  with gr.TabItem("Voice Conversion"):
177
  with gr.Tabs():
178
+ for i, (name, lang, example, cover_path, speakers,
179
+ symbols, tts_fn, vc_fn, to_phoneme_fn) in enumerate(models):
180
  with gr.TabItem(f"model{i}"):
181
+ gr.Markdown(f"## {name}\n\n"
182
  f"![cover](file/{cover_path})")
183
  vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
184
  value=speakers[0])
requirements.txt CHANGED
@@ -9,4 +9,8 @@ torch
9
  torchvision
10
  Unidecode
11
  pyopenjtalk
 
 
 
 
12
  gradio
9
  torchvision
10
  Unidecode
11
  pyopenjtalk
12
+ jamo
13
+ pypinyin
14
+ jieba
15
+ cn2an
16
  gradio
saved_model/5/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d08ea4e940cd92bebaa656762031eb085439d47d6f636cdafb37411f24c927d1
3
+ size 1262
saved_model/5/cover.jpg ADDED

Git LFS Details

  • SHA256: dbed43668741a90c3a7faef3c3b5aace7723b94c251106fb5925a0f1ba0d7c5c
  • Pointer size: 130 Bytes
  • Size of remote file: 30.5 kB
saved_model/5/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edfb6b428c398fab83a85b5ae41e13cb5a9f7be12692129e8a880d4553701f7b
3
+ size 158888013
saved_model/6/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a7d6956086537898264526d08e780c9abc4af8533bf75358dd960016c13da8b
3
+ size 1218
saved_model/6/cover.jpg ADDED

Git LFS Details

  • SHA256: 38e71373daa8849f04bd7867845676afab2057e69a5e0a1e312c2b6cfdd72794
  • Pointer size: 131 Bytes
  • Size of remote file: 146 kB
saved_model/6/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b545a33fe870c214e3828da9ab8e756c6c75a30a6acee74670637fbbd3a58a0d
3
+ size 158875981
saved_model/{names.json → info.json} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1cfb2b973343bfcf64350ed974902b24b12b9903baff7c21ff17fdd9763abe1
3
- size 231
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ae450ecf80251796929594abecca61537612c4115cf947d363c805055f0b199
3
+ size 905
text/__init__.py CHANGED
@@ -3,30 +3,31 @@ from text import cleaners
3
 
4
 
5
  def text_to_sequence(text, symbols, cleaner_names):
6
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
7
  Args:
8
  text: string to convert to a sequence
9
  cleaner_names: names of the cleaner functions to run the text through
10
  Returns:
11
  List of integers corresponding to the symbols in the text
12
  '''
13
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
14
 
15
- sequence = []
16
 
17
- clean_text = _clean_text(text, cleaner_names)
18
- for symbol in clean_text:
19
- if symbol not in _symbol_to_id.keys():
20
- continue
21
- symbol_id = _symbol_to_id[symbol]
22
- sequence += [symbol_id]
23
- return sequence
24
 
25
 
26
  def _clean_text(text, cleaner_names):
27
- for name in cleaner_names:
28
- cleaner = getattr(cleaners, name)
29
- if not cleaner:
30
- raise Exception('Unknown cleaner: %s' % name)
31
- text = cleaner(text)
32
- return text
 
3
 
4
 
5
def text_to_sequence(text, symbols, cleaner_names):
    '''Turn a string into the list of symbol IDs the model consumes.
      Args:
        text: string to convert to a sequence
        symbols: ordered collection of symbols; a symbol's ID is its index
        cleaner_names: names of the cleaner functions to run the text through
      Returns:
        List of integers, one per character of the cleaned text that appears
        in `symbols`; characters not in `symbols` are dropped.
    '''
    symbol_ids = {}
    for index, symbol in enumerate(symbols):
        symbol_ids[symbol] = index

    cleaned = _clean_text(text, cleaner_names)
    return [symbol_ids[char] for char in cleaned if char in symbol_ids]
24
 
25
 
26
  def _clean_text(text, cleaner_names):
27
+ for name in cleaner_names:
28
+ cleaner = getattr(cleaners, name)
29
+ if not cleaner:
30
+ raise Exception('Unknown cleaner: %s' % name)
31
+ text = cleaner(text)
32
+ print(text, cleaner_names)
33
+ return text
text/cleaners.py CHANGED
@@ -1,9 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
- from unidecode import unidecode
 
 
 
3
  import pyopenjtalk
 
 
 
4
 
 
 
5
  pyopenjtalk._lazy_init()
6
 
 
 
 
 
 
 
7
  # Regular expression matching Japanese without punctuation marks:
8
  _japanese_characters = re.compile(
9
  r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
@@ -12,10 +40,209 @@ _japanese_characters = re.compile(
12
  _japanese_marks = re.compile(
13
  r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- def japanese_cleaners(text):
17
- '''Pipeline for notating accent in Japanese text.'''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
 
19
  sentences = re.split(_japanese_marks, text)
20
  marks = re.findall(_japanese_marks, text)
21
  text = ''
@@ -49,10 +276,221 @@ def japanese_cleaners(text):
49
  text += '↑'
50
  if i < len(marks):
51
  text += unidecode(marks[i]).replace(' ', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  if re.match('[A-Za-z]', text[-1]):
53
  text += '.'
54
  return text
55
 
56
 
57
  def japanese_cleaners2(text):
58
- return japanese_cleaners(text).replace('ts', 'ʦ')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Cleaners are transformations that run over the input text at both training and eval time.
5
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
6
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
7
+ 1. "english_cleaners" for English text
8
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
9
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
10
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
11
+ the symbols in symbols.py to match your data).
12
+ '''
13
+
14
+ import os
15
  import re
16
+ import sys
17
+
18
+ import cn2an
19
+ import jieba
20
  import pyopenjtalk
21
+ from jamo import h2j, j2hcj
22
+ from pypinyin import lazy_pinyin, BOPOMOFO
23
+ from unidecode import unidecode
24
 
25
# Locate the dictionary next to this module instead of deriving it from
# sys.argv[0]: with `python app.py` the dirname is '' and the old expression
# produced the absolute path '/text/jieba_dict.txt'.
jieba.set_dictionary(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'jieba_dict.txt'))
jieba.initialize()
27
  pyopenjtalk._lazy_init()
28
 
29
+ # This is a list of Korean classifiers preceded by pure Korean numerals.
30
+ _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
31
+
32
+ # Regular expression matching whitespace:
33
+ _whitespace_re = re.compile(r'\s+')
34
+
35
  # Regular expression matching Japanese without punctuation marks:
36
  _japanese_characters = re.compile(
37
  r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
40
  _japanese_marks = re.compile(
41
  r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
42
 
43
+ # List of (regular expression, replacement) pairs for abbreviations:
44
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
45
+ ('mrs', 'misess'),
46
+ ('mr', 'mister'),
47
+ ('dr', 'doctor'),
48
+ ('st', 'saint'),
49
+ ('co', 'company'),
50
+ ('jr', 'junior'),
51
+ ('maj', 'major'),
52
+ ('gen', 'general'),
53
+ ('drs', 'doctors'),
54
+ ('rev', 'reverend'),
55
+ ('lt', 'lieutenant'),
56
+ ('hon', 'honorable'),
57
+ ('sgt', 'sergeant'),
58
+ ('capt', 'captain'),
59
+ ('esq', 'esquire'),
60
+ ('ltd', 'limited'),
61
+ ('col', 'colonel'),
62
+ ('ft', 'fort'),
63
+ ]]
64
 
65
+ # List of (symbol, Japanese) pairs for marks:
66
+ _symbols_to_japanese = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
67
+ ('%', 'パーセント')
68
+ ]]
69
+
70
+ # List of (hangul, hangul divided) pairs:
71
+ _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
72
+ ('ㄳ', 'ㄱㅅ'),
73
+ ('ㄵ', 'ㄴㅈ'),
74
+ ('ㄶ', 'ㄴㅎ'),
75
+ ('ㄺ', 'ㄹㄱ'),
76
+ ('ㄻ', 'ㄹㅁ'),
77
+ ('ㄼ', 'ㄹㅂ'),
78
+ ('ㄽ', 'ㄹㅅ'),
79
+ ('ㄾ', 'ㄹㅌ'),
80
+ ('ㄿ', 'ㄹㅍ'),
81
+ ('ㅀ', 'ㄹㅎ'),
82
+ ('ㅄ', 'ㅂㅅ'),
83
+ ('ㅘ', 'ㅗㅏ'),
84
+ ('ㅙ', 'ㅗㅐ'),
85
+ ('ㅚ', 'ㅗㅣ'),
86
+ ('ㅝ', 'ㅜㅓ'),
87
+ ('ㅞ', 'ㅜㅔ'),
88
+ ('ㅟ', 'ㅜㅣ'),
89
+ ('ㅢ', 'ㅡㅣ'),
90
+ ('ㅑ', 'ㅣㅏ'),
91
+ ('ㅒ', 'ㅣㅐ'),
92
+ ('ㅕ', 'ㅣㅓ'),
93
+ ('ㅖ', 'ㅣㅔ'),
94
+ ('ㅛ', 'ㅣㅗ'),
95
+ ('ㅠ', 'ㅣㅜ')
96
+ ]]
97
+
98
+ # List of (Latin alphabet, hangul) pairs:
99
+ _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
100
+ ('a', '에이'),
101
+ ('b', '비'),
102
+ ('c', '시'),
103
+ ('d', '디'),
104
+ ('e', '이'),
105
+ ('f', '에프'),
106
+ ('g', '지'),
107
+ ('h', '에이치'),
108
+ ('i', '아이'),
109
+ ('j', '제이'),
110
+ ('k', '케이'),
111
+ ('l', '엘'),
112
+ ('m', '엠'),
113
+ ('n', '엔'),
114
+ ('o', '오'),
115
+ ('p', '피'),
116
+ ('q', '큐'),
117
+ ('r', '아르'),
118
+ ('s', '에스'),
119
+ ('t', '티'),
120
+ ('u', '유'),
121
+ ('v', '브이'),
122
+ ('w', '더블유'),
123
+ ('x', '엑스'),
124
+ ('y', '와이'),
125
+ ('z', '제트')
126
+ ]]
127
+
128
+ # List of (Latin alphabet, bopomofo) pairs:
129
+ _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
130
+ ('a', 'ㄟˉ'),
131
+ ('b', 'ㄅㄧˋ'),
132
+ ('c', 'ㄙㄧˉ'),
133
+ ('d', 'ㄉㄧˋ'),
134
+ ('e', 'ㄧˋ'),
135
+ ('f', 'ㄝˊㄈㄨˋ'),
136
+ ('g', 'ㄐㄧˋ'),
137
+ ('h', 'ㄝˇㄑㄩˋ'),
138
+ ('i', 'ㄞˋ'),
139
+ ('j', 'ㄐㄟˋ'),
140
+ ('k', 'ㄎㄟˋ'),
141
+ ('l', 'ㄝˊㄛˋ'),
142
+ ('m', 'ㄝˊㄇㄨˋ'),
143
+ ('n', 'ㄣˉ'),
144
+ ('o', 'ㄡˉ'),
145
+ ('p', 'ㄆㄧˉ'),
146
+ ('q', 'ㄎㄧㄡˉ'),
147
+ ('r', 'ㄚˋ'),
148
+ ('s', 'ㄝˊㄙˋ'),
149
+ ('t', 'ㄊㄧˋ'),
150
+ ('u', 'ㄧㄡˉ'),
151
+ ('v', 'ㄨㄧˉ'),
152
+ ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
153
+ ('x', 'ㄝˉㄎㄨˋㄙˋ'),
154
+ ('y', 'ㄨㄞˋ'),
155
+ ('z', 'ㄗㄟˋ')
156
+ ]]
157
+
158
+ # List of (bopomofo, romaji) pairs:
159
+ _bopomofo_to_romaji = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
160
+ ('ㄅㄛ', 'p⁼wo'),
161
+ ('ㄆㄛ', 'pʰwo'),
162
+ ('ㄇㄛ', 'mwo'),
163
+ ('ㄈㄛ', 'fwo'),
164
+ ('ㄅ', 'p⁼'),
165
+ ('ㄆ', 'pʰ'),
166
+ ('ㄇ', 'm'),
167
+ ('ㄈ', 'f'),
168
+ ('ㄉ', 't⁼'),
169
+ ('ㄊ', 'tʰ'),
170
+ ('ㄋ', 'n'),
171
+ ('ㄌ', 'l'),
172
+ ('ㄍ', 'k⁼'),
173
+ ('ㄎ', 'kʰ'),
174
+ ('ㄏ', 'h'),
175
+ ('ㄐ', 'ʧ⁼'),
176
+ ('ㄑ', 'ʧʰ'),
177
+ ('ㄒ', 'ʃ'),
178
+ ('ㄓ', 'ʦ`⁼'),
179
+ ('ㄔ', 'ʦ`ʰ'),
180
+ ('ㄕ', 's`'),
181
+ ('ㄖ', 'ɹ`'),
182
+ ('ㄗ', 'ʦ⁼'),
183
+ ('ㄘ', 'ʦʰ'),
184
+ ('ㄙ', 's'),
185
+ ('ㄚ', 'a'),
186
+ ('ㄛ', 'o'),
187
+ ('ㄜ', 'ə'),
188
+ ('ㄝ', 'e'),
189
+ ('ㄞ', 'ai'),
190
+ ('ㄟ', 'ei'),
191
+ ('ㄠ', 'au'),
192
+ ('ㄡ', 'ou'),
193
+ ('ㄧㄢ', 'yeNN'),
194
+ ('ㄢ', 'aNN'),
195
+ ('ㄧㄣ', 'iNN'),
196
+ ('ㄣ', 'əNN'),
197
+ ('ㄤ', 'aNg'),
198
+ ('ㄧㄥ', 'iNg'),
199
+ ('ㄨㄥ', 'uNg'),
200
+ ('ㄩㄥ', 'yuNg'),
201
+ ('ㄥ', 'əNg'),
202
+ ('ㄦ', 'əɻ'),
203
+ ('ㄧ', 'i'),
204
+ ('ㄨ', 'u'),
205
+ ('ㄩ', 'ɥ'),
206
+ ('ˉ', '→'),
207
+ ('ˊ', '↑'),
208
+ ('ˇ', '↓↑'),
209
+ ('ˋ', '↓'),
210
+ ('˙', ''),
211
+ (',', ','),
212
+ ('。', '.'),
213
+ ('!', '!'),
214
+ ('?', '?'),
215
+ ('—', '-')
216
+ ]]
217
+
218
+
219
def expand_abbreviations(text):
    '''Apply every (pattern, expansion) pair in `_abbreviations` to `text`, in list order.'''
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
223
+
224
+
225
def lowercase(text):
    '''Return `text` converted to lower case.'''
    lowered = text.lower()
    return lowered
227
+
228
+
229
def collapse_whitespace(text):
    '''Replace each match of `_whitespace_re` in `text` with a single space.'''
    return _whitespace_re.sub(' ', text)
231
+
232
+
233
def convert_to_ascii(text):
    '''Return the result of passing `text` through `unidecode`.'''
    ascii_text = unidecode(text)
    return ascii_text
235
+
236
+
237
def symbols_to_japanese(text):
    '''Apply every (pattern, replacement) pair in `_symbols_to_japanese` to `text`, in list order.'''
    for pattern, japanese in _symbols_to_japanese:
        text = pattern.sub(japanese, text)
    return text
241
+
242
+
243
+ def japanese_to_romaji_with_accent(text):
244
  '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
245
+ text = symbols_to_japanese(text)
246
  sentences = re.split(_japanese_marks, text)
247
  marks = re.findall(_japanese_marks, text)
248
  text = ''
276
  text += '↑'
277
  if i < len(marks):
278
  text += unidecode(marks[i]).replace(' ', '')
279
+ return text
280
+
281
+
282
def latin_to_hangul(text):
    '''Apply every (pattern, hangul) pair in `_latin_to_hangul` to `text`, in list order.'''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
286
+
287
+
288
def divide_hangul(text):
    '''Apply every (pattern, divided form) pair in `_hangul_divided` to `text`, in list order.'''
    for pattern, divided in _hangul_divided:
        text = pattern.sub(divided, text)
    return text
292
+
293
+
294
def hangul_number(num, sino=True):
    '''Spell out a string of decimal digits in hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
      num: the number as a string; ',' separators are removed first.
      sino: when True every place uses the 일/이/삼... numerals; when False the
        ones and tens places use the 한/두/세... and 열/스물/서른... forms
        instead, while hundreds and above are spelled as in the sino case.
    Returns:
      The hangul spelling. Numbers longer than 16 digits are read digit by
      digit, because no unit names are defined past that length (previously
      such input raised UnboundLocalError or reused a stale digit name).
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit suffix for a place value: places 1-3 inside each group of four use
    # small_units, and the lowest place of each group uses big_units.
    small_units = ('', '십', '백', '천')
    big_units = ('', '만', '억', '조')

    if len(num) > 4 * len(big_units):
        # Out of range for the unit tables: fall back to a digit-by-digit reading.
        return ''.join('영' if d == '0' else digit2name.get(d, '') for d in num)

    spelledout = []
    for index, digit in enumerate(num):
        place = len(num) - index - 1
        if digit == '0':
            # A zero is silent, except that at a group boundary it still emits
            # the group's unit when any of the previous three places was spoken.
            group_is_empty = ''.join(spelledout[-3:]) == ''
            if place % 4 != 0 or group_is_empty:
                spelledout.append('')
                continue
        if place == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif place == 1 and not sino:
            name = digit2dec.get(digit, '')
        else:
            unit = big_units[place // 4] if place % 4 == 0 else small_units[place % 4]
            if digit == '1' and place < 8:
                # Below the 억 place a leading 일 is dropped (십, 백, 천, 만).
                name = unit
            else:
                name = digit2name.get(digit, '') + unit
        spelledout.append(name)
    return ''.join(spelledout)
371
+
372
+
373
def number_to_hangul(text):
    '''Replace the digits in `text` with their hangul spelling.

    Reference https://github.com/Kyubyong/g2pK

    A number immediately followed by hangul is spelled with `hangul_number`;
    the sino=False form is chosen when the first one or two following
    characters occur in `_korean_classifiers`. Any digits left afterwards are
    read one by one.
    '''
    def _spell(match):
        num, classifier = match.groups()
        native = classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers
        return hangul_number(num, sino=not native) + classifier

    # Substituting match by match keeps each number tied to its own position.
    # The earlier set-of-tokens + str.replace approach was order-dependent:
    # replacing "2개" could also rewrite the tail of "12개".
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud71f]+)', _spell, text)

    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
389
+
390
+
391
def number_to_chinese(text):
    '''Replace each number found in `text` with the result of `cn2an.an2cn`.

    Numbers are matched on the original text and substituted one at a time,
    each replacing the first remaining occurrence of its literal form.
    '''
    for found in re.finditer(r'\d+(?:\.?\d+)?', text):
        literal = found.group(0)
        text = text.replace(literal, cn2an.an2cn(literal), 1)
    return text
396
+
397
+
398
def chinese_to_bopomofo(text):
    '''Convert Chinese text to bopomofo, word by word.

    Three punctuation marks are first replaced with ','. The text is then
    segmented with jieba; segments containing no character in U+4E00-U+9FFF
    are copied through unchanged, while the others are converted with
    `lazy_pinyin` in BOPOMOFO style, get 'ˉ' appended to every syllable whose
    last character is in U+3105-U+3129, and are separated from preceding
    output by a space.
    '''
    for mark in ('、', ';', ':'):
        text = text.replace(mark, ',')

    result = ''
    for word in jieba.lcut(text, cut_all=False):
        syllables = lazy_pinyin(word, BOPOMOFO)
        if re.search('[\u4e00-\u9fff]', word):
            syllables = [
                syllable + 'ˉ' if re.match('[\u3105-\u3129]', syllable[-1]) else syllable
                for syllable in syllables
            ]
            if result != '':
                result += ' '
            result += ''.join(syllables)
        else:
            result += word
    return result
414
+
415
+
416
def latin_to_bopomofo(text):
    '''Apply every (pattern, bopomofo) pair in `_latin_to_bopomofo` to `text`, in list order.'''
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
420
+
421
+
422
def bopomofo_to_romaji(text):
    '''Apply every (pattern, romaji) pair in `_bopomofo_to_romaji` to `text`, in list order.'''
    for pattern, romaji in _bopomofo_to_romaji:
        text = pattern.sub(romaji, text)
    return text
426
+
427
+
428
def basic_cleaners(text):
    '''Lowercase `text`, then collapse its whitespace; no transliteration is done.'''
    return collapse_whitespace(lowercase(text))
433
+
434
+
435
def transliteration_cleaners(text):
    '''Transliterate `text` to ASCII, then lowercase it and collapse whitespace.'''
    ascii_text = convert_to_ascii(text)
    return collapse_whitespace(lowercase(ascii_text))
441
+
442
+
443
def japanese_cleaners(text):
    '''Pipeline for Japanese text.

    Converts `text` with `japanese_to_romaji_with_accent` and appends '.'
    when the result ends in an ASCII letter. An empty result is returned
    as-is instead of raising IndexError on `text[-1]`.
    '''
    text = japanese_to_romaji_with_accent(text)
    if text and re.match('[A-Za-z]', text[-1]):
        text += '.'
    return text
448
 
449
 
450
def japanese_cleaners2(text):
    '''Run `japanese_cleaners`, then rewrite 'ts' as 'ʦ' and '...' as '…'.'''
    romaji = japanese_cleaners(text)
    romaji = romaji.replace('ts', 'ʦ')
    return romaji.replace('...', '…')
452
+
453
+
454
def korean_cleaners(text):
    '''Pipeline for Korean text.

    Spells Latin letters and numbers in hangul, decomposes the text with
    `j2hcj(h2j(...))`, applies `divide_hangul`, and appends '.' when the
    result ends in a character in U+3131-U+3163. An empty result is returned
    as-is instead of raising IndexError on `text[-1]`.
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    text = j2hcj(h2j(text))
    text = divide_hangul(text)
    if text and re.match('[\u3131-\u3163]', text[-1]):
        text += '.'
    return text
463
+
464
+
465
def chinese_cleaners(text):
    '''Pipeline for Chinese text.

    Converts numbers with `number_to_chinese`, the text with
    `chinese_to_bopomofo`, and Latin letters with `latin_to_bopomofo`, then
    appends '。' when the result ends in one of the marks ˉˊˇˋ˙. An empty
    result is returned as-is instead of raising IndexError on `text[-1]`.
    '''
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    if text and re.match('[ˉˊˇˋ˙]', text[-1]):
        text += '。'
    return text
473
+
474
+
475
def zh_ja_mixture_cleaners(text):
    '''Pipeline for text mixing Chinese and Japanese spans.

    Spans written as [ZH]...[ZH] and [JA]...[JA] are converted to a shared
    romanised notation and each is replaced in place by its conversion
    followed by one space. Text outside such spans is left untouched. A
    final '.' is appended when the result ends in a letter or tone arrow.
    '''
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    for chinese_text in chinese_texts:
        # [4:-4] strips the opening and closing 4-character tags.
        cleaned_text = number_to_chinese(chinese_text[4:-4])
        cleaned_text = chinese_to_bopomofo(cleaned_text)
        cleaned_text = latin_to_bopomofo(cleaned_text)
        cleaned_text = bopomofo_to_romaji(cleaned_text)
        cleaned_text = re.sub('i[aoe]', lambda x: 'y' + x.group(0)[1:], cleaned_text)
        cleaned_text = re.sub('u[aoəe]', lambda x: 'w' + x.group(0)[1:], cleaned_text)
        cleaned_text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑]+)', lambda x: x.group(1) + 'ɹ`' + x.group(2), cleaned_text).replace(
            'ɻ', 'ɹ`')
        cleaned_text = re.sub('([ʦs][⁼ʰ]?)([→↓↑]+)', lambda x: x.group(1) + 'ɹ' + x.group(2), cleaned_text)
        text = text.replace(chinese_text, cleaned_text + ' ', 1)
    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_romaji_with_accent(japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace(
            '...', '…')
        text = text.replace(japanese_text, cleaned_text + ' ', 1)
    # Drop the separator space left after the last converted span. Only a
    # space is removed: the previous unconditional text[:-1] deleted a real
    # character whenever the text did not end with a converted span.
    if text.endswith(' '):
        text = text[:-1]
    # Guard against empty text so text[-1] cannot raise IndexError.
    if text and re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
        text += '.'
    return text
text/jieba_dict.txt ADDED
The diff for this file is too large to render. See raw diff