txya900619 commited on
Commit
c4d001b
·
1 Parent(s): 707851b

feat: add app.py

Browse files
Files changed (8) hide show
  1. DEMO.md +17 -0
  2. app.py +253 -3
  3. configs/g2p.yaml +1 -0
  4. configs/models.yaml +7 -0
  5. ipa/__init__.py +76 -0
  6. ipa/ipa.py +37 -0
  7. models/__init__.py +37 -0
  8. requirements.txt +2 -0
DEMO.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 原語會族語語音合成系統
2
+
3
+ ILRDF Formosan Text-To-Speech System
4
+
5
+ ## 研發團隊
6
+
7
+ - [李鴻欣 Hung-Shin Lee](mailto:hungshinlee@gmail.com)
8
+ - [陳力瑋 Li-Wei Chen](mailto:wayne900619@gmail.com)
9
+ - [意傳科技](https://ithuan.tw/)
10
+ - [原住民族語言研究發展基金會](https://www.ilrdf.org.tw/)
11
+
12
+ ## 特別致謝
13
+ - [聯和科創](https://www.104.com.tw/company/1a2x6bmu75)
14
+ - [台灣阿美族語言永續發展學會/原民會阿美族語言推動組織](https://www.facebook.com/groups/ypspt/about)
15
+ - [台灣太魯閣族語言發展學會](https://qkktt.com/)
16
+ - [台灣原住民族賽德克族語言文化學會](https://www.facebook.com/3S3TBL/)
17
+ - 族語老師們
app.py CHANGED
@@ -1,7 +1,257 @@
1
  import gradio as gr
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
  demo.launch()
 
1
  import gradio as gr
2
+ import numpy as np
3
 
4
+ from ipa import g2p
5
+ from ipa.ipa import text_to_ipa
6
+ from models import models_config
7
+
8
+
9
def _do_tts(model_id, ipa, language_name, speaker_name=None, speaker_wav=None):
    """Run TTS for an already-converted IPA string.

    Uses voice cloning from *speaker_wav* when a reference clip is given,
    otherwise falls back to the named built-in speaker.
    """
    synthesizer = models_config[model_id]["model"]

    kwargs = {"language_name": language_name, "split_sentences": False}
    if speaker_wav is not None:
        # A reference recording takes precedence over any preset speaker.
        kwargs["speaker_wav"] = speaker_wav
    else:
        kwargs["speaker_name"] = speaker_name

    return synthesizer.tts(ipa, **kwargs)
24
+
25
+
26
def text_to_speech(
    model_id: str,
    use_default_emb_or_custom: str,
    speaker_wav,
    speaker: str,
    language: str,
    dialect: str,
    speed: float,
    text: str,
):
    """Synthesize speech for *text* with the selected model, speaker and language.

    Returns:
        (sample_rate, waveform ndarray) — the tuple shape expected by gr.Audio.

    Raises:
        gr.Error: for empty input, or for text that cannot be converted to IPA.
    """
    # Reject whitespace-only input too; the original len() check let "   "
    # through and would have synthesized an empty IPA string.
    if not text.strip():
        raise gr.Error("請勿輸入空字串。")

    # g2p tables are keyed either by "language" or "language_dialect".
    tag = language
    if language not in g2p:
        tag = f"{language}_{dialect}"

    ipa = text_to_ipa(text, tag, g2p)
    # text_to_ipa returns "" when any word cannot be segmented with the
    # selected table; surface that as a user error instead of feeding the
    # model an empty string.
    if not ipa:
        raise gr.Error("輸入文字無法轉換成 IPA，請確認拼字與所選的語言／方言。")

    # NOTE(review): in Coqui TTS, length_scale > 1 usually *slows* speech, so
    # a "speed" slider may want 1/speed here — confirm against this model.
    models_config[model_id]["model"].tts_model.length_scale = speed

    if use_default_emb_or_custom == "預設語者":
        wav = _do_tts(
            model_id,
            ipa,
            # Single-speaker models must be called without a speaker name.
            speaker_name=speaker
            if len(models_config[model_id]["speaker_mapping"]) > 1
            else None,
            language_name=language,
        )
    else:
        wav = _do_tts(
            model_id,
            ipa,
            speaker_wav=speaker_wav,
            language_name=language,
        )

    return (
        models_config[model_id]["model"].tts_model.config.audio.sample_rate,
        np.array(wav),
    )
66
+
67
+
68
def when_model_selected(model_id):
    """Refresh the speaker/language widgets after the user picks a model.

    Returns gr.update payloads for, in order: the speaker dropdown, the
    language radio, and the default-vs-custom speaker radio.
    """
    config = models_config[model_id]

    speakers = list(config["speaker_mapping"].items())
    languages = list(config["language_mapping"].items())

    # Voice cloning from a reference clip is only offered when the model
    # ships a speaker encoder.
    has_speaker_encoder = bool(
        config["model"].tts_model.config.model_args.speaker_encoder_model_path
    )

    speaker_update = gr.update(
        choices=speakers,
        value=speakers[0][1] if speakers else None,
        interactive=len(speakers) > 1,
    )
    language_update = gr.update(
        choices=languages,
        value=languages[0][1],
        interactive=len(languages) > 1,
    )
    cloning_update = gr.update(visible=has_speaker_encoder, value="預設語者")

    return speaker_update, language_update, cloning_update
98
+
99
+
100
def use_default_emb_or_custom_radio_input(use_default_emb_or_custom):
    """Toggle between the reference-audio widget and the preset-speaker dropdown.

    Returns gr.update payloads for (speaker_wav, speaker_drop_down): exactly
    one of the two is visible at a time.
    """
    wants_custom = use_default_emb_or_custom == "客製化語者"
    return gr.update(visible=wants_custom), gr.update(visible=not wants_custom)
104
+
105
+
106
def language_radio_changed(language):
    """Show and repopulate the dialect radio for the newly selected language.

    Languages that appear directly as a g2p key have no dialects, so the
    dialect radio is hidden for them.
    """
    if language in g2p:
        return gr.update(visible=False)

    # Match tags by their language prefix.  The original substring test
    # ("language in tag") could also match any language whose name merely
    # contains this one, and crashed on dialect-less tags.
    dialect_choices = [
        tag.split("_")[1] for tag in g2p if tag.split("_")[0] == language
    ]
    if not dialect_choices:
        # No dialect entries for this language: keep the radio hidden
        # instead of raising IndexError on dialect_choices[0].
        return gr.update(visible=False)

    return gr.update(
        choices=dialect_choices,
        value=dialect_choices[0],
        interactive=len(dialect_choices) > 1,
        # The radio may have been hidden by a previous dialect-less
        # selection; the original update never made it visible again.
        visible=True,
    )
115
+
116
+
117
# Top-level Gradio Blocks app.  The custom "tauhu-oo" font is pulled in via
# the CSS import and used as the primary UI font, with standard fallbacks.
demo = gr.Blocks(
    title="臺灣南島語語音合成系統",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
130
+
131
with demo:
    # The first model in configs/models.yaml is the default selection.
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )
    # Choose between a preset speaker and a user-supplied reference clip.
    use_default_emb_or_custom_radio = gr.Radio(
        label="語者類型",
        choices=["預設語者", "客製化語者"],
        value="客製化語者",
        visible=True,
        show_label=False,
    )
    # Reference audio for voice cloning (visible only in "客製化語者" mode).
    speaker_wav = gr.Audio(
        label="客製化語音",
        visible=True,
        editable=False,
        type="filepath",
        waveform_options=gr.WaveformOptions(
            show_controls=False,
            sample_rate=16000,
        ),
    )
    # Preset speaker dropdown; choices are (display_name, speaker_id) pairs.
    speaker_drop_down = gr.Dropdown(
        choices=[
            (k, v)
            for k, v in models_config[default_model_id]["speaker_mapping"].items()
        ],
        value=list(models_config[default_model_id]["speaker_mapping"].values())[0],
        label="語者",
        interactive=len(models_config[default_model_id]["speaker_mapping"]) > 1,
        visible=False,
    )
    # Switching speaker type toggles which of the two widgets is visible.
    use_default_emb_or_custom_radio.change(
        use_default_emb_or_custom_radio_input,
        inputs=[use_default_emb_or_custom_radio],
        outputs=[speaker_wav, speaker_drop_down],
    )

    default_language = list(
        models_config[default_model_id]["language_mapping"].values()
    )[0]
    language_radio = gr.Radio(
        choices=[
            (k, v)
            for k, v in models_config[default_model_id]["language_mapping"].items()
        ],
        value=default_language,
        label="語言",
        interactive=len(models_config[default_model_id]["language_mapping"]) > 1,
    )

    # Dialects for the default language, taken from "language_dialect" g2p tags.
    # NOTE(review): substring match — a language name contained in another
    # language's tag would also match; prefix matching would be safer.
    default_dialect_choices = [
        tag.split("_")[1] for tag in g2p.keys() if default_language in tag
    ]
    dialect_radio = gr.Radio(
        choices=default_dialect_choices,
        value=default_dialect_choices[0],
        label="方言",
        interactive=len(default_dialect_choices) > 1,
    )

    # Changing the language re-populates (or hides) the dialect radio.
    language_radio.change(
        language_radio_changed, inputs=[language_radio], outputs=[dialect_radio]
    )

    # Changing the model refreshes speaker/language widgets and the
    # preset-vs-custom radio visibility.
    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[speaker_drop_down, language_radio, use_default_emb_or_custom_radio],
    )

    input_text = gr.Textbox(
        label="輸入文字",
        value="",
    )

    speed = gr.Slider(maximum=1.5, minimum=0.5, value=1, label="語速")

    # Project credits rendered from DEMO.md at the top of the page.
    with open("DEMO.md") as tong:
        gr.Markdown(tong.read())
    # gr.Interface wires all the widgets above into the synthesize button
    # and the output audio player.
    gr.Interface(
        text_to_speech,
        inputs=[
            model_drop_down,
            use_default_emb_or_custom_radio,
            speaker_wav,
            speaker_drop_down,
            language_radio,
            dialect_radio,
            speed,
            input_text,
        ],
        outputs=[
            gr.Audio(interactive=False, label="合成語音", show_download_button=True),
        ],
        allow_flagging="auto",
    )
    # Clickable example rows: (speaker type, speaker id, language, dialect, text).
    gr.Examples(
        [
            [
                "預設語者",
                "formosan_dict_ami#wav/formosan_dict_ami/000002_2.31-6.09.wav",
                "阿美",
                "南勢",
                "mikadavu ku vavainay, i vavahiyan, a luma’",
            ],
            [
                "預設語者",
                "formosan_dict_ami#wav/formosan_dict_ami/000035_0.00-3.69.wav",
                "阿美",
                "南勢",
                "mikadavu ku vavainay, i vavahiyan, a luma’",
            ],
        ],
        label="範例",
        inputs=[
            use_default_emb_or_custom_radio,
            speaker_drop_down,
            language_radio,
            dialect_radio,
            input_text,
        ],
    )

demo.launch()
configs/g2p.yaml ADDED
@@ -0,0 +1 @@
 
 
1
# Downloads formosan/g2p.csv from the (private) FormoSpeech/FormoLexicon repo
# using the GH_TOKEN env var, then parses it into the nested g2p mapping.
# Resolvers are registered in ipa/__init__.py.
g2p: ${load_g2p:${gh_download:FormoSpeech/FormoLexicon, formosan/g2p.csv, ${oc.env:GH_TOKEN}}}
configs/models.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Model registry, resolved eagerly by models/__init__.py at import time.
# ${load_model:...} downloads the HF snapshot and returns a Coqui Synthesizer.
yourtts-ami:
  model: ${load_model:united-link/yourtts-formosan-ami}
  language_mapping: # display_name: model language id
    阿美: 阿美
  speaker_mapping: # display_name: id
    男: formosan_dict_ami#wav/formosan_dict_ami/000002_2.31-6.09.wav
    女: formosan_dict_ami#wav/formosan_dict_ami/000035_0.00-3.69.wav
ipa/__init__.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ from io import BytesIO
3
+
4
+ import requests
5
+ from omegaconf import OmegaConf
6
+
7
+ EXTRA_G2P = {
8
+ "z": "z",
9
+ "o": "o",
10
+ "h": "h",
11
+ "g": "g",
12
+ "y": "j",
13
+ "w": "w",
14
+ "c": "ʦ",
15
+ "u": "u",
16
+ "f": "f",
17
+ "v": "v",
18
+ "j": "ɟ",
19
+ "b": "b",
20
+ "q": "q",
21
+ "e": "e",
22
+ ",": ",",
23
+ }
24
+
25
+
26
def gh_download(repo, path, token, timeout=30):
    """Download a file's raw text from a GitHub repository via the contents API.

    Args:
        repo: "owner/name" repository slug.
        path: file path inside the repository.
        token: GitHub token sent as a Bearer credential (required for
            private repositories such as the lexicon repo).
        timeout: seconds to wait for the API response.

    Returns:
        The file content decoded as text (UTF-8, BOM stripped).

    Raises:
        Exception: if the API does not answer with HTTP 200.
    """
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github.raw+json",
    }

    url = f"https://api.github.com/repos/{repo}/contents/{path}"
    # This runs at import time; without a timeout a hung request would
    # block app startup indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download {path} from {repo}, response: {response}")
    # The CSV may be served with a UTF-8 BOM; utf-8-sig strips it.
    response.encoding = "utf-8-sig"

    return response.text
39
+
40
+
41
def load_g2p(g2p_string):
    """Parse the lexicon CSV into a nested {lang_tag: {grapheme: ipa}} mapping.

    The CSV has "Language" and "Dialect" columns plus one column per
    grapheme; a cell value of "-" means "not defined for this language".
    EXTRA_G2P entries are merged in as defaults for missing graphemes.

    Args:
        g2p_string: the full CSV file content as one string.

    Returns:
        dict mapping "language" or "language_dialect" tags to
        grapheme → IPA dicts.
    """
    g2p = dict()

    # splitlines() also copes with \r\n endings, unlike split("\n").
    csv_reader = csv.DictReader(g2p_string.splitlines())

    for row in csv_reader:
        language = row["Language"]
        dialect = row["Dialect"]

        # Rows without a dialect use the bare language name as the tag.
        if dialect == "-":
            lang_tag = f"{language}"
        else:
            lang_tag = f"{language}_{dialect}"

        # Create the table up front: the original only created it inside the
        # column loop, so a row whose cells were all "-" raised KeyError in
        # the EXTRA_G2P merge below.
        table = g2p.setdefault(lang_tag, {})

        for key, value in row.items():
            if key in ("Language", "Dialect"):
                continue
            if value == "-":
                continue
            # Cells may list comma-separated alternatives; keep the first.
            table[key] = value.split(",")[0]

        # Fill in defaults without overriding lexicon-provided entries.
        for g, p in EXTRA_G2P.items():
            table.setdefault(g, p)

    return g2p
71
+
72
+
73
# Register the resolvers referenced by configs/g2p.yaml, then resolve the
# config eagerly: this downloads and parses the lexicon at import time.
OmegaConf.register_new_resolver("gh_download", gh_download)
OmegaConf.register_new_resolver("load_g2p", load_g2p)

# Module-level {lang_tag: {grapheme: ipa}} mapping shared by the whole app.
g2p = OmegaConf.to_object(OmegaConf.load("configs/g2p.yaml"))["g2p"]
ipa/ipa.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def can_form_string(x, symbol_dict):
    """Segment *x* into a concatenation of keys of *symbol_dict*.

    Performs a depth-first search trying keys in dict insertion order, so
    the first segmentation found in that order wins (same order as the
    original implementation).  Failed suffix offsets are memoized, turning
    the original worst-case exponential backtracking into polynomial time.

    Returns:
        (True, parts) with "".join(parts) == x, or (False, []) when no
        segmentation exists.  The empty string segments as (True, []).
    """
    dead_ends = set()  # offsets whose suffix is already known unsegmentable

    def helper(offset, matched_parts):
        if offset == len(x):
            return True, matched_parts
        if offset in dead_ends:
            return False, []

        for key in symbol_dict:
            if x.startswith(key, offset):
                ok, parts = helper(offset + len(key), matched_parts + [key])
                if ok:
                    return True, parts

        dead_ends.add(offset)
        return False, []

    return helper(0, [])


def text_to_ipa(text, lang_tag, g2p):
    """Convert whitespace-separated words to a space-joined IPA string.

    Args:
        text: input sentence; words are split on whitespace.
        lang_tag: key into *g2p* ("language" or "language_dialect").
        g2p: nested mapping {lang_tag: {grapheme: ipa}}.

    Returns:
        The IPA transcription, or "" when any word cannot be segmented
        (callers treat "" as a conversion failure).
    """
    mapping = g2p[lang_tag]
    ipa_words = []

    # (Removed the debug print of the word list from the original.)
    for word in text.split():
        ok, matched_parts = can_form_string(word, mapping)
        if not ok:
            return ""
        # Join once per word instead of quadratic += concatenation.
        ipa_words.append("".join(mapping[part] for part in matched_parts))

    return " ".join(ipa_words)
models/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from huggingface_hub import snapshot_download
5
+ from omegaconf import OmegaConf
6
+ from TTS.utils.synthesizer import Synthesizer
7
+
8
+
9
def load_model(model_id):
    """Download a Coqui TTS checkpoint from the Hugging Face Hub and build a Synthesizer.

    The checkpoint's config.json references its auxiliary files by bare
    filename; those references are rewritten to absolute paths inside the
    snapshot directory and written to a temporary config file.

    Args:
        model_id: Hugging Face Hub repo id (e.g. "united-link/yourtts-formosan-ami").

    Returns:
        A TTS.utils.synthesizer.Synthesizer ready for inference (CUDA if available).
    """
    import tempfile  # local import: only needed for the rewritten config

    model_dir = snapshot_download(model_id)
    config_file_path = os.path.join(model_dir, "config.json")
    model_ckpt_path = os.path.join(model_dir, "model.pth")
    speaker_file_path = os.path.join(model_dir, "speakers.pth")
    language_file_path = os.path.join(model_dir, "language_ids.json")
    speaker_embedding_file_path = os.path.join(model_dir, "speaker_embs.pth")

    with open(config_file_path, "r") as f:
        content = f.read()
    content = content.replace("speakers.pth", speaker_file_path)
    content = content.replace("language_ids.json", language_file_path)
    content = content.replace("speaker_embs.pth", speaker_embedding_file_path)

    # A unique temp file replaces the original's shared "temp_config.json"
    # in the CWD, which every load_model call clobbered.  (Also dropped the
    # redundant f.close() calls inside the with-blocks.)
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", prefix="tts_config_", delete=False
    ) as f:
        f.write(content)
        temp_config_path = f.name

    return Synthesizer(
        tts_checkpoint=model_ckpt_path,
        tts_config_path=temp_config_path,
        use_cuda=torch.cuda.is_available(),
    )
33
+
34
+
35
# Register the resolver used by configs/models.yaml, then resolve the config
# eagerly: this downloads every listed model checkpoint at import time.
OmegaConf.register_new_resolver("load_model", load_model)

# {model_id: {"model": Synthesizer, "language_mapping": ..., "speaker_mapping": ...}}
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Coqui TTS: provides TTS.utils.synthesizer.Synthesizer (models/__init__.py).
TTS
# Config loading with custom resolvers (configs/*.yaml).
omegaconf
# Imported directly by ipa/__init__.py but previously undeclared.
requests
# Imported directly by app.py but previously undeclared.
numpy