benjaminzhang committed on
Commit
d220e78
·
1 Parent(s): 250486f

Add app, model and requirements

Browse files
Files changed (4) hide show
  1. app.py +166 -0
  2. cou_medium.onnx.json +74 -0
  3. cou_total.onnx +3 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # truku_tts_gradio_app.py (極簡版)
2
+ # ------------------------------------------------------------
3
+ # 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
4
+ # 需求:只輸入文字 → 直接合成(固定 model/config/scales)
5
+ # - UI:只保留文字輸入、合成按鈕、音檔播放/下載
6
+ # - 其餘選項全部移除
7
+ # ------------------------------------------------------------
8
+
9
+ import os
10
+ import json
11
+ import time
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import onnxruntime as ort
15
+ import gradio as gr
16
+
17
+ # ======== 固定參數(依你的環境修改) ========
18
+ MODEL_PATH = "/home/benjamin/TTS/cou_total.onnx"
19
+ CONFIG_PATH = "/home/benjamin/TTS/cou_medium.onnx.json"
20
+ SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
21
+
22
+ # ======== 輔助:載入 config/phoneme_map、建立 ONNX session ========
23
+ with open(CONFIG_PATH, "r", encoding="utf-8") as f:
24
+ _config = json.load(f)
25
+
26
+ _phoneme_map = _config["phoneme_id_map"]
27
+ _pad_id = _phoneme_map["_"]
28
+ _bos_id = _phoneme_map["^"]
29
+ _eos_id = _phoneme_map["$"]
30
+
31
+ # 若需要 GPU,可改 providers,例如:
32
+ # providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
33
+ _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
34
+
35
+
36
+ def _text_to_ids(text: str) -> np.ndarray:
37
+ # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
38
+ ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
39
+ return np.array(ids, dtype=np.int64)
40
+
41
+
42
+ def synthesize(text: str):
43
+ text = (text or "").strip()
44
+ if not text:
45
+ raise gr.Error("請輸入要合成的文字!")
46
+
47
+ ids = _text_to_ids(text).reshape(1, -1)
48
+ ids_len = np.array([ids.shape[1]], dtype=np.int64)
49
+ scales = np.array(list(SCALES), dtype=np.float32)
50
+
51
+ start = time.time()
52
+ audio = _session.run(
53
+ None,
54
+ {"input": ids, "input_lengths": ids_len, "scales": scales},
55
+ )[0].squeeze()
56
+ rt = round(time.time() - start, 3)
57
+
58
+ sr = int(_config["audio"]["sample_rate"]) # 取樣率
59
+
60
+ # 同時寫檔,供下載
61
+ out_name = f"truku_tts_{int(time.time()*1000)}.wav"
62
+ out_path = os.path.abspath(out_name)
63
+ sf.write(out_path, audio, samplerate=sr)
64
+
65
+ # gr.Audio 可直接用 (sr, waveform)
66
+ return (sr, audio), out_path
67
+
68
+
69
+ # ======== 極簡 UI:只有一個輸入 + 合成 + 音檔 ========
70
+ demo = gr.Interface(
71
+ fn=synthesize,
72
+ inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
73
+ outputs=[
74
+ gr.Audio(label="合成音檔", interactive=False, show_download_button=True),
75
+ gr.File(label="下載 WAV 檔"),
76
+ ],
77
+ title="太魯閣語語音合成 (女聲) — ONNX",
78
+ description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
79
+ )
80
+
81
+ if __name__ == "__main__":
82
+ # 本機建議:若在 WSL,無法用 localhost 時,改用 share=True 或用 127.0.0.1:port
83
+ demo.launch(server_name="127.0.0.1", server_port=7860)
84
+ # truku_tts_gradio_app.py (極簡版)
85
+ # ------------------------------------------------------------
86
+ # 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
87
+ # 需求:只輸入文字 → 直接合成(固定 model/config/scales)
88
+ # - UI:只保留文字輸入、合成按鈕、音檔播放/下載
89
+ # - 其餘選項全部移除
90
+ # ------------------------------------------------------------
91
+
92
+ import os
93
+ import json
94
+ import time
95
+ import numpy as np
96
+ import soundfile as sf
97
+ import onnxruntime as ort
98
+ import gradio as gr
99
+
100
+ # ======== 固定參數(依你的環境修改) ========
101
+ MODEL_PATH = "/home/benjamin/TTS/cou_total.onnx"
102
+ CONFIG_PATH = "/home/benjamin/TTS/cou_medium.onnx.json"
103
+ SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
104
+
105
+ # ======== 輔助:載入 config/phoneme_map、建立 ONNX session ========
106
+ with open(CONFIG_PATH, "r", encoding="utf-8") as f:
107
+ _config = json.load(f)
108
+
109
+ _phoneme_map = _config["phoneme_id_map"]
110
+ _pad_id = _phoneme_map["_"]
111
+ _bos_id = _phoneme_map["^"]
112
+ _eos_id = _phoneme_map["$"]
113
+
114
+ # 若需要 GPU,可改 providers,例如:
115
+ # providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
116
+ _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
117
+
118
+
119
+ def _text_to_ids(text: str) -> np.ndarray:
120
+ # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
121
+ ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
122
+ return np.array(ids, dtype=np.int64)
123
+
124
+
125
+ def synthesize(text: str):
126
+ text = (text or "").strip()
127
+ if not text:
128
+ raise gr.Error("請輸入要合成的文字!")
129
+
130
+ ids = _text_to_ids(text).reshape(1, -1)
131
+ ids_len = np.array([ids.shape[1]], dtype=np.int64)
132
+ scales = np.array(list(SCALES), dtype=np.float32)
133
+
134
+ start = time.time()
135
+ audio = _session.run(
136
+ None,
137
+ {"input": ids, "input_lengths": ids_len, "scales": scales},
138
+ )[0].squeeze()
139
+ rt = round(time.time() - start, 3)
140
+
141
+ sr = int(_config["audio"]["sample_rate"]) # 取樣率
142
+
143
+ # 同時寫檔,供下載
144
+ out_name = f"truku_tts_{int(time.time()*1000)}.wav"
145
+ out_path = os.path.abspath(out_name)
146
+ sf.write(out_path, audio, samplerate=sr)
147
+
148
+ # gr.Audio 可直接用 (sr, waveform)
149
+ return (sr, audio), out_path
150
+
151
+
152
+ # ======== 極簡 UI:只有一個輸入 + 合成 + 音檔 ========
153
+ demo = gr.Interface(
154
+ fn=synthesize,
155
+ inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
156
+ outputs=[
157
+ gr.Audio(label="合成音檔", interactive=False, show_download_button=True),
158
+ gr.File(label="下載 WAV 檔"),
159
+ ],
160
+ title="太魯閣語語音合成 (女聲) — ONNX",
161
+ description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
162
+ )
163
+
164
+ if __name__ == "__main__":
165
+ # 本機建議:若在 WSL,無法用 localhost 時,改用 share=True 或用 127.0.0.1:port
166
+ demo.launch(server_name="127.0.0.1", server_port=7860)
cou_medium.onnx.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "TTS",
3
+ "audio": {
4
+ "sample_rate": 22050,
5
+ "quality": "TTS_dataset_wav_prepared"
6
+ },
7
+ "espeak": {
8
+ "voice": "cou"
9
+ },
10
+ "language": {
11
+ "code": "cou"
12
+ },
13
+ "inference": {
14
+ "noise_scale": 0.667,
15
+ "length_scale": 1,
16
+ "noise_w": 0.8
17
+ },
18
+ "phoneme_type": "text",
19
+ "phoneme_map": {},
20
+ "phoneme_id_map": {
21
+ "_": 0,
22
+ "^": 1,
23
+ "$": 2,
24
+ " ": 3,
25
+ "!": 4,
26
+ "'": 5,
27
+ "(": 6,
28
+ ")": 7,
29
+ "*": 8,
30
+ ",": 9,
31
+ "-": 10,
32
+ ".": 11,
33
+ "0": 12,
34
+ "1": 13,
35
+ "2": 14,
36
+ "3": 15,
37
+ "4": 16,
38
+ "5": 17,
39
+ "6": 18,
40
+ ":": 19,
41
+ ";": 20,
42
+ "<": 21,
43
+ ">": 22,
44
+ "?": 23,
45
+ "[": 24,
46
+ "\\": 25,
47
+ "]": 26,
48
+ "a": 27,
49
+ "b": 28,
50
+ "c": 29,
51
+ "e": 30,
52
+ "f": 31,
53
+ "g": 32,
54
+ "h": 33,
55
+ "i": 34,
56
+ "k": 35,
57
+ "l": 36,
58
+ "m": 37,
59
+ "n": 38,
60
+ "o": 39,
61
+ "p": 40,
62
+ "s": 41,
63
+ "t": 42,
64
+ "u": 43,
65
+ "v": 44,
66
+ "y": 45,
67
+ "z": 46,
68
+ "ʉ": 47
69
+ },
70
+ "num_symbols": 256,
71
+ "num_speakers": 1,
72
+ "speaker_id_map": {},
73
+ "piper_version": "1.0.0"
74
+ }
cou_total.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a34881ebb3ae8d361cf038111d4b560d745fa9cfe46b8fa369c83ec19bcdbe71
3
+ size 63516050
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
gradio
numpy
soundfile
onnxruntime
torch         # NOTE(review): not imported by app.py — likely removable
transformers  # NOTE(review): not imported by app.py — likely removable