benjaminzhang committed on
Commit
d220e78
·
1 Parent(s): 250486f

Add app, model and requirements

Browse files
Files changed (4) hide show
  1. app.py +166 -0
  2. cou_medium.onnx.json +74 -0
  3. cou_total.onnx +3 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # truku_tts_gradio_app.py (極簡版)
2
+ # ------------------------------------------------------------
3
+ # 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
4
+ # 需求:只輸入文字 → 直接合成(固定 model/config/scales)
5
+ # - UI:只保留文字輸入、合成按鈕、音檔播放/下載
6
+ # - 其餘選項全部移除
7
+ # ------------------------------------------------------------
8
+
9
+ import os
10
+ import json
11
+ import time
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import onnxruntime as ort
15
+ import gradio as gr
16
+
17
+ # ======== 固定參數(依你的環境修改) ========
18
+ MODEL_PATH = "/home/benjamin/TTS/cou_total.onnx"
19
+ CONFIG_PATH = "/home/benjamin/TTS/cou_medium.onnx.json"
20
+ SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
21
+
22
+ # ======== 輔助:載入 config/phoneme_map、建立 ONNX session ========
23
+ with open(CONFIG_PATH, "r", encoding="utf-8") as f:
24
+ _config = json.load(f)
25
+
26
+ _phoneme_map = _config["phoneme_id_map"]
27
+ _pad_id = _phoneme_map["_"]
28
+ _bos_id = _phoneme_map["^"]
29
+ _eos_id = _phoneme_map["$"]
30
+
31
+ # 若需要 GPU,可改 providers,例如:
32
+ # providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
33
+ _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
34
+
35
+
36
+ def _text_to_ids(text: str) -> np.ndarray:
37
+ # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
38
+ ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
39
+ return np.array(ids, dtype=np.int64)
40
+
41
+
42
+ def synthesize(text: str):
43
+ text = (text or "").strip()
44
+ if not text:
45
+ raise gr.Error("請輸入要合成的文字!")
46
+
47
+ ids = _text_to_ids(text).reshape(1, -1)
48
+ ids_len = np.array([ids.shape[1]], dtype=np.int64)
49
+ scales = np.array(list(SCALES), dtype=np.float32)
50
+
51
+ start = time.time()
52
+ audio = _session.run(
53
+ None,
54
+ {"input": ids, "input_lengths": ids_len, "scales": scales},
55
+ )[0].squeeze()
56
+ rt = round(time.time() - start, 3)
57
+
58
+ sr = int(_config["audio"]["sample_rate"]) # 取樣率
59
+
60
+ # 同時寫檔,供下載
61
+ out_name = f"truku_tts_{int(time.time()*1000)}.wav"
62
+ out_path = os.path.abspath(out_name)
63
+ sf.write(out_path, audio, samplerate=sr)
64
+
65
+ # gr.Audio 可直接用 (sr, waveform)
66
+ return (sr, audio), out_path
67
+
68
+
69
+ # ======== 極簡 UI:只有一個輸入 + 合成 + 音檔 ========
70
+ demo = gr.Interface(
71
+ fn=synthesize,
72
+ inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
73
+ outputs=[
74
+ gr.Audio(label="合成音檔", interactive=False, show_download_button=True),
75
+ gr.File(label="下載 WAV 檔"),
76
+ ],
77
+ title="太魯閣語語音合成 (女聲) — ONNX",
78
+ description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
79
+ )
80
+
81
+ if __name__ == "__main__":
82
+ # 本機建議:若在 WSL,無法用 localhost 時,改用 share=True 或用 127.0.0.1:port
83
+ demo.launch(server_name="127.0.0.1", server_port=7860)
84
+ # truku_tts_gradio_app.py (極簡版)
85
+ # ------------------------------------------------------------
86
+ # 太魯閣語 (Truku) TTS — ONNX 推論的極簡 Gradio 介面
87
+ # 需求:只輸入文字 → 直接合成(固定 model/config/scales)
88
+ # - UI:只保留文字輸入、合成按鈕、音檔播放/下載
89
+ # - 其餘選項全部移除
90
+ # ------------------------------------------------------------
91
+
92
+ import os
93
+ import json
94
+ import time
95
+ import numpy as np
96
+ import soundfile as sf
97
+ import onnxruntime as ort
98
+ import gradio as gr
99
+
100
+ # ======== 固定參數(依你的環境修改) ========
101
+ MODEL_PATH = "/home/benjamin/TTS/cou_total.onnx"
102
+ CONFIG_PATH = "/home/benjamin/TTS/cou_medium.onnx.json"
103
+ SCALES = (0.667, 1.0, 0.8) # (length_scale, noise_scale, noise_w)
104
+
105
+ # ======== 輔助:載入 config/phoneme_map、建立 ONNX session ========
106
+ with open(CONFIG_PATH, "r", encoding="utf-8") as f:
107
+ _config = json.load(f)
108
+
109
+ _phoneme_map = _config["phoneme_id_map"]
110
+ _pad_id = _phoneme_map["_"]
111
+ _bos_id = _phoneme_map["^"]
112
+ _eos_id = _phoneme_map["$"]
113
+
114
+ # 若需要 GPU,可改 providers,例如:
115
+ # providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
116
+ _session = ort.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])
117
+
118
+
119
+ def _text_to_ids(text: str) -> np.ndarray:
120
+ # 逐字轉 id;未知字用 pad_id;前後加 BOS/EOS
121
+ ids = [_bos_id] + [_phoneme_map.get(c, _pad_id) for c in text] + [_eos_id]
122
+ return np.array(ids, dtype=np.int64)
123
+
124
+
125
+ def synthesize(text: str):
126
+ text = (text or "").strip()
127
+ if not text:
128
+ raise gr.Error("請輸入要合成的文字!")
129
+
130
+ ids = _text_to_ids(text).reshape(1, -1)
131
+ ids_len = np.array([ids.shape[1]], dtype=np.int64)
132
+ scales = np.array(list(SCALES), dtype=np.float32)
133
+
134
+ start = time.time()
135
+ audio = _session.run(
136
+ None,
137
+ {"input": ids, "input_lengths": ids_len, "scales": scales},
138
+ )[0].squeeze()
139
+ rt = round(time.time() - start, 3)
140
+
141
+ sr = int(_config["audio"]["sample_rate"]) # 取樣率
142
+
143
+ # 同時寫檔,供下載
144
+ out_name = f"truku_tts_{int(time.time()*1000)}.wav"
145
+ out_path = os.path.abspath(out_name)
146
+ sf.write(out_path, audio, samplerate=sr)
147
+
148
+ # gr.Audio 可直接用 (sr, waveform)
149
+ return (sr, audio), out_path
150
+
151
+
152
+ # ======== 極簡 UI:只有一個輸入 + 合成 + 音檔 ========
153
+ demo = gr.Interface(
154
+ fn=synthesize,
155
+ inputs=gr.Textbox(lines=3, placeholder="請輸入太魯閣語文字…", label="輸入文字"),
156
+ outputs=[
157
+ gr.Audio(label="合成音檔", interactive=False, show_download_button=True),
158
+ gr.File(label="下載 WAV 檔"),
159
+ ],
160
+ title="太魯閣語語音合成 (女聲) — ONNX",
161
+ description="輸入文字後按下「提交」即可合成。模型與參數固定在程式內。",
162
+ )
163
+
164
+ if __name__ == "__main__":
165
+ # 本機建議:若在 WSL,無法用 localhost 時,改用 share=True 或用 127.0.0.1:port
166
+ demo.launch(server_name="127.0.0.1", server_port=7860)
cou_medium.onnx.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "TTS",
3
+ "audio": {
4
+ "sample_rate": 22050,
5
+ "quality": "TTS_dataset_wav_prepared"
6
+ },
7
+ "espeak": {
8
+ "voice": "cou"
9
+ },
10
+ "language": {
11
+ "code": "cou"
12
+ },
13
+ "inference": {
14
+ "noise_scale": 0.667,
15
+ "length_scale": 1,
16
+ "noise_w": 0.8
17
+ },
18
+ "phoneme_type": "text",
19
+ "phoneme_map": {},
20
+ "phoneme_id_map": {
21
+ "_": 0,
22
+ "^": 1,
23
+ "$": 2,
24
+ " ": 3,
25
+ "!": 4,
26
+ "'": 5,
27
+ "(": 6,
28
+ ")": 7,
29
+ "*": 8,
30
+ ",": 9,
31
+ "-": 10,
32
+ ".": 11,
33
+ "0": 12,
34
+ "1": 13,
35
+ "2": 14,
36
+ "3": 15,
37
+ "4": 16,
38
+ "5": 17,
39
+ "6": 18,
40
+ ":": 19,
41
+ ";": 20,
42
+ "<": 21,
43
+ ">": 22,
44
+ "?": 23,
45
+ "[": 24,
46
+ "\\": 25,
47
+ "]": 26,
48
+ "a": 27,
49
+ "b": 28,
50
+ "c": 29,
51
+ "e": 30,
52
+ "f": 31,
53
+ "g": 32,
54
+ "h": 33,
55
+ "i": 34,
56
+ "k": 35,
57
+ "l": 36,
58
+ "m": 37,
59
+ "n": 38,
60
+ "o": 39,
61
+ "p": 40,
62
+ "s": 41,
63
+ "t": 42,
64
+ "u": 43,
65
+ "v": 44,
66
+ "y": 45,
67
+ "z": 46,
68
+ "ʉ": 47
69
+ },
70
+ "num_symbols": 256,
71
+ "num_speakers": 1,
72
+ "speaker_id_map": {},
73
+ "piper_version": "1.0.0"
74
+ }
cou_total.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a34881ebb3ae8d361cf038111d4b560d745fa9cfe46b8fa369c83ec19bcdbe71
3
+ size 63516050
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
gradio
numpy
soundfile
onnxruntime
torch         # NOTE(review): not imported by app.py — likely removable
transformers  # NOTE(review): not imported by app.py — likely removable