Spaces:

ivanusto
/

tw-hakka-tts

Running on Zero

App Files Files Community

txya900619 committed on Apr 11

Commit

5e8e534

•

1 Parent(s): 4aaa5e4

feat: init upload

Browse files

Files changed (10) hide show

.gitignore +3 -0
app.py +126 -0
configs/ipa.yaml +8 -0
configs/models.yaml +10 -0
ipa/__init__.py +24 -0
ipa/convert_digits.py +180 -0
ipa/ipa.py +88 -0
ipa/proc_text.py +85 -0
replace/tts.py +70 -0
requirements.txt +4 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+__pycache__/
+temp_config.json
+flagged/

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import json
+import os
+import tempfile
+import gradio as gr
+import TTS
+from TTS.utils.synthesizer import Synthesizer
+import numpy as np
+from huggingface_hub import snapshot_download
+from omegaconf import OmegaConf
+from ipa.ipa import get_ipa, parse_ipa
+from replace.tts import ChangedVitsConfig
+TTS.tts.configs.vits_config.VitsConfig = ChangedVitsConfig
def load_model(model_id):
    """Download a TTS model snapshot and wrap it in a ``Synthesizer``.

    The snapshot's ``config.json`` mentions ``speakers.pth`` and
    ``language_ids.json`` by bare file name.  Every occurrence of those names
    is rewritten to the full path inside the downloaded snapshot, and the
    patched config is written to a temporary file that is passed to
    ``Synthesizer``.

    Args:
        model_id: Repository id handed to ``snapshot_download``.

    Returns:
        A ``Synthesizer`` built from the snapshot's ``model.pth`` and the
        patched config file.
    """
    model_dir = snapshot_download(model_id)
    config_file_path = os.path.join(model_dir, "config.json")
    model_ckpt_path = os.path.join(model_dir, "model.pth")
    # Insertion order matters: the names are replaced in this order.
    replacements = {
        "speakers.pth": os.path.join(model_dir, "speakers.pth"),
        "language_ids.json": os.path.join(model_dir, "language_ids.json"),
    }

    with open(config_file_path, "r", encoding="utf-8") as f:
        content = f.read()
    for file_name, file_path in replacements.items():
        # The path is spliced into JSON text, so escape it as the body of a
        # JSON string; raw backslashes or quotes would corrupt the config.
        content = content.replace(file_name, json.dumps(file_path)[1:-1])

    # Use a unique temp file instead of a fixed "temp_config.json" in the
    # working directory: no writable CWD is needed and parallel processes do
    # not overwrite each other.  The file is kept (delete=False) because the
    # synthesizer receives its path.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", delete=False, encoding="utf-8"
    ) as f:
        f.write(content)
        temp_config_path = f.name

    return Synthesizer(tts_checkpoint=model_ckpt_path, tts_config_path=temp_config_path)
# Make ``${load_model:<id>}`` usable inside configs/models.yaml.
OmegaConf.register_new_resolver("load_model", load_model)
_models_yaml = OmegaConf.load("configs/models.yaml")
# Plain Python containers, keyed by the model name shown in the UI.
models_config = OmegaConf.to_object(_models_yaml)
def text_to_speech(model_id: str, speaker: str, dialect, text: str):
    """Synthesize speech for ``text`` with the chosen model, speaker and dialect.

    Args:
        model_id: Key into ``models_config``.
        speaker: Speaker id, forwarded to the synthesizer as ``speaker_name``.
        dialect: Dialect name; passed to ``get_ipa`` and forwarded to the
            synthesizer as ``language_name``.
        text: Sentence to synthesize.

    Returns:
        ``(words, pinyin, (sample_rate, waveform))`` where ``waveform`` is a
        NumPy array.

    Raises:
        gr.Error: If the text is blank, contains words that cannot be
            converted to IPA, or produces no IPA at all.
    """
    if not text or not text.strip():
        raise gr.Error("請勿輸入空字串。")

    model = models_config[model_id]["model"]
    words, ipa, pinyin, missing_words = get_ipa(text, dialect=dialect)
    if len(missing_words) > 0:
        raise gr.Error(
            f"句子中的[{','.join(missing_words)}]目前無法轉成 ipa。請嘗試其他句子。"
        )
    if not ipa:
        # get_ipa hands back empty lists (not strings) when nothing in the
        # input produced IPA; parse_ipa would fail on a list.
        raise gr.Error("句子中沒有可以轉成 ipa 的文字。請嘗試其他句子。")

    wav = model.tts(
        parse_ipa(ipa),
        speaker_name=speaker,
        language_name=dialect,
        split_sentences=False,
    )
    # Prefer the rate reported by the synthesizer; 16000 is the value that
    # used to be hard-coded and remains the fallback.
    sample_rate = getattr(model, "output_sample_rate", 16000)
    return words, pinyin, (sample_rate, np.array(wav))
def when_model_selected(model_id):
    """Refresh the speaker and dialect dropdowns for the selected model.

    Besides replacing the choices, the selected value of each dropdown is
    reset to the new first choice so that a value belonging to the previously
    selected model cannot be submitted.

    Args:
        model_id: Key into ``models_config``.

    Returns:
        Two ``gr.update`` objects: one for the speaker dropdown (choices are
        ``(display_name, speaker_id)`` pairs) and one for the dialect dropdown.
    """
    model_config = models_config[model_id]
    speaker_choices = list(model_config["speaker_mapping"].items())
    dialect_choices = list(model_config["avalible_dialect"])
    # The dropdown value is the speaker id, i.e. the second item of a pair.
    default_speaker = speaker_choices[0][1] if speaker_choices else None
    default_dialect = dialect_choices[0] if dialect_choices else None
    return (
        gr.update(choices=speaker_choices, value=default_speaker),
        gr.update(choices=dialect_choices, value=default_dialect),
    )
# Theme whose font stack starts with the "tauhu-oo" web font (loaded via css).
app_theme = gr.themes.Default(
    font=(
        "tauhu-oo",
        gr.themes.GoogleFont("Source Sans Pro"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    )
)
demo = gr.Blocks(
    title="臺灣客語語音生成系統",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=app_theme,
)

with demo:
    # The first configured model provides the initial dropdown contents.
    default_model_id = list(models_config.keys())[0]
    default_model_config = models_config[default_model_id]
    default_speakers = default_model_config["speaker_mapping"]
    default_dialects = default_model_config["avalible_dialect"]

    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
    )
    speaker_drop_down = gr.Dropdown(
        choices=list(default_speakers.items()),
        value=list(default_speakers.values())[0],
    )
    dialect_drop_down = gr.Dropdown(
        choices=default_dialects,
        value=default_dialects[0],
    )

    # Picking another model repopulates the two dependent dropdowns.
    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[speaker_drop_down, dialect_drop_down],
    )

    gr.Markdown(
        """
        # 臺灣客語語音生成系統
        """
    )

    gr.Interface(
        text_to_speech,
        inputs=[
            model_drop_down,
            speaker_drop_down,
            dialect_drop_down,
            gr.Textbox(),
        ],
        outputs=[
            gr.Textbox(interactive=False, label="word segment"),
            gr.Textbox(interactive=False, label="pinyin"),
            gr.Audio(
                interactive=False, label="generated speech", show_download_button=True
            ),
        ],
        allow_flagging="auto",
    )

demo.launch()

configs/ipa.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+gh_token: ${oc.env:GH_TOKEN}
+delimiter_list: ${gh_download:FormoSpeech/FormoLexicon, release/delimiters.json, ${gh_token}}
+replace_dict: ${gh_download:FormoSpeech/FormoLexicon, release/replaced_words_htia.json, ${gh_token}}
+v2f_dict: ${gh_download:FormoSpeech/FormoLexicon, [release/v2f_goyu.json, release/v2f_htia.json], ${gh_token}}
+preserved_list: ${gh_download:FormoSpeech/FormoLexicon, release/preserved_words_htia.json, ${gh_token}}
+lexicon:
+  sixian: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_sixian_c.json, ${gh_token}}
+  hailu: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_hailu_c.json, ${gh_token}}

configs/models.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+sixian-hailu-mix:
+  model: ${load_model:formospeech/taiwanese-hakka-tts-sixian-hailu-mix}
+  avalible_dialect:
+    - sixian
+    - hailu
+  speaker_mapping: # display_name: id
+    sixian/female: XF
+    sixian/male: XM
+    hailu/female: HF
+    hailu/male: HM

ipa/__init__.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import requests
+from omegaconf import OmegaConf
def gh_download(repo, path, token):
    """Fetch one or more JSON files from a GitHub repository and merge them.

    Args:
        repo: Repository in ``owner/name`` form.
        path: One file path (``str``) or a sequence of file paths in the repo.
        token: Token sent in the ``Authorization: Bearer`` header.

    Returns:
        The decoded JSON of the first file.  When several paths are given,
        later lists are appended to it and later dicts are merged into it
        (later keys win).

    Raises:
        RuntimeError: If GitHub answers with a status other than 200.
        TypeError: If the files cannot be merged because their top-level JSON
            types differ or are neither list nor dict.
    """
    paths = [path] if isinstance(path, str) else path
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github.raw+json",
    }
    result = None
    for file_path in paths:
        url = f"https://api.github.com/repos/{repo}/contents/{file_path}"
        # A timeout keeps a stalled connection from blocking start-up forever.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to download {file_path} from {repo} "
                f"(HTTP {response.status_code})"
            )
        payload = response.json()
        if result is None:
            result = payload
        elif isinstance(result, list) and isinstance(payload, list):
            result.extend(payload)
        elif isinstance(result, dict) and isinstance(payload, dict):
            result.update(payload)
        else:
            raise TypeError(
                f"Cannot merge {file_path} from {repo}: got "
                f"{type(payload).__name__}, expected {type(result).__name__}"
            )
    return result


# Make ``${gh_download:repo, path, token}`` available in OmegaConf configs.
OmegaConf.register_new_resolver("gh_download", gh_download)

ipa/convert_digits.py ADDED Viewed

	@@ -0,0 +1,180 @@

+# Copyright 2024    Hung-Shin Lee (hungshinlee@gmail.com)
+# Apache 2.0
+import itertools
+import re
# Chinese digits; the index of each character is its numeric value.
c_basic = "零一二三四五六七八九"
# ASCII digit (plus the decimal point) -> character used when reading it out.
d2c = dict(zip("0123456789", c_basic))
d2c["."] = "點"
def num4year(matched):
    """Read every captured group of ``matched`` digit by digit.

    Each group's text is replaced, inside the full match, by the
    corresponding ``c_basic`` characters; the rest of the match is kept.
    """
    spoken = matched.group(0)
    for digits in matched.groups():
        digit_chars = "".join(c_basic[int(d)] for d in digits)
        spoken = spoken.replace(digits, digit_chars)
    return spoken
def num2chines_simple(matched):
    """Translate ``matched`` character by character through ``d2c``."""
    return "".join(map(d2c.__getitem__, matched))
def num4percent(matched):
    """Turn a captured ``<number>%`` into ``百分之`` followed by the number."""
    percent_text = matched.group(1)
    # Drop the trailing "%" before converting the number itself.
    number = percent_text[:-1]
    return "百分之" + num2chinese(number)
def num4cellphone(matched):
    """Read a captured phone number digit by digit.

    Only the ASCII digits of group 1 are read; separators are dropped.  The
    pattern used by ``parse_num`` allows any whitespace (``\\s``) around the
    dash, so filtering on digits avoids failing on a tab or similar.
    """
    number = matched.group(1)
    return "".join(c_basic[int(ch)] for ch in number if ch in "0123456789")
def num4er(matched):
    """Replace every ``2`` in group 1 of ``matched`` with ``二``."""
    return matched.group(1).replace("2", "二")
def num4liang(matched):
    """Replace every ``2`` in group 1 of ``matched`` with ``兩``."""
    return matched.group(1).replace("2", "兩")
def num4general(matched):
    """Convert a captured number, optionally preceded by one other character.

    * Starts with a digit: the whole capture goes through ``num2chinese``.
    * Starts with a Latin letter or dash and is followed by three or more
      characters (e.g. AM104): the rest is read out character by character.
    * Anything else (e.g. MP3, F-16): the first character is kept and the
      rest goes through ``num2chinese``.
    """
    num = matched.group(1)
    head, tail = num[0], num[1:]
    if re.match("[0-9]", head):
        return num2chinese(num)
    if re.match("[A-Za-z-─]", head) and len(tail) >= 3:
        return head + num2chines_simple(tail)
    return head + num2chinese(tail)
def parse_num(text: str) -> str:
    """Rewrite the ASCII numbers in ``text`` as Chinese numerals.

    The rules run in order, from the most specific (years, percentages,
    phone numbers, the 二/兩 readings of a lone 2) to the general case.

    Decimal numbers are matched with their complete fractional part, so
    ``1.234`` is read as one number, and a period that is not followed by a
    digit is left in the text instead of being consumed with the number.
    """
    # year
    text = re.sub("([0-9]{4})[到至]([0-9]{4})年", num4year, text)
    text = re.sub("([0-9]{4})年", num4year, text)
    # percentage
    text = re.sub(r"([0-9]+(?:\.[0-9]+)?%)", num4percent, text)
    # cellphone
    text = re.sub(r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone, text)
    # single 2 to 二
    text = re.sub(r"([^\d]2[診樓月號])", num4er, text)
    text = re.sub(r"([初]2[^\d])", num4er, text)
    # single 2 to 兩 -- but not when the 2 is the integer or fractional part
    # of a decimal number such as 2.5 or 1.2
    text = re.sub(
        r"((?:(?<![0-9])\.|[^\d.])2(?!\.[0-9])[^\d])", num4liang, text
    )
    # general number
    text = re.sub(r"([^0-9]?[0-9]+(?:\.[0-9]+)?)", num4general, text)
    return text
def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
    """
    Convert a number to its Chinese representation.

    Adapted from https://gist.github.com/gumblex/0d65cad2ba607fd14de7

    Args:
        num: A number, or a string holding one.  A leading ``+``/``-`` and a
            decimal point are understood; fractional digits are read one by
            one.
        big: Use financial characters.
        simp: Use simplified characters instead of traditional characters.
        o: Use 〇 for zero.
        twoalt: Use 两/兩 for two when appropriate.

    ``o`` and ``twoalt`` are ignored when ``big`` is used, and ``twoalt`` is
    ignored when ``o`` is used (formal representation).

    Returns:
        The Chinese reading of ``num``.

    Raises:
        ValueError: If ``num`` is not a number, its magnitude is 1e48 or
            more, or it is written in scientific notation.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    elif "e" in nd.lower():
        # float() accepts both "1e5" and "1E5"; reject either spelling here
        raise ValueError("scientific notation is not supported")
    c_symbol = "正负点" if simp else "正負點"
    if o:  # formal
        twoalt = False
    # NOTE: this local c_basic shadows the module-level constant on purpose;
    # it depends on the ``big``/``simp``/``o`` options.
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"

    def revuniq(l):
        # reverse ``l`` and collapse runs of equal neighbours into one item
        return "".join(k for k, g in itertools.groupby(reversed(l)))

    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])
    if "." in nd:
        integer, remainder = nd.lstrip("+-").split(".")
    else:
        integer, remainder = nd.lstrip("+-"), None
    if int(integer):
        # groups of (up to) four digits, least significant group first
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(c_basic[0])
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    if ulist:  # ???0
                        ulist.append(c_basic[0])
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and all(i == "0" for i in unit[: nc + 1]):
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        # drop the zero placeholders left at either end
        result.append(revuniq(intresult).strip(c_basic[0]))
    else:
        result.append(c_basic[0])
    if remainder:
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)
if __name__ == "__main__":
    # Quick manual check with a sentence that contains a phone number.
    sample = "若手機仔幾多號？吾手機仔係0964-498042。"
    print(f"{sample} -> {parse_num(sample)}")

ipa/ipa.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import os
+import re
+from pathlib import Path
+import jieba
+from omegaconf import OmegaConf
+from ipa.convert_digits import parse_num
+from ipa.proc_text import (
+    apply_v2f,
+    normalize_text,
+    prep_regex,
+    run_jieba,
+    update_jieba_dict,
+)
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))

# Words on the preserved list are taken out of the v2f mapping, so the v2f
# step leaves them untouched.
_v2f_dict = ipa_configs["v2f_dict"]
for _preserved in ipa_configs["preserved_list"]:
    _v2f_dict.pop(_preserved, None)

delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"],
    ipa_configs["replace_dict"],
    _v2f_dict,
)
# Dialect whose lexicon was last written into jieba's dictionary by get_ipa.
_jieba_lexicon_dialect = None


def get_ipa(raw_text, dialect):
    """Segment ``raw_text`` and look every word up in the dialect's lexicon.

    The text is normalized, its numbers are spelled out, it is split on the
    configured delimiters (re-joined with ``，``) and segmented with jieba;
    the v2f mapping is applied and the result is segmented once more.

    jieba's dictionary is rebuilt from the lexicon only when ``dialect``
    differs from the one used by the previous call, instead of on every call.

    Args:
        raw_text: Input sentence.
        dialect: Key into ``ipa_configs["lexicon"]``.

    Returns:
        ``(words, ipa, pinyin, missing_words)``.  ``missing_words`` is always
        a list.  When it is non-empty, or when no IPA was produced at all,
        the first three items are returned as lists; otherwise they are
        space-joined strings.
    """
    global _jieba_lexicon_dialect

    lexicon = ipa_configs["lexicon"][dialect]
    if _jieba_lexicon_dialect != dialect:
        update_jieba_dict(
            list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
        )
        _jieba_lexicon_dialect = dialect

    text = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
    text = parse_num(text)
    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
    text = "，".join(text_parts)
    word_list = run_jieba(text)
    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
    # segment again: the v2f replacement may have changed word boundaries
    word_list = run_jieba("".join(word_list))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for word in word_list:
        if not word.strip():
            continue
        if word == "，":
            final_words.append("，")
            final_pinyin.append("，")
            final_ipa.append("，")
        elif word not in lexicon:
            final_words.append(word)
            missing_words.append(word)
        else:
            final_words.append(word)
            final_pinyin.append(lexicon[word]["pinyin"][0])
            # NOTE: only the first ipa entry of lexicon[word] is used
            final_ipa.append(lexicon[word]["ipa"][0].replace(" ", "-"))

    if len(final_ipa) == 0 or len(missing_words) > 0:
        return final_words, final_ipa, final_pinyin, missing_words

    final_words = " ".join(final_words).replace(" ， ", "，")
    final_ipa = " ".join(final_ipa).replace(" ， ", "，")
    final_pinyin = " ".join(final_pinyin).replace(" ， ", "，")
    return final_words, final_ipa, final_pinyin, missing_words
def parse_ipa(ipa: str):
    """Split an IPA string into the token list handed to the synthesizer.

    The string is cut at every boundary between a run of separators
    (``，``, space, ``-``) and a run of other characters.  Then:

    * a single space is kept as one token;
    * ``，`` becomes the three tokens space, ``，``, space;
    * ``-`` is dropped;
    * any other piece is split on ``_``: the characters of the first part are
      added one by one and, when there are exactly two parts, the second part
      (the tone) is added as a single token.
    """
    tokens = []
    pieces = re.split(r"(?<![， -])(?=[， -])|(?<=[， -])(?![， -])", ipa)
    for piece in pieces:
        if piece == "-":  # only " " separates words; "-" is not emitted
            continue
        if piece == " ":
            tokens.append(piece)
        elif piece == "，":
            tokens.extend(" ， ")
        else:
            parts = piece.split("_")
            tokens.extend(parts[0])
            if len(parts) == 2:
                # tone as a separate token
                tokens.append(parts[1])
    return tokens

ipa/proc_text.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright 2024    Hung-Shin Lee (hungshinlee@gmail.com)
+# Apache 2.0
+import re
+from pathlib import Path
+from typing import Tuple
+from unicodedata import normalize
+import jieba
+import opencc
# 20 == logging.INFO
jieba.setLogLevel(20)
# Replace jieba's default "han" pattern.  Written as a raw string so that
# ``\.`` and ``\-`` are not invalid string escapes; the ``\u``/``\U`` escapes
# are interpreted by the ``re`` module itself.
jieba.re_han_default = re.compile(
    r"([\u2e80-\U000e01efa-zA-Z0-9+#&\._%\-']+)", re.U
)
s2tw_converter = opencc.OpenCC("s2tw.json")
def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: list = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Overwrite jieba's dictionary file with ``lexicon`` and reset jieba.

    Every word is written as ``<word> <frequency>`` where the frequency is
    the word length, multiplied by ``high_freq_words_weight`` for words in
    ``high_freq_words``.  jieba's cache file in the temp directory is removed
    and jieba is marked uninitialized so the new file is picked up.

    Args:
        lexicon: Words to write; duplicates are removed.
        jieba_dict_path: Dictionary file to replace.
        high_freq_words: Words that get the boosted frequency (default: none).
        high_freq_words_weight: Multiplier for those words.

    Returns:
        The sorted, de-duplicated lexicon.
    """
    import tempfile  # local import: not imported at file level

    lexicon = sorted(set(lexicon))
    boosted = frozenset(high_freq_words or ())

    jieba_dict_path.unlink(missing_ok=True)
    # Look in the platform temp directory rather than a hard-coded /tmp.
    Path(tempfile.gettempdir(), "jieba.cache").unlink(missing_ok=True)

    # Explicit UTF-8 so the file content does not depend on the locale.
    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            weight = high_freq_words_weight if word in boosted else 1
            file.write(f"{word} {len(word) * weight}\n")

    jieba.dt.initialized = False
    return lexicon
def run_jieba(line: str) -> list:
    """Segment ``line`` with jieba (accurate mode, HMM disabled)."""
    # NOTE: when jieba processes multi-line text, the original line structure
    # is lost in the result.
    return list(jieba.cut(line, cut_all=False, HMM=False))
def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Clean up and normalize raw input text.

    Steps, in order: remove the characters ``\\x08``, ``\\ufeff`` and
    ``\\u0010``; apply NFKC normalization; replace every match of
    ``replace_regex`` with ``replace_dict[match]``; collapse whitespace runs
    to single spaces; upper-case the result.

    Args:
        text: Raw text.
        replace_dict: Replacement for each string ``replace_regex`` can match.
        replace_regex: Alternation of the keys of ``replace_dict``.  An empty
            pattern (what ``prep_regex`` returns for an empty dict) skips the
            replacement step instead of failing with ``KeyError``.

    Returns:
        The normalized text.
    """
    for unwanted in ("\x08", "\ufeff", "\u0010"):
        text = text.replace(unwanted, "")
    text = normalize("NFKC", text)
    if replace_regex:
        text = re.sub(replace_regex, lambda m: replace_dict[m.group(0)], text)
    return " ".join(text.split()).upper()
def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Replace every match of ``v2f_regex`` in each word via ``v2f_dict``.

    Args:
        word_list: Words to process.
        v2f_dict: Replacement for each string ``v2f_regex`` can match.
        v2f_regex: Alternation of the keys of ``v2f_dict``.  An empty pattern
            (what ``prep_regex`` returns for an empty dict) leaves the words
            unchanged instead of failing with ``KeyError``.

    Returns:
        A new list with the replaced words.
    """
    if not v2f_regex:
        return list(word_list)
    pattern = re.compile(v2f_regex)
    return [pattern.sub(lambda m: v2f_dict[m.group(0)], word) for word in word_list]
def prep_regex(
    delimiter_list: list, replace_dict: dict = None, v2f_dict: dict = None
) -> Tuple[str, str, str]:
    """Build the alternation patterns used by the text pipeline.

    Args:
        delimiter_list: Delimiter strings.
        replace_dict: Mapping whose keys form the replace pattern.
        v2f_dict: Mapping whose keys form the v2f pattern.

    Returns:
        ``(delimiter_regex, replace_regex, v2f_regex)``.  Every entry is
        escaped literally; a pattern is ``""`` when its source is empty or
        omitted.
    """

    def _alternation(keys) -> str:
        # Longest key first, so a short key never pre-empts a longer key
        # that starts with it.
        ordered = sorted(keys, key=len, reverse=True)
        return "|".join(map(re.escape, ordered))

    delimiter_regex = "|".join(map(re.escape, delimiter_list))
    replace_regex = _alternation(replace_dict) if replace_dict else ""
    v2f_regex = _alternation(v2f_dict) if v2f_dict else ""
    return delimiter_regex, replace_regex, v2f_regex

replace/tts.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from dataclasses import dataclass
+from typing import Dict, List
+from TTS.tts.configs.vits_config import VitsConfig
+from coqpit import Coqpit
@dataclass
class CharactersConfig(Coqpit):
    """Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.

    Args:
        characters_class (str):
            Defines the class of the characters used. If None, we pick ``Phonemes`` or ``Graphemes`` based on
            the configuration. Defaults to None.
        vocab_dict (dict):
            Defines the vocabulary dictionary used to encode the characters. Defaults to None.
        pad (str):
            characters in place of empty padding. Defaults to None.
        eos (str):
            characters showing the end of a sentence. Defaults to None.
        bos (str):
            characters showing the beginning of a sentence. Defaults to None.
        blank (str):
            Optional character used between characters by some models for better prosody. Defaults to None.
        characters (List[str]):
            character set used by the model, given as a list of strings. Characters not in this list are ignored
            when converting input text to a list of sequence IDs. Defaults to None.
        punctuations (str):
            characters considered as punctuation as parsing the input sentence. Defaults to None.
        phonemes (str):
            characters considered as parsing phonemes. This is only for backwards compat. Use `characters` for new
            models. Defaults to None.
        is_unique (bool):
            remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
            models trained with character lists with duplicates. Defaults to True.
        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.
    """

    characters_class: str = None
    # using BaseVocabulary
    vocab_dict: Dict = None
    # using on BaseCharacters
    pad: str = None
    eos: str = None
    bos: str = None
    blank: str = None
    characters: List[str] = None
    punctuations: str = None
    phonemes: str = None
    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
    is_sorted: bool = True
@dataclass
class ChangedVitsConfig(VitsConfig):
    """``VitsConfig`` whose ``characters`` field is typed as the ``CharactersConfig`` of this module."""

    characters: CharactersConfig = None

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+jieba
+opencc
+TTS
+omegaconf