Spaces:

ntt123
/

vietnam-male-voice-wavegru-tts

Running

App Files Files Community

ntt123 committed on Sep 18, 2023

Commit

587b6c9

1 Parent(s): 6907c30

add app

Browse files

Files changed (21) hide show

BUILD +44 -0
Dockerfile +32 -0
WORKSPACE +154 -0
alphabet.txt +97 -0
app.py +148 -0
build_ext.sh +4 -0
extract_tacotrons_model.py +5 -0
extract_wavegru_model.py +5 -0
inference.py +90 -0
mynumbers.py +73 -0
packages.txt +7 -0
pooch.py +10 -0
requirements.txt +12 -0
tacotron.py +451 -0
tacotron.toml +32 -0
text.py +92 -0
utils.py +74 -0
wavegru.py +300 -0
wavegru.yaml +14 -0
wavegru_cpp.py +42 -0
wavegru_mod.cc +150 -0

BUILD ADDED Viewed

	@@ -0,0 +1,44 @@

+# Bazel package for the WaveGRU C++ Python extension and its Python entry point.
+# [internal] load cc_fuzz_target.bzl
+# [internal] load cc_proto_library.bzl
+# [internal] load android_cc_test:def.bzl
+load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
+package(default_visibility = [":__subpackages__"])
+licenses(["notice"])
+# To run all cc_tests in this directory:
+# bazel test //:all
+# [internal] Command to run dsp_util_android_test.
+# [internal] Command to run lyra_integration_android_test.
+# Export the extension source file so it can be referenced by label.
+exports_files(
+    srcs = [
+        "wavegru_mod.cc",
+    ],
+)
+# pybind11 extension compiled from wavegru_mod.cc and linked against //sparse_matmul.
+# build_ext.sh copies the resulting bazel-bin/wavegru_mod.so next to the Python sources.
+pybind_extension(
+    name = "wavegru_mod",  # This name is not actually created!
+    srcs = ["wavegru_mod.cc"],
+    deps = [
+        "//sparse_matmul",
+    ],
+)
+# Python-side target that carries the compiled shared object as a data dependency.
+py_library(
+    name = "wavegru_mod",
+    data = [":wavegru_mod.so"],
+)
+# Binary target for wavegru.py, depending on the extension wrapper above.
+py_binary(
+    name = "wavegru",
+    srcs = ["wavegru.py"],
+    deps = [
+        ":wavegru_mod"
+    ],
+)

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.11
+# Chain the steps with && so a failed package-index update fails the build instead of
+# being masked by `;`, use apt-get (the scripting-stable front end), and drop the
+# package lists afterwards to keep the layer small.
+RUN apt-get update && \
+    apt-get install -y libsndfile1-dev make autoconf automake libtool gcc pkg-config && \
+    rm -rf /var/lib/apt/lists/*
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+# Build the wavegru_mod extension at image-build time (see build_ext.sh).
+RUN bash ./build_ext.sh
+# NOTE(review): this commit adds app.py but no main.py — confirm the entry point exists.
+CMD ["python", "main.py"]

WORKSPACE ADDED Viewed

	@@ -0,0 +1,154 @@

+########################
+# Platform Independent #
+########################
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+# GoogleTest/GoogleMock framework.
+git_repository(
+    name = "com_google_googletest",
+    remote = "https://github.com/google/googletest.git",
+    tag = "release-1.10.0",
+)
+# Google benchmark.
+http_archive(
+    name = "com_github_google_benchmark",
+    urls = ["https://github.com/google/benchmark/archive/bf585a2789e30585b4e3ce6baf11ef2750b54677.zip"],  # 2020-11-26T11:14:03Z
+    strip_prefix = "benchmark-bf585a2789e30585b4e3ce6baf11ef2750b54677",
+    sha256 = "2a778d821997df7d8646c9c59b8edb9a573a6e04c534c01892a40aa524a7b68c",
+)
+# proto_library, cc_proto_library, and java_proto_library rules implicitly
+# depend on @com_google_protobuf for protoc and proto runtimes.
+# This statement defines the @com_google_protobuf repo.
+git_repository(
+    name = "com_google_protobuf",
+    remote = "https://github.com/protocolbuffers/protobuf.git",
+    tag = "v3.15.4",
+)
+load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
+protobuf_deps()
+# Google Abseil Libs
+git_repository(
+    name = "com_google_absl",
+    remote = "https://github.com/abseil/abseil-cpp.git",
+    branch = "lts_2020_09_23",
+)
+# Filesystem
+# The new_* prefix is used because it is not a bazel project and there is
+# no BUILD file in that repo.
+FILESYSTEM_BUILD = """
+cc_library(
+  name = "filesystem",
+  hdrs = glob(["include/ghc/*"]),
+  visibility = ["//visibility:public"],
+)
+"""
+new_git_repository(
+    name = "gulrak_filesystem",
+    remote = "https://github.com/gulrak/filesystem.git",
+    tag = "v1.3.6",
+    build_file_content = FILESYSTEM_BUILD
+)
+# Audio DSP
+git_repository(
+    name = "com_google_audio_dsp",
+    remote = "https://github.com/google/multichannel-audio-tools.git",
+    # There are no tags for this repo, we are synced to bleeding edge.
+    branch = "master",
+    repo_mapping = {
+        "@com_github_glog_glog" : "@com_google_glog"
+    }
+)
+http_archive(
+  name = "pybind11_bazel",
+  strip_prefix = "pybind11_bazel-72cbbf1fbc830e487e3012862b7b720001b70672",
+  urls = ["https://github.com/pybind/pybind11_bazel/archive/72cbbf1fbc830e487e3012862b7b720001b70672.zip"],
+)
+# We still require the pybind library.
+http_archive(
+  name = "pybind11",
+  build_file = "@pybind11_bazel//:pybind11.BUILD",
+  strip_prefix = "pybind11-2.9.0",
+  urls = ["https://github.com/pybind/pybind11/archive/v2.9.0.tar.gz"],
+)
+load("@pybind11_bazel//:python_configure.bzl", "python_configure")
+python_configure(name = "local_config_python")
+# Transitive dependencies of Audio DSP.
+http_archive(
+    name = "eigen_archive",
+    build_file = "eigen.BUILD",
+    sha256 = "f3d69ac773ecaf3602cb940040390d4e71a501bb145ca9e01ce5464cf6d4eb68",
+    strip_prefix = "eigen-eigen-049af2f56331",
+    urls = [
+        "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz",
+        "https://bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz",
+    ],
+)
+http_archive(
+    name = "fft2d",
+    build_file = "fft2d.BUILD",
+    sha256 = "ada7e99087c4ed477bfdf11413f2ba8db8a840ba9bbf8ac94f4f3972e2a7cec9",
+    urls = [
+        "http://www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz",
+    ],
+)
+# Google logging
+git_repository(
+    name = "com_google_glog",
+    remote = "https://github.com/google/glog.git",
+    branch = "master"
+)
+# Dependency for glog
+git_repository(
+    name = "com_github_gflags_gflags",
+    remote = "https://github.com/mchinen/gflags.git",
+    branch = "android_linking_fix"
+)
+# Bazel/build rules
+http_archive(
+    name = "bazel_skylib",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
+    ],
+    sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44",
+)
+load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+bazel_skylib_workspace()
+http_archive(
+    name = "rules_android",
+    sha256 = "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806",
+    strip_prefix = "rules_android-0.1.1",
+    urls = ["https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip"],
+)
+# Google Maven Repository
+GMAVEN_TAG = "20180625-1"
+http_archive(
+    name = "gmaven_rules",
+    strip_prefix = "gmaven_rules-%s" % GMAVEN_TAG,
+    url = "https://github.com/bazelbuild/gmaven_rules/archive/%s.tar.gz" % GMAVEN_TAG,
+)
+load("@gmaven_rules//:gmaven.bzl", "gmaven_rules")
+gmaven_rules()

alphabet.txt ADDED Viewed

	@@ -0,0 +1,97 @@

+_
+■
+!
+,
+.
+:
+?
+a
+b
+c
+d
+e
+g
+h
+i
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+x
+y
+à
+á
+â
+ã
+è
+é
+ê
+ì
+í
+ò
+ó
+ô
+õ
+ù
+ú
+ý
+ă
+đ
+ĩ
+ũ
+ơ
+ư
+ạ
+ả
+ấ
+ầ
+ẩ
+ẫ
+ậ
+ắ
+ằ
+ẳ
+ẵ
+ặ
+ẹ
+ẻ
+ẽ
+ế
+ề
+ể
+ễ
+ệ
+ỉ
+ị
+ọ
+ỏ
+ố
+ồ
+ổ
+ỗ
+ộ
+ớ
+ờ
+ở
+ỡ
+ợ
+ụ
+ủ
+ứ
+ừ
+ử
+ữ
+ự
+ỳ
+ỵ
+ỷ
+ỹ

app.py ADDED Viewed

	@@ -0,0 +1,148 @@

+## build wavegru-cpp
+# import os
+# os.system("./bazelisk-linux-amd64 clean --expunge")
+# os.system("./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native")
+# install espeak
+import os
+import re
+import unicodedata
+import regex
+# Build the C++ extension on first start if the shared object is not present yet.
+if not os.path.isfile("./wavegru_mod.so"):
+    os.system("bash ./build_ext.sh")
+import gradio as gr
+from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel
+from wavegru_cpp import extract_weight_mask, load_wavegru_cpp
+# Load the text-to-mel (Tacotron) model together with its alphabet and config.
+alphabet, tacotron_net, tacotron_config = load_tacotron_model(
+    "./alphabet.txt", "./tacotron.toml", "./mono_tts_cbhg_small_0700000.ckpt"
+)
+# Load the WaveGRU vocoder network and its config.
+wavegru_config, wavegru_net = load_wavegru_net(
+    "./wavegru.yaml", "./wavegru_vocoder_tpu_gta_preemphasis_pruning_0800000.ckpt"
+)
+# Hand the vocoder weights/masks to the C++ implementation; the second argument is
+# the last entry of the configured upsample factors.
+wave_cpp_weight_mask = extract_weight_mask(wavegru_net)
+wavecpp = load_wavegru_cpp(
+    wave_cpp_weight_mask, wavegru_config["upsample_factors"][-1]
+)
+# Text-normalisation helpers. Only `digits` and `num_re` are referenced by the
+# functions below; the other patterns are not used in the visible part of this file.
+space_re = regex.compile(r"\s+")
+number_re = regex.compile("([0-9]+)")
+# Vietnamese words for the digits 0-9, indexed by digit value.
+digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
+# A run of digits and "."/"," separators that ends with a digit.
+num_re = regex.compile(r"([0-9.,]*[0-9])")
+alphabet_ = "aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵbcdđghklmnpqrstvx"
+keep_text_and_num_re = regex.compile(rf"[^\s{alphabet_}.,0-9]")
+keep_text_re = regex.compile(rf"[^\s{alphabet_}]")
+def read_number(num: str) -> str:
+    if len(num) == 1:
+        return digits[int(num)]
+    elif len(num) == 2 and num.isdigit():
+        n = int(num)
+        end = digits[n % 10]
+        if n == 10:
+            return "mười"
+        if n % 10 == 5:
+            end = "lăm"
+        if n % 10 == 0:
+            return digits[n // 10] + " mươi"
+        elif n < 20:
+            return "mười " + end
+        else:
+            if n % 10 == 1:
+                end = "mốt"
+            return digits[n // 10] + " mươi " + end
+    elif len(num) == 3 and num.isdigit():
+        n = int(num)
+        if n % 100 == 0:
+            return digits[n // 100] + " trăm"
+        elif num[1] == "0":
+            return digits[n // 100] + " trăm lẻ " + digits[n % 100]
+        else:
+            return digits[n // 100] + " trăm " + read_number(num[1:])
+    elif len(num) >= 4 and len(num) <= 6 and num.isdigit():
+        n = int(num)
+        n1 = n // 1000
+        return read_number(str(n1)) + " ngàn " + read_number(num[-3:])
+    elif "," in num:
+        n1, n2 = num.split(",")
+        return read_number(n1) + " phẩy " + read_number(n2)
+    elif "." in num:
+        parts = num.split(".")
+        if len(parts) == 2:
+            if parts[1] == "000":
+                return read_number(parts[0]) + " ngàn"
+            elif parts[1].startswith("00"):
+                end = digits[int(parts[1][2:])]
+                return read_number(parts[0]) + " ngàn lẻ " + end
+            else:
+                return read_number(parts[0]) + " ngàn " + read_number(parts[1])
+        elif len(parts) == 3:
+            return (
+                read_number(parts[0])
+                + " triệu "
+                + read_number(parts[1])
+                + " ngàn "
+                + read_number(parts[2])
+            )
+    return num
+def normalize_text(text):
+    """Normalise raw input text before it is converted to tokens.
+
+    Lowercases, applies Unicode NFKC normalisation, inserts a space after the
+    punctuation marks listed below, replaces numeric tokens with their spoken form
+    via `read_number`, and collapses whitespace.
+
+    Returns the normalised string.
+    """
+    # lowercase
+    text = text.lower()
+    # unicode normalize
+    text = unicodedata.normalize("NFKC", text)
+    # Put a space after each punctuation mark so it is separated from the next word.
+    # NOTE(review): because "." and "," get a trailing space here, before numbers are
+    # matched, `num_re` can no longer match across a separator; "1.000" reaches
+    # `read_number` as "1" and "000". Confirm whether that is intended.
+    text = text.replace(".", ". ")
+    text = text.replace(",", ", ")
+    text = text.replace(";", "; ")
+    text = text.replace(":", ": ")
+    text = text.replace("!", "! ")
+    text = text.replace("?", "? ")
+    text = text.replace("(", "( ")
+    # Surround every numeric run with spaces so it becomes its own word.
+    text = num_re.sub(r" \1 ", text)
+    words = text.split()
+    # Spell out the words that are entirely numeric; leave the rest untouched.
+    words = [read_number(w) if num_re.fullmatch(w) else w for w in words]
+    text = " ".join(words)
+    # remove redundant spaces
+    text = re.sub(r"\s+", " ", text)
+    # remove leading and trailing spaces
+    text = text.strip()
+    return text
+def speak(text):
+    """Gradio callback: synthesise speech for `text`.
+
+    Normalises the text, converts it to a mel spectrogram with the Tacotron model,
+    then to a waveform with the WaveGRU vocoder.
+
+    Returns a `(24_000, y)` tuple for the audio output; 24_000 is presumably the
+    sample rate in Hz — confirm against the vocoder config.
+    """
+    text = normalize_text(text)
+    mel = text_to_mel(tacotron_net, text, alphabet, tacotron_config)
+    y = mel_to_wav(wavegru_net, wavecpp, mel, wavegru_config)
+    return 24_000, y
+title = "WaveGRU-TTS"
+description = "WaveGRU text-to-speech demo."
+gr.Interface(
+    fn=speak,
+    inputs="text",
+    examples=[
+        "Trăm năm trong cõi người ta, chữ tài chữ mệnh khéo là ghét nhau.",
+        "Đoạn trường tân thanh, thường được biết đến với cái tên đơn giản là Truyện Kiều, là một truyện thơ của đại thi hào Nguyễn Du",
+        "Lục Vân Tiên quê ở huyện Đông Thành, khôi ngô tuấn tú, tài kiêm văn võ. Nghe tin triều đình mở khoa thi, Vân Tiên từ giã thầy xuống núi đua tài.",
+        "Lê Quý Đôn, tên thuở nhỏ là Lê Danh Phương, là vị quan thời Lê trung hưng, cũng là nhà thơ và được mệnh danh là nhà bác học l��n của Việt Nam trong thời phong kiến",
+        "Tất cả mọi người đều sinh ra có quyền bình đẳng. Tạo hóa cho họ những quyền không ai có thể xâm phạm được; trong những quyền ấy, có quyền được sống, quyền tự do và quyền mưu cầu hạnh phúc.",
+    ],
+    outputs="audio",
+    title=title,
+    description=description,
+    theme="default",
+    allow_screenshot=False,
+    allow_flagging="never",
+).launch(enable_queue=True)

build_ext.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+# Build the wavegru_mod Python extension with Bazel and copy it next to the sources.
+# Abort on the first failing step so a broken install or build is not followed by
+# copying a stale or missing artifact.
+set -e
+pip install -U pip
+pip install gradio==3.42.0
+USE_BAZEL_VERSION=5.0.0 ./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native
+cp -f bazel-bin/wavegru_mod.so .

extract_tacotrons_model.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import pickle
+dic = pickle.load(open("./mono_tts_cbhg_small_0700000.ckpt", "rb"))
+del dic["optim_state_dict"]
+pickle.dump(dic, open("./mono_tts_cbhg_small_0700000.ckpt", "wb"))

extract_wavegru_model.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import pickle
+dic = pickle.load(open("./wavegru_vocoder_tpu_gta_preemphasis_pruning_0800000.ckpt", "rb"))
+del dic["optim_state_dict"]
+pickle.dump(dic, open("./wavegru_vocoder_tpu_gta_preemphasis_pruning_0800000.ckpt", "wb"))

inference.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import os
+import jax
+import jax.numpy as jnp
+import librosa
+import numpy as np
+import pax
+# from text import english_cleaners
+from utils import (
+    create_tacotron_model,
+    load_tacotron_ckpt,
+    load_tacotron_config,
+    load_wavegru_ckpt,
+    load_wavegru_config,
+)
+from wavegru import WaveGRU
+# os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
+# from phonemizer.backend import EspeakBackend
+# backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
+def load_tacotron_model(alphabet_file, config_file, model_file):
+    """load tacotron model to memory
+
+    Args:
+        alphabet_file: UTF-8 text file with one symbol per line.
+        config_file: path passed to `load_tacotron_config`.
+        model_file: checkpoint path passed to `load_tacotron_ckpt`.
+
+    Returns:
+        `(alphabet, net, config)`: the list of symbols, the network in eval mode
+        after `jax.device_put`, and the loaded config.
+    """
+    with open(alphabet_file, "r", encoding="utf-8") as f:
+        # The position of a symbol in this list is used as its token id by text_to_mel.
+        alphabet = f.read().split("\n")
+    config = load_tacotron_config(config_file)
+    net = create_tacotron_model(config)
+    # Only the network is taken from the checkpoint; no optimizer is passed in.
+    _, net, _ = load_tacotron_ckpt(net, None, model_file)
+    net = net.eval()
+    net = jax.device_put(net)
+    return alphabet, net, config
+# Wrapper around `net.inference` created with `pax.pure`, with the decoding step
+# limit fixed at 2400.
+tacotron_inference_fn = pax.pure(lambda net, text: net.inference(text, max_len=2400))
+def text_to_mel(net, text, alphabet, config):
+    """convert text to mel spectrogram
+
+    Args:
+        net: the Tacotron network.
+        text: already-normalised input string.
+        alphabet: list of symbols; a symbol's index is its token id.
+        config: dict providing the "END_CHARACTER" and "PAD" symbols.
+
+    Returns:
+        The output of `tacotron_inference_fn` for a batch of one token sequence.
+    """
+    # text = english_cleaners(text)
+    # text = backend.phonemize([text], strip=True)[0]
+    text = text + config["END_CHARACTER"]
+    # Pad up to the next multiple of 100 characters (assuming PAD is a single
+    # character — TODO confirm); a full extra 100 is added when already aligned.
+    text = text + config["PAD"] * (100 - (len(text) % 100))
+    tokens = []
+    for c in text:
+        # Characters missing from the alphabet are silently dropped, so the token
+        # count can end up shorter than the padded text length.
+        if c in alphabet:
+            tokens.append(alphabet.index(c))
+    tokens = jnp.array(tokens, dtype=jnp.int32)
+    # tokens[None] adds the leading batch axis.
+    mel = tacotron_inference_fn(net, tokens[None])
+    return mel
+def load_wavegru_net(config_file, model_file):
+    """load wavegru to memory
+
+    Args:
+        config_file: path passed to `load_wavegru_config`.
+        model_file: checkpoint path passed to `load_wavegru_ckpt`.
+
+    Returns:
+        `(config, net)`: the loaded config and the WaveGRU network in eval mode
+        after `jax.device_put`.
+    """
+    config = load_wavegru_config(config_file)
+    # Architecture sizes come from the config; the linear output layer is always on.
+    net = WaveGRU(
+        mel_dim=config["mel_dim"],
+        rnn_dim=config["rnn_dim"],
+        upsample_factors=config["upsample_factors"],
+        has_linear_output=True,
+    )
+    # Only the network is taken from the checkpoint; no optimizer is passed in.
+    _, net, _ = load_wavegru_ckpt(net, None, model_file)
+    net = net.eval()
+    net = jax.device_put(net)
+    return config, net
+# Wrapper around `net.inference(mel, no_gru=True)` created with `pax.pure`; its
+# output is what the C++ vocoder consumes in mel_to_wav.
+wavegru_inference = pax.pure(lambda net, mel: net.inference(mel, no_gru=True))
+def mel_to_wav(net, netcpp, mel, config):
+    """convert mel to wav
+
+    Args:
+        net: the WaveGRU network, called with `no_gru=True`.
+        netcpp: the C++ WaveGRU object exposing `inference(ft, 1.0)`.
+        mel: mel spectrogram, 2-D or with a leading batch axis.
+        config: dict providing "num_pad_frames".
+
+    Returns:
+        The waveform as an `np.int16` array.
+    """
+    if len(mel.shape) == 2:
+        # Add the batch axis.
+        mel = mel[None]
+    # Replicate the edge frames on both sides of the second axis.
+    pad = config["num_pad_frames"] // 2 + 2
+    mel = np.pad(mel, [(0, 0), (pad, pad), (0, 0)], mode="edge")
+    ft = wavegru_inference(net, mel)
+    # Take the first batch item and move it to host memory for the C++ call.
+    ft = jax.device_get(ft[0])
+    wav = netcpp.inference(ft, 1.0)
+    wav = np.array(wav)
+    # Shift the samples by 127 and undo mu-law companding (mu=255); presumably the
+    # C++ vocoder emits 8-bit mu-law codes — TODO confirm.
+    wav = librosa.mu_expand(wav - 127, mu=255)
+    wav = librosa.effects.deemphasis(wav, coef=0.86)
+    # Apply gain, then scale down only if the peak exceeds 1.0.
+    wav = wav * 2.0
+    wav = wav / max(1.0, np.max(np.abs(wav)))
+    # Convert to the 16-bit integer range, clipping to avoid overflow at +1.0.
+    wav = wav * 2**15
+    wav = np.clip(wav, a_min=-(2**15), a_max=(2**15) - 1)
+    wav = wav.astype(np.int16)
+    return wav

mynumbers.py ADDED Viewed

	@@ -0,0 +1,73 @@

+""" from https://github.com/keithito/tacotron """
+import inflect
+import re
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
+_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
+_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
+_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
+_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
+_number_re = re.compile(r"[0-9]+")
+def _remove_commas(m):
+    """Regex callback: return group 1 of the match with all commas removed."""
+    return m.group(1).replace(",", "")
+def _expand_decimal_point(m):
+    """Regex callback: return group 1 of the match with "." replaced by " point "."""
+    return m.group(1).replace(".", " point ")
+def _expand_dollars(m):
+    """Regex callback: rewrite a dollar amount as "<n> dollar(s), <m> cent(s)".
+
+    Group 1 of the match is the amount without the "$" sign. The numbers are left
+    as digits in the returned string.
+    """
+    match = m.group(1)
+    parts = match.split(".")
+    if len(parts) > 2:
+        return match + " dollars"  # Unexpected format
+    # An empty part (e.g. ".50" or "5.") counts as zero.
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        return "%s %s" % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s" % (cents, cent_unit)
+    else:
+        return "zero dollars"
+def _expand_ordinal(m):
+    """Regex callback: pass the whole matched ordinal (e.g. "2nd") to inflect's
+    `number_to_words` and return the result."""
+    return _inflect.number_to_words(m.group(0))
+def _expand_number(m):
+    """Regex callback: spell out a plain integer in English words.
+
+    Values strictly between 1000 and 3000 get special handling (2000,
+    2001-2009, exact hundreds, and otherwise inflect's `group=2` mode with
+    zero read as "oh"); every other value goes through `number_to_words`
+    with the "and" word suppressed.
+    """
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return "two thousand"
+        elif num > 2000 and num < 2010:
+            return "two thousand " + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + " hundred"
+        else:
+            return _inflect.number_to_words(
+                num, andword="", zero="oh", group=2
+            ).replace(", ", " ")
+    else:
+        return _inflect.number_to_words(num, andword="")
+def normalize_numbers(text):
+    """Replace numeric expressions in `text` with English words and return the result.
+
+    The substitutions run in a fixed order: comma-grouped numbers, pounds, dollars,
+    decimals, ordinals, and finally any remaining plain integers, so digits left
+    behind by the earlier steps are spelled out by the last one.
+    """
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r"\1 pounds", text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text

packages.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+libsndfile1-dev
+make
+autoconf
+automake
+libtool
+gcc
+pkg-config

pooch.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# NOTE(review): this module presumably shadows the third-party `pooch` package with
+# no-op stand-ins so that importing it needs no real install — confirm.
+def os_cache(x):
+    """Return the argument unchanged."""
+    return x
+def create(*args, **kwargs):
+    """Return a dummy object whose `load_registry` accepts anything and does nothing.
+
+    All arguments are ignored.
+    """
+    class T:
+        def load_registry(self, *args, **kwargs):
+            return None
+    return T()

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+inflect
+jax
+jaxlib
+jinja2
+librosa
+numpy
+pax3
+pyyaml
+toml
+unidecode
+phonemizer
+gradio==3.42.0

tacotron.py ADDED Viewed

	@@ -0,0 +1,451 @@

+"""
+Tacotron + stepwise monotonic attention
+"""
+import jax
+import jax.numpy as jnp
+import pax
+def conv_block(in_ft, out_ft, kernel_size, activation_fn, use_dropout):
+    """
+    Conv >> LayerNorm >> activation >> Dropout
+
+    Returns a `pax.Sequential` made of a bias-free `pax.Conv1D` from `in_ft` to
+    `out_ft` channels and a `pax.LayerNorm` over `out_ft`. `activation_fn` is
+    appended when it is not None, and `pax.Dropout(0.5)` when `use_dropout` is true.
+    """
+    f = pax.Sequential(
+        pax.Conv1D(in_ft, out_ft, kernel_size, with_bias=False),
+        pax.LayerNorm(out_ft, -1, True, True),
+    )
+    # `>>=` appends a further stage to the sequential module.
+    if activation_fn is not None:
+        f >>= activation_fn
+    if use_dropout:
+        f >>= pax.Dropout(0.5)
+    return f
+class HighwayBlock(pax.Module):
+    """
+    Highway block
+
+    A single linear layer produces both a gate `t` and a candidate `h`; the output
+    is the gated mix `x * (1 - t) + h * t`, with the same last dimension as the input.
+    """
+    def __init__(self, dim: int) -> None:
+        super().__init__()
+        self.dim = dim
+        # One projection to 2 * dim, split into gate and candidate in __call__.
+        self.fc = pax.Linear(dim, 2 * dim)
+    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+        """Apply the highway transform along the last axis of `x`."""
+        t, h = jnp.split(self.fc(x), 2, axis=-1)
+        t = jax.nn.sigmoid(t - 1.0)  # bias toward keeping x
+        h = jax.nn.relu(h)
+        x = x * (1.0 - t) + h * t
+        return x
+class BiGRU(pax.Module):
+    """
+    Bidirectional GRU
+
+    Runs one GRU over the sequence as given and a second one over the sequence
+    flipped along axis 1, then concatenates both outputs on the last axis.
+    """
+    def __init__(self, dim):
+        super().__init__()
+        self.rnn_fwd = pax.GRU(dim, dim)
+        self.rnn_bwd = pax.GRU(dim, dim)
+    def __call__(self, x, reset_masks):
+        """Run both directions over `x`.
+
+        `reset_masks` is aligned with `x` along axis 1; wherever it is set, the
+        backward GRU state is replaced by its initial state after that step.
+        """
+        N = x.shape[0]
+        x_fwd = x
+        x_bwd = jnp.flip(x, axis=1)
+        x_fwd_states = self.rnn_fwd.initial_state(N)
+        x_bwd_states = self.rnn_bwd.initial_state(N)
+        x_fwd_states, x_fwd = pax.scan(
+            self.rnn_fwd, x_fwd_states, x_fwd, time_major=False
+        )
+        # The masks are flipped the same way as the backward input.
+        reset_masks = jnp.flip(reset_masks, axis=1)
+        x_bwd_states0 = x_bwd_states
+        def rnn_reset_core(prev, inputs):
+            # One backward step followed by a masked reset to the initial state.
+            x, reset_mask = inputs
+            def reset_state(x0, xt):
+                return jnp.where(reset_mask, x0, xt)
+            state, _ = self.rnn_bwd(prev, x)
+            state = jax.tree_map(reset_state, x_bwd_states0, state)
+            return state, state.hidden
+        x_bwd_states, x_bwd = pax.scan(
+            rnn_reset_core, x_bwd_states, (x_bwd, reset_masks), time_major=False
+        )
+        # Flip back so backward outputs line up with the forward ones.
+        x_bwd = jnp.flip(x_bwd, axis=1)
+        x = jnp.concatenate((x_fwd, x_bwd), axis=-1)
+        return x
+class CBHG(pax.Module):
+    """
+    Conv Bank >> Highway net >> GRU
+
+    The output has twice the input feature size, because the final BiGRU
+    concatenates its two directions.
+    """
+    def __init__(self, dim):
+        super().__init__()
+        # Bank of 16 conv blocks with kernel sizes 1..16.
+        self.convs = [conv_block(dim, dim, i, jax.nn.relu, False) for i in range(1, 17)]
+        # Two projections bring the concatenated bank (16 * dim) back to dim.
+        self.conv_projection_1 = conv_block(16 * dim, dim, 3, jax.nn.relu, False)
+        self.conv_projection_2 = conv_block(dim, dim, 3, None, False)
+        self.highway = pax.Sequential(
+            HighwayBlock(dim), HighwayBlock(dim), HighwayBlock(dim), HighwayBlock(dim)
+        )
+        self.rnn = BiGRU(dim)
+    def __call__(self, x, x_mask):
+        """Encode `x`; `x_mask` is multiplied into the activations at each stage and
+        `1 - x_mask` is passed to the BiGRU as its reset mask."""
+        conv_input = x * x_mask
+        fts = [f(conv_input) for f in self.convs]
+        residual = jnp.concatenate(fts, axis=-1)
+        residual = pax.max_pool(residual, 2, 1, "SAME", -1)
+        residual = self.conv_projection_1(residual * x_mask)
+        residual = self.conv_projection_2(residual * x_mask)
+        # Residual connection around the conv bank and projections.
+        x = x + residual
+        x = self.highway(x)
+        x = self.rnn(x * x_mask, reset_masks=1 - x_mask)
+        return x * x_mask
+class PreNet(pax.Module):
+    """
+    Linear >> relu >> dropout >> Linear >> relu >> dropout
+
+    With `always_dropout=True` (the default) dropout is applied regardless of the
+    training flag; otherwise only while `self.training` is set.
+    """
+    def __init__(self, input_dim, hidden_dim, output_dim, always_dropout=True):
+        super().__init__()
+        self.fc1 = pax.Linear(input_dim, hidden_dim)
+        self.fc2 = pax.Linear(hidden_dim, output_dim)
+        # Source of dropout keys when the caller does not supply them.
+        self.rng_seq = pax.RngSeq()
+        self.always_dropout = always_dropout
+    def __call__(self, x, k1=None, k2=None):
+        """Apply the pre-net to `x`.
+
+        `k1` and `k2` are optional random keys for the first and second dropout;
+        when one is None, a key is drawn from the module's own `rng_seq`.
+        """
+        x = self.fc1(x)
+        x = jax.nn.relu(x)
+        if self.always_dropout or self.training:
+            if k1 is None:
+                k1 = self.rng_seq.next_rng_key()
+            x = pax.dropout(k1, 0.5, x)
+        x = self.fc2(x)
+        x = jax.nn.relu(x)
+        if self.always_dropout or self.training:
+            if k2 is None:
+                k2 = self.rng_seq.next_rng_key()
+            x = pax.dropout(k2, 0.5, x)
+        return x
+class Tacotron(pax.Module):
+    """
+    Tacotron TTS model.
+    It uses stepwise monotonic attention for robust attention.
+    """
+    def __init__(
+        self,
+        mel_dim: int,
+        attn_bias,
+        rr,
+        max_rr,
+        mel_min,
+        sigmoid_noise,
+        pad_token,
+        prenet_dim,
+        attn_hidden_dim,
+        attn_rnn_dim,
+        rnn_dim,
+        postnet_dim,
+        text_dim,
+    ):
+        """
+        New Tacotron model
+        Args:
+            mel_dim (int): dimension of log mel-spectrogram features.
+            attn_bias (float): control how "slow" the attention will
+                move forward at initialization.
+            rr (int): the reduction factor.
+                Number of predicted frame at each time step. Default is 2.
+            max_rr (int): max value of rr.
+            mel_min (float): the minimum value of mel features.
+                The <go> frame is filled by `log(mel_min)` values.
+            sigmoid_noise (float): the variance of gaussian noise added
+                to attention scores in training.
+            pad_token (int): the pad value at the end of text sequences.
+            prenet_dim (int): dimension of prenet output.
+            attn_hidden_dim (int): dimension of attention hidden vectors.
+            attn_rnn_dim (int): number of cells in the attention RNN.
+            rnn_dim (int): number of cells in the decoder RNNs.
+            postnet_dim (int): number of features in the postnet convolutions.
+            text_dim (int): dimension of text embedding vectors.
+        """
+        super().__init__()
+        self.text_dim = text_dim
+        assert rr <= max_rr
+        self.rr = rr
+        self.max_rr = max_rr
+        self.mel_dim = mel_dim
+        self.mel_min = mel_min
+        self.sigmoid_noise = sigmoid_noise
+        self.pad_token = pad_token
+        self.prenet_dim = prenet_dim
+        # encoder submodules
+        # The embedding table has 256 entries, so token ids must be below 256.
+        self.encoder_embed = pax.Embed(256, text_dim)
+        self.encoder_pre_net = PreNet(text_dim, 256, prenet_dim, always_dropout=True)
+        self.encoder_cbhg = CBHG(prenet_dim)
+        # random key generator
+        self.rng_seq = pax.RngSeq()
+        # pre-net
+        self.decoder_pre_net = PreNet(mel_dim, 256, prenet_dim, always_dropout=True)
+        # decoder submodules
+        # Attention RNN input = pre-net output (prenet_dim) + attention context
+        # (2 * prenet_dim, the width of the CBHG encoder output).
+        self.attn_rnn = pax.LSTM(prenet_dim + prenet_dim * 2, attn_rnn_dim)
+        self.text_key_fc = pax.Linear(prenet_dim * 2, attn_hidden_dim, with_bias=True)
+        self.attn_query_fc = pax.Linear(attn_rnn_dim, attn_hidden_dim, with_bias=False)
+        self.attn_V = pax.Linear(attn_hidden_dim, 1, with_bias=False)
+        # Scale and bias applied to the attention score in monotonic_attention;
+        # both are registered as parameters right after __init__.
+        self.attn_V_weight_norm = jnp.array(1.0 / jnp.sqrt(attn_hidden_dim))
+        self.attn_V_bias = jnp.array(attn_bias)
+        self.attn_log = jnp.zeros((1,))
+        self.decoder_input = pax.Linear(attn_rnn_dim + 2 * prenet_dim, rnn_dim)
+        self.decoder_rnn1 = pax.LSTM(rnn_dim, rnn_dim)
+        self.decoder_rnn2 = pax.LSTM(rnn_dim, rnn_dim)
+        # mel + end-of-sequence token
+        self.output_fc = pax.Linear(rnn_dim, (mel_dim + 1) * max_rr, with_bias=True)
+        # post-net
+        self.post_net = pax.Sequential(
+            conv_block(mel_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, mel_dim, 5, None, True),
+        )
+    parameters = pax.parameters_method("attn_V_weight_norm", "attn_V_bias")
+    def encode_text(self, text: jnp.ndarray) -> jnp.ndarray:
+        """
+        Encode text to a sequence of real vectors
+
+        `text` is a 2-D array of token ids. Positions equal to `self.pad_token`
+        are masked out inside the CBHG encoder.
+        """
+        N, L = text.shape
+        # True for real tokens, False for padding; trailing axis added for broadcasting.
+        text_mask = (text != self.pad_token)[..., None]
+        x = self.encoder_embed(text)
+        x = self.encoder_pre_net(x)
+        x = self.encoder_cbhg(x, text_mask)
+        return x
+    def go_frame(self, batch_size: int) -> jnp.ndarray:
+        """
+        return the go frame
+
+        The result has shape `(batch_size, mel_dim)` and every entry equals
+        `log(mel_min)`.
+        """
+        return jnp.ones((batch_size, self.mel_dim)) * jnp.log(self.mel_min)
+    def decoder_initial_state(self, N: int, L: int):
+        """
+        setup decoder initial state
+
+        Args:
+            N: batch size.
+            L: number of text positions.
+
+        Returns:
+            `(attn_state, decoder_rnn_states)` where `attn_state` is
+            `(attention RNN state, attention context, attention probabilities)`
+            and `decoder_rnn_states` holds the initial states of both decoder LSTMs.
+        """
+        attn_context = jnp.zeros((N, self.prenet_dim * 2))
+        # All attention mass starts on the first text position.
+        attn_pr = jax.nn.one_hot(
+            jnp.zeros((N,), dtype=jnp.int32), num_classes=L, axis=-1
+        )
+        attn_state = (self.attn_rnn.initial_state(N), attn_context, attn_pr)
+        decoder_rnn_states = (
+            self.decoder_rnn1.initial_state(N),
+            self.decoder_rnn2.initial_state(N),
+        )
+        return attn_state, decoder_rnn_states
+    def monotonic_attention(self, prev_state, inputs, envs):
+        """
+        Stepwise monotonic attention
+
+        Args:
+            prev_state: `(attn_rnn_state, attn_context, prev_attn_pr)`.
+            inputs: `(x, attn_rng_key)`, the step input and the key for score noise.
+            envs: `(text, text_key)`, the encoded text and its key projection.
+
+        Returns:
+            `(new_state, attn_rnn_output)` with `new_state` laid out like `prev_state`.
+        """
+        attn_rnn_state, attn_context, prev_attn_pr = prev_state
+        x, attn_rng_key = inputs
+        text, text_key = envs
+        attn_rnn_input = jnp.concatenate((x, attn_context), axis=-1)
+        attn_rnn_state, attn_rnn_output = self.attn_rnn(attn_rnn_state, attn_rnn_input)
+        attn_query_input = attn_rnn_output
+        attn_query = self.attn_query_fc(attn_query_input)
+        # Additive attention: the query is broadcast over the text positions.
+        attn_hidden = jnp.tanh(attn_query[:, None, :] + text_key)
+        score = self.attn_V(attn_hidden)
+        score = jnp.squeeze(score, axis=-1)
+        # Rescale the score by attn_V_weight_norm / ||attn_V.weight||, then add the bias.
+        weight_norm = jnp.linalg.norm(self.attn_V.weight)
+        score = score * (self.attn_V_weight_norm / weight_norm)
+        score = score + self.attn_V_bias
+        noise = jax.random.normal(attn_rng_key, score.shape) * self.sigmoid_noise
+        # Per-position probability of staying put versus moving one step forward.
+        pr_stay = jax.nn.sigmoid(score + noise)
+        pr_move = 1.0 - pr_stay
+        pr_new_location = pr_move * prev_attn_pr
+        # Shift the moved mass one position to the right; mass leaving the last
+        # position is dropped and position 0 receives none.
+        pr_new_location = jnp.pad(
+            pr_new_location[:, :-1], ((0, 0), (1, 0)), constant_values=0
+        )
+        attn_pr = pr_stay * prev_attn_pr + pr_new_location
+        # Context = attention-weighted sum of the encoded text.
+        attn_context = jnp.einsum("NL,NLD->ND", attn_pr, text)
+        new_state = (attn_rnn_state, attn_context, attn_pr)
+        return new_state, attn_rnn_output
+    def zoneout_lstm(self, lstm_core, rng_key, zoneout_pr=0.1):
+        """
+        Return a zoneout lstm core.
+        It will zoneout the new hidden states and keep the new cell states unchanged.
+
+        Args:
+            lstm_core: the LSTM to wrap, called as `lstm_core(state, x)`.
+            rng_key: key used to sample the zoneout mask.
+            zoneout_pr: probability that a hidden unit keeps its previous value.
+        """
+        def core(state, x):
+            new_state, _ = lstm_core(state, x)
+            h_old = state.hidden
+            h_new = new_state.hidden
+            # Where the mask is set the old hidden value is kept, elsewhere the new one.
+            mask = jax.random.bernoulli(rng_key, zoneout_pr, h_old.shape)
+            h_new = h_old * mask + h_new * (1.0 - mask)
+            return pax.LSTMState(h_new, new_state.cell), h_new
+        return core
+    def decoder_step(
+        self,
+        attn_state,
+        decoder_rnn_states,
+        rng_key,
+        mel,
+        text,
+        text_key,
+        call_pre_net=False,
+    ):
+        """
+        One decoder step
+
+        Args:
+            attn_state: attention state as returned by `decoder_initial_state`.
+            decoder_rnn_states: states of the two decoder LSTMs.
+            rng_key: random key; it is split for dropout, zoneout and attention noise.
+            mel: step input; passed through the decoder pre-net when
+                `call_pre_net` is true, otherwise used as is.
+            text: encoded text.
+            text_key: key projection of the encoded text.
+            call_pre_net: whether to apply `decoder_pre_net` to `mel` here.
+
+        Returns:
+            `(attn_state, decoder_rnn_states, rng_key_next, x, attn_pr[0])` where
+            `x` is the decoder output before `output_fc` and `attn_pr[0]` is the
+            attention distribution of the first batch item.
+        """
+        if call_pre_net:
+            k1, k2, zk1, zk2, rng_key, rng_key_next = jax.random.split(rng_key, 6)
+            mel = self.decoder_pre_net(mel, k1, k2)
+        else:
+            zk1, zk2, rng_key, rng_key_next = jax.random.split(rng_key, 4)
+        attn_inputs = (mel, rng_key)
+        attn_envs = (text, text_key)
+        attn_state, attn_rnn_output = self.monotonic_attention(
+            attn_state, attn_inputs, attn_envs
+        )
+        (_, attn_context, attn_pr) = attn_state
+        (decoder_rnn_state1, decoder_rnn_state2) = decoder_rnn_states
+        decoder_rnn1_input = jnp.concatenate((attn_rnn_output, attn_context), axis=-1)
+        decoder_rnn1_input = self.decoder_input(decoder_rnn1_input)
+        decoder_rnn1 = self.zoneout_lstm(self.decoder_rnn1, zk1)
+        decoder_rnn_state1, decoder_rnn_output1 = decoder_rnn1(
+            decoder_rnn_state1, decoder_rnn1_input
+        )
+        # Residual connections: each LSTM sees the sum of everything before it.
+        decoder_rnn2_input = decoder_rnn1_input + decoder_rnn_output1
+        decoder_rnn2 = self.zoneout_lstm(self.decoder_rnn2, zk2)
+        decoder_rnn_state2, decoder_rnn_output2 = decoder_rnn2(
+            decoder_rnn_state2, decoder_rnn2_input
+        )
+        x = decoder_rnn1_input + decoder_rnn_output1 + decoder_rnn_output2
+        decoder_rnn_states = (decoder_rnn_state1, decoder_rnn_state2)
+        return attn_state, decoder_rnn_states, rng_key_next, x, attn_pr[0]
+    @jax.jit
+    def inference_step(
+        self, attn_state, decoder_rnn_states, rng_key, mel, text, text_key
+    ):
+        """one inference step
+
+        Runs `decoder_step` with the pre-net enabled, projects the result to mel
+        frames plus an end-of-sequence logit, and samples the stop decision.
+
+        Returns:
+            `(attn_state, decoder_rnn_states, rng_key, (mel, eos))` where `mel`
+            holds the `self.rr` frames predicted in this step and `eos` is the
+            sampled stop flag.
+        """
+        attn_state, decoder_rnn_states, rng_key, x, _ = self.decoder_step(
+            attn_state,
+            decoder_rnn_states,
+            rng_key,
+            mel,
+            text,
+            text_key,
+            call_pre_net=True,
+        )
+        x = self.output_fc(x)
+        N, D2 = x.shape
+        # output_fc emits max_rr frames per step; keep only the first rr of them.
+        x = jnp.reshape(x, (N, self.max_rr, D2 // self.max_rr))
+        x = x[:, : self.rr, :]
+        x = jnp.reshape(x, (N, self.rr, -1))
+        # The last channel of each frame is the end-of-sequence logit.
+        mel = x[..., :-1]
+        eos_logit = x[..., -1]
+        # Stop probability comes from the last frame of the first batch item;
+        # values below 0.1 are zeroed so they can never trigger a stop.
+        eos_pr = jax.nn.sigmoid(eos_logit[0, -1])
+        eos_pr = jnp.where(eos_pr < 0.1, 0.0, eos_pr)
+        rng_key, eos_rng_key = jax.random.split(rng_key)
+        eos = jax.random.bernoulli(eos_rng_key, p=eos_pr)
+        return attn_state, decoder_rnn_states, rng_key, (mel, eos)
+    def inference(self, text, seed=42, max_len=1000):
+        """
+        text to mel
+        """
+        text = self.encode_text(text)
+        text_key = self.text_key_fc(text)
+        N, L, D = text.shape
+        assert N == 1
+        mel = self.go_frame(N)
+        attn_state, decoder_rnn_states = self.decoder_initial_state(N, L)
+        rng_key = jax.random.PRNGKey(seed)
+        mels = []
+        count = 0
+        while True:
+            count = count + 1
+            attn_state, decoder_rnn_states, rng_key, (mel, eos) = self.inference_step(
+                attn_state, decoder_rnn_states, rng_key, mel, text, text_key
+            )
+            mels.append(mel)
+            if eos.item() or count > max_len:
+                break
+            mel = mel[:, -1, :]
+        mels = jnp.concatenate(mels, axis=1)
+        mel = mel + self.post_net(mel)
+        return mels
+    def decode(self, mel, text):
+        """
+        Attention mechanism + Decoder
+        """
+        text_key = self.text_key_fc(text)
+        def scan_fn(prev_states, inputs):
+            attn_state, decoder_rnn_states = prev_states
+            x, rng_key = inputs
+            attn_state, decoder_rnn_states, _, output, attn_pr = self.decoder_step(
+                attn_state, decoder_rnn_states, rng_key, x, text, text_key
+            )
+            states = (attn_state, decoder_rnn_states)
+            return states, (output, attn_pr)
+        N, L, D = text.shape
+        decoder_states = self.decoder_initial_state(N, L)
+        rng_keys = self.rng_seq.next_rng_key(mel.shape[1])
+        rng_keys = jnp.stack(rng_keys, axis=1)
+        decoder_states, (x, attn_log) = pax.scan(
+            scan_fn,
+            decoder_states,
+            (mel, rng_keys),
+            time_major=False,
+        )
+        self.attn_log = attn_log
+        del decoder_states
+        x = self.output_fc(x)
+        N, T2, D2 = x.shape
+        x = jnp.reshape(x, (N, T2, self.max_rr, D2 // self.max_rr))
+        x = x[:, :, : self.rr, :]
+        x = jnp.reshape(x, (N, T2 * self.rr, -1))
+        mel = x[..., :-1]
+        eos = x[..., -1]
+        return mel, eos
+    def __call__(self, mel: jnp.ndarray, text: jnp.ndarray):
+        text = self.encode_text(text)
+        mel = self.decoder_pre_net(mel)
+        mel, eos = self.decode(mel, text)
+        return mel, mel + self.post_net(mel), eos

tacotron.toml ADDED Viewed

	@@ -0,0 +1,32 @@

+[tacotron]
+# training
+BATCH_SIZE = 64
+LR=1024e-6 # learning rate
+MODEL_PREFIX = "mono_tts_cbhg_small"
+LOG_DIR = "./logs"
+CKPT_DIR = "./ckpts"
+USE_MP = false  # use mixed-precision training
+# data
+TF_DATA_DIR = "./tf_data" # tensorflow data directory
+TF_GTA_DATA_DIR = "./tf_gta_data" # tf gta data directory
+SAMPLE_RATE = 24000 # convert to this sample rate if needed
+MEL_DIM = 80 # the dimension of melspectrogram features
+MEL_MIN = 1e-5
+PAD = "_" # padding character
+PAD_TOKEN = 0
+END_CHARACTER = "■"  # to signal the end of the transcript
+TEST_DATA_SIZE = 1024
+# model
+RR = 1 # reduction factor
+MAX_RR=2
+ATTN_BIAS = 0.0 # control how slow the attention moves forward
+SIGMOID_NOISE = 2.0
+PRENET_DIM = 128
+TEXT_DIM = 256
+RNN_DIM = 512
+ATTN_RNN_DIM = 256
+ATTN_HIDDEN_DIM = 128
+POSTNET_DIM = 512

text.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# """ from https://github.com/keithito/tacotron """
+# """
+# Cleaners are transformations that run over the input text at both training and eval time.
+# Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+# hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+#   1. "english_cleaners" for English text
+#   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+#      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+#   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+#      the symbols in symbols.py to match your data).
+# """
+# import re
+# from mynumbers import normalize_numbers
+# from unidecode import unidecode
+# # Regular expression matching whitespace:
+# _whitespace_re = re.compile(r"\s+")
+# # List of (regular expression, replacement) pairs for abbreviations:
+# _abbreviations = [
+#     (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+#     for x in [
+#         ("mrs", "misess"),
+#         ("mr", "mister"),
+#         ("dr", "doctor"),
+#         ("st", "saint"),
+#         ("co", "company"),
+#         ("jr", "junior"),
+#         ("maj", "major"),
+#         ("gen", "general"),
+#         ("drs", "doctors"),
+#         ("rev", "reverend"),
+#         ("lt", "lieutenant"),
+#         ("hon", "honorable"),
+#         ("sgt", "sergeant"),
+#         ("capt", "captain"),
+#         ("esq", "esquire"),
+#         ("ltd", "limited"),
+#         ("col", "colonel"),
+#         ("ft", "fort"),
+#     ]
+# ]
+# def expand_abbreviations(text):
+#     for regex, replacement in _abbreviations:
+#         text = re.sub(regex, replacement, text)
+#     return text
+# def expand_numbers(text):
+#     return normalize_numbers(text)
+# def lowercase(text):
+#     return text.lower()
+# def collapse_whitespace(text):
+#     return re.sub(_whitespace_re, " ", text)
+# def convert_to_ascii(text):
+#     return unidecode(text)
+# def basic_cleaners(text):
+#     """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+#     text = lowercase(text)
+#     text = collapse_whitespace(text)
+#     return text
+# def transliteration_cleaners(text):
+#     """Pipeline for non-English text that transliterates to ASCII."""
+#     text = convert_to_ascii(text)
+#     text = lowercase(text)
+#     text = collapse_whitespace(text)
+#     return text
+# def english_cleaners(text):
+#     """Pipeline for English text, including number and abbreviation expansion."""
+#     text = convert_to_ascii(text)
+#     text = lowercase(text)
+#     text = expand_numbers(text)
+#     text = expand_abbreviations(text)
+#     text = collapse_whitespace(text)
+#     return text

utils.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""
+Utility functions
+"""
+import pickle
+from pathlib import Path
+import pax
+import toml
+import yaml
+from tacotron import Tacotron
+def load_tacotron_config(config_file=Path("tacotron.toml")):
+    """
+    Load the project configurations
+    """
+    return toml.load(config_file)["tacotron"]
+def load_tacotron_ckpt(net: pax.Module, optim: pax.Module, path):
+    """
+    load checkpoint from disk
+    """
+    with open(path, "rb") as f:
+        dic = pickle.load(f)
+    if net is not None:
+        net = net.load_state_dict(dic["model_state_dict"])
+    if optim is not None:
+        optim = optim.load_state_dict(dic["optim_state_dict"])
+    return dic["step"], net, optim
+def create_tacotron_model(config):
+    """
+    return a random initialized Tacotron model
+    """
+    return Tacotron(
+        mel_dim=config["MEL_DIM"],
+        attn_bias=config["ATTN_BIAS"],
+        rr=config["RR"],
+        max_rr=config["MAX_RR"],
+        mel_min=config["MEL_MIN"],
+        sigmoid_noise=config["SIGMOID_NOISE"],
+        pad_token=config["PAD_TOKEN"],
+        prenet_dim=config["PRENET_DIM"],
+        attn_hidden_dim=config["ATTN_HIDDEN_DIM"],
+        attn_rnn_dim=config["ATTN_RNN_DIM"],
+        rnn_dim=config["RNN_DIM"],
+        postnet_dim=config["POSTNET_DIM"],
+        text_dim=config["TEXT_DIM"],
+    )
+def load_wavegru_config(config_file):
+    """
+    Load project configurations
+    """
+    with open(config_file, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+def load_wavegru_ckpt(net, optim, ckpt_file):
+    """
+    load training checkpoint from file
+    """
+    with open(ckpt_file, "rb") as f:
+        dic = pickle.load(f)
+    if net is not None:
+        net = net.load_state_dict(dic["net_state_dict"])
+    if optim is not None:
+        optim = optim.load_state_dict(dic["optim_state_dict"])
+    return dic["step"], net, optim

wavegru.py ADDED Viewed

	@@ -0,0 +1,300 @@

+"""
+WaveGRU model: melspectrogram => mu-law encoded waveform
+"""
+from typing import Tuple
+import jax
+import jax.numpy as jnp
+import pax
+from pax import GRUState
+from tqdm.cli import tqdm
+class ReLU(pax.Module):
+    def __call__(self, x):
+        return jax.nn.relu(x)
+def dilated_residual_conv_block(dim, kernel, stride, dilation):
+    """
+    Use dilated convs to enlarge the receptive field
+    """
+    return pax.Sequential(
+        pax.Conv1D(dim, dim, kernel, stride, dilation, "VALID", with_bias=False),
+        pax.LayerNorm(dim, -1, True, True),
+        ReLU(),
+        pax.Conv1D(dim, dim, 1, 1, 1, "VALID", with_bias=False),
+        pax.LayerNorm(dim, -1, True, True),
+        ReLU(),
+    )
+def tile_1d(x, factor):
+    """
+    Tile tensor of shape N, L, D into N, L*factor, D
+    """
+    N, L, D = x.shape
+    x = x[:, :, None, :]
+    x = jnp.tile(x, (1, 1, factor, 1))
+    x = jnp.reshape(x, (N, L * factor, D))
+    return x
+def up_block(in_dim, out_dim, factor, relu=True):
+    """
+    Tile >> Conv >> BatchNorm >> ReLU
+    """
+    f = pax.Sequential(
+        lambda x: tile_1d(x, factor),
+        pax.Conv1D(
+            in_dim, out_dim, 2 * factor, stride=1, padding="VALID", with_bias=False
+        ),
+        pax.LayerNorm(out_dim, -1, True, True),
+    )
+    if relu:
+        f >>= ReLU()
+    return f
+class Upsample(pax.Module):
+    """
+    Upsample melspectrogram to match raw audio sample rate.
+
+    Pipeline: input conv, five dilated residual conv blocks, one ``up_block``
+    per factor in ``upsample_factors[:-1]``, an optional linear layer, and a
+    final plain tiling by ``upsample_factors[-1]``.
+    """
+    def __init__(
+        self, input_dim, hidden_dim, rnn_dim, upsample_factors, has_linear_output=False
+    ):
+        super().__init__()
+        # Width-1 conv (no bias) + LayerNorm mapping input_dim -> hidden_dim.
+        self.input_conv = pax.Sequential(
+            pax.Conv1D(input_dim, hidden_dim, 1, with_bias=False),
+            pax.LayerNorm(hidden_dim, -1, True, True),
+        )
+        self.upsample_factors = upsample_factors
+        # Kernel-3 blocks with dilations 1, 2, 4, 8, 16.
+        self.dilated_convs = [
+            dilated_residual_conv_block(hidden_dim, 3, 1, 2**i) for i in range(5)
+        ]
+        # All factors but the last get a learned up_block; the last factor is
+        # applied by tiling only (see final_tile).
+        self.up_factors = upsample_factors[:-1]
+        self.up_blocks = [
+            up_block(hidden_dim, hidden_dim, x) for x in self.up_factors[:-1]
+        ]
+        # The last up_block has no ReLU. Without a linear output layer it
+        # produces 3 * rnn_dim channels directly.
+        self.up_blocks.append(
+            up_block(
+                hidden_dim,
+                hidden_dim if has_linear_output else 3 * rnn_dim,
+                self.up_factors[-1],
+                relu=False,
+            )
+        )
+        if has_linear_output:
+            self.x2zrh_fc = pax.Linear(hidden_dim, rnn_dim * 3)
+        self.has_linear_output = has_linear_output
+        self.final_tile = upsample_factors[-1]
+    def __call__(self, x, no_repeat=False):
+        """
+        Upsample ``x``; with ``no_repeat=True`` the final tiling by
+        ``final_tile`` is skipped and the un-tiled features are returned.
+        """
+        x = self.input_conv(x)
+        for residual in self.dilated_convs:
+            y = residual(x)
+            # "VALID" convs shorten the time axis, so crop x symmetrically to
+            # y's length before the residual add.
+            # NOTE(review): relies on pad >= 1; `-0` would give an empty slice.
+            pad = (x.shape[1] - y.shape[1]) // 2
+            x = x[:, pad:-pad, :] + y
+        for f in self.up_blocks:
+            x = f(x)
+        if self.has_linear_output:
+            x = self.x2zrh_fc(x)
+        if no_repeat:
+            return x
+        x = tile_1d(x, self.final_tile)
+        return x
+class GRU(pax.Module):
+    """
+    A customized GRU module.
+
+    Only the hidden-to-gates projection lives in this module; the input ``x``
+    given to ``__call__`` is used directly as the input-side contribution to
+    the z, r and candidate pre-activations (last axis of size 3 * hidden_dim).
+    """
+    # NOTE(review): input_dim is declared here but never assigned in __init__.
+    input_dim: int
+    hidden_dim: int
+    def __init__(self, hidden_dim: int):
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        # Projects the hidden state to the z, r and candidate pre-activations.
+        self.h_zrh_fc = pax.Linear(
+            hidden_dim,
+            hidden_dim * 3,
+            w_init=jax.nn.initializers.variance_scaling(
+                1, "fan_out", "truncated_normal"
+            ),
+        )
+    def initial_state(self, batch_size: int) -> GRUState:
+        """Create an all zeros initial state."""
+        return GRUState(jnp.zeros((batch_size, self.hidden_dim), dtype=jnp.float32))
+    def __call__(self, state: GRUState, x) -> Tuple[GRUState, jnp.ndarray]:
+        """Run one GRU step; returns the new state and the new hidden vector."""
+        hidden = state.hidden
+        x_zrh = x
+        h_zrh = self.h_zrh_fc(hidden)
+        # Split off the first 2 * hidden_dim entries (z and r) from the rest.
+        x_zr, x_h = jnp.split(x_zrh, [2 * self.hidden_dim], axis=-1)
+        h_zr, h_h = jnp.split(h_zrh, [2 * self.hidden_dim], axis=-1)
+        zr = x_zr + h_zr
+        zr = jax.nn.sigmoid(zr)
+        z, r = jnp.split(zr, 2, axis=-1)
+        # r scales only the hidden-side part of the candidate.
+        h_hat = x_h + r * h_h
+        h_hat = jnp.tanh(h_hat)
+        # z weights the candidate, (1 - z) the previous hidden state.
+        h = (1 - z) * hidden + z * h_hat
+        return GRUState(h), h
+class Pruner(pax.Module):
+    """
+    Base class for pruners
+    """
+    def compute_sparsity(self, step):
+        t = jnp.power(1 - (step * 1.0 - 1_000) / 200_000, 3)
+        z = 0.95 * jnp.clip(1.0 - t, a_min=0, a_max=1)
+        return z
+    def prune(self, step, weights):
+        """
+        Return a mask
+        """
+        z = self.compute_sparsity(step)
+        x = weights
+        H, W = x.shape
+        x = x.reshape(H // 4, 4, W // 4, 4)
+        x = jnp.abs(x)
+        x = jnp.sum(x, axis=(1, 3), keepdims=True)
+        q = jnp.quantile(jnp.reshape(x, (-1,)), z)
+        x = x >= q
+        x = jnp.tile(x, (1, 4, 1, 4))
+        x = jnp.reshape(x, (H, W))
+        return x
+class GRUPruner(Pruner):
+    def __init__(self, gru):
+        super().__init__()
+        self.h_zrh_fc_mask = jnp.ones_like(gru.h_zrh_fc.weight) == 1
+    def __call__(self, gru: pax.GRU):
+        """
+        Apply mask after an optimization step
+        """
+        zrh_masked_weights = jnp.where(self.h_zrh_fc_mask, gru.h_zrh_fc.weight, 0)
+        gru = gru.replace_node(gru.h_zrh_fc.weight, zrh_masked_weights)
+        return gru
+    def update_mask(self, step, gru: pax.GRU):
+        """
+        Update internal masks
+        """
+        z_weight, r_weight, h_weight = jnp.split(gru.h_zrh_fc.weight, 3, axis=1)
+        z_mask = self.prune(step, z_weight)
+        r_mask = self.prune(step, r_weight)
+        h_mask = self.prune(step, h_weight)
+        self.h_zrh_fc_mask *= jnp.concatenate((z_mask, r_mask, h_mask), axis=1)
+class LinearPruner(Pruner):
+    def __init__(self, linear):
+        super().__init__()
+        self.mask = jnp.ones_like(linear.weight) == 1
+    def __call__(self, linear: pax.Linear):
+        """
+        Apply mask after an optimization step
+        """
+        return linear.replace(weight=jnp.where(self.mask, linear.weight, 0))
+    def update_mask(self, step, linear: pax.Linear):
+        """
+        Update internal masks
+        """
+        self.mask *= self.prune(step, linear.weight)
+class WaveGRU(pax.Module):
+    """
+    WaveGRU vocoder model.
+    """
+    def __init__(
+        self,
+        mel_dim=80,
+        rnn_dim=1024,
+        upsample_factors=(5, 3, 20),
+        has_linear_output=False,
+    ):
+        super().__init__()
+        self.embed = pax.Embed(256, 3 * rnn_dim)
+        self.upsample = Upsample(
+            input_dim=mel_dim,
+            hidden_dim=512,
+            rnn_dim=rnn_dim,
+            upsample_factors=upsample_factors,
+            has_linear_output=has_linear_output,
+        )
+        self.rnn = GRU(rnn_dim)
+        self.o1 = pax.Linear(rnn_dim, rnn_dim)
+        self.o2 = pax.Linear(rnn_dim, 256)
+        self.gru_pruner = GRUPruner(self.rnn)
+        self.o1_pruner = LinearPruner(self.o1)
+        self.o2_pruner = LinearPruner(self.o2)
+    def output(self, x):
+        x = self.o1(x)
+        x = jax.nn.relu(x)
+        x = self.o2(x)
+        return x
+    def inference(self, mel, no_gru=False, seed=42):
+        """
+        generate waveform form melspectrogram
+        """
+        @jax.jit
+        def step(rnn_state, mel, rng_key, x):
+            x = self.embed(x)
+            x = x + mel
+            rnn_state, x = self.rnn(rnn_state, x)
+            x = self.output(x)
+            rng_key, next_rng_key = jax.random.split(rng_key, 2)
+            x = jax.random.categorical(rng_key, x, axis=-1)
+            return rnn_state, next_rng_key, x
+        y = self.upsample(mel, no_repeat=no_gru)
+        if no_gru:
+            return y
+        x = jnp.array([127], dtype=jnp.int32)
+        rnn_state = self.rnn.initial_state(1)
+        output = []
+        rng_key = jax.random.PRNGKey(seed)
+        for i in tqdm(range(y.shape[1])):
+            rnn_state, rng_key, x = step(rnn_state, y[:, i], rng_key, x)
+            output.append(x)
+        x = jnp.concatenate(output, axis=0)
+        return x
+    def __call__(self, mel, x):
+        x = self.embed(x)
+        y = self.upsample(mel)
+        pad_left = (x.shape[1] - y.shape[1]) // 2
+        pad_right = x.shape[1] - y.shape[1] - pad_left
+        x = x[:, pad_left:-pad_right]
+        x = x + y
+        _, x = pax.scan(
+            self.rnn,
+            self.rnn.initial_state(x.shape[0]),
+            x,
+            time_major=False,
+        )
+        x = self.output(x)
+        return x

wavegru.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+## dsp
+sample_rate : 24000
+window_length: 50.0 # ms
+hop_length: 12.5 # ms
+mel_min: 1.0e-5 ## need .0 to make it a float
+mel_dim: 80
+n_fft: 2048
+## wavegru
+embed_dim: 32
+rnn_dim: 1024
+frames_per_sequence: 67
+num_pad_frames: 62
+upsample_factors: [5, 3, 20]

wavegru_cpp.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import numpy as np
+from wavegru_mod import WaveGRU
+def extract_weight_mask(net):
+    data = {}
+    data["embed_weight"] = net.embed.weight
+    data["gru_h_zrh_weight"] = net.rnn.h_zrh_fc.weight
+    data["gru_h_zrh_mask"] = net.gru_pruner.h_zrh_fc_mask
+    data["gru_h_zrh_bias"] = net.rnn.h_zrh_fc.bias
+    data["o1_weight"] = net.o1.weight
+    data["o1_mask"] = net.o1_pruner.mask
+    data["o1_bias"] = net.o1.bias
+    data["o2_weight"] = net.o2.weight
+    data["o2_mask"] = net.o2_pruner.mask
+    data["o2_bias"] = net.o2.bias
+    return data
+def load_wavegru_cpp(data, repeat_factor):
+    """load wavegru weight to cpp object"""
+    embed = data["embed_weight"]
+    rnn_dim = data["gru_h_zrh_bias"].shape[0] // 3
+    net = WaveGRU(rnn_dim, repeat_factor)
+    net.load_embed(embed)
+    m = np.ascontiguousarray(data["gru_h_zrh_weight"].T)
+    mask = np.ascontiguousarray(data["gru_h_zrh_mask"].T)
+    b = data["gru_h_zrh_bias"]
+    o1 = np.ascontiguousarray(data["o1_weight"].T)
+    masko1 = np.ascontiguousarray(data["o1_mask"].T)
+    o1b = data["o1_bias"]
+    o2 = np.ascontiguousarray(data["o2_weight"].T)
+    masko2 = np.ascontiguousarray(data["o2_mask"].T)
+    o2b = data["o2_bias"]
+    net.load_weights(m, mask, b, o1, masko1, o1b, o2, masko2, o2b)
+    return net

wavegru_mod.cc ADDED Viewed

	@@ -0,0 +1,150 @@

+/*
+WaveGRU:
+> Embed > GRU > O1 > O2 > Sampling > ...
+*/
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "sparse_matmul/sparse_matmul.h"
+// Short alias for the pybind11 namespace.
+namespace py = pybind11;
+using namespace std;
+// Plain std::vector aliases.
+using fvec = std::vector<float>;
+using ivec = std::vector<int>;
+// NumPy array types accepted from Python (float and int element types).
+using fndarray = py::array_t<float>;
+using indarray = py::array_t<int>;
+// Sparse-matmul library types: block-sparse matrix, cache-aligned vector and
+// the masked matrix used to build a block-sparse matrix.
+using mat = csrblocksparse::CsrBlockSparseMatrix<float, float, int16_t>;
+using vec = csrblocksparse::CacheAlignedVector<float>;
+using masked_mat = csrblocksparse::MaskedSparseMatrix<float>;
+// Builds a block-sparse matrix from a masked matrix constructed with
+// (w, h, 0.90, 4, 4, 0.0, true). WaveGRU uses the result as a placeholder
+// until load_weights() replaces it.
+mat create_mat(int h, int w) {
+  masked_mat placeholder(w, h, 0.90, 4, 4, 0.0, true);
+  mat sparse(placeholder);
+  return sparse;
+}
+// Autoregressive WaveGRU sampler backed by block-sparse matrices:
+// Embed > GRU > O1 > O2 > Sampling, repeated for every output sample.
+struct WaveGRU {
+  int hidden_dim;
+  int repeat_factor;  // number of output samples generated per feature row
+  mat m;              // GRU hidden-to-gates matrix
+  vec b;              // GRU bias
+  vec z, r, hh, zrh;  // gates, candidate state and the raw m*h + b product
+  vec fco1, fco2;     // outputs of the o1 and o2 layers
+  vec o1b, o2b;       // biases of the o1 and o2 layers
+  vec t;              // embedding of the previous sample + conditioning row
+  vec h;              // GRU hidden state
+  vec logits;
+  mat o1, o2;
+  std::vector<vec> embed;  // one embedding vector per sample value (256)
+  // Allocates all buffers; matrices and embeddings hold placeholder values
+  // until load_embed() / load_weights() are called.
+  WaveGRU(int hidden_dim, int repeat_factor)
+      : hidden_dim(hidden_dim),
+        repeat_factor(repeat_factor),
+        b(3 * hidden_dim),
+        z(hidden_dim),
+        r(hidden_dim),
+        hh(hidden_dim),
+        zrh(3 * hidden_dim),
+        fco1(hidden_dim),
+        fco2(256),
+        o1b(hidden_dim),
+        o2b(256),
+        t(3 * hidden_dim),
+        h(hidden_dim),
+        logits(256) {
+    m = create_mat(hidden_dim, 3 * hidden_dim);
+    o1 = create_mat(hidden_dim, hidden_dim);
+    o2 = create_mat(hidden_dim, 256);
+    embed.reserve(256);
+    for (int i = 0; i < 256; i++) {
+      embed.emplace_back(hidden_dim * 3);
+      embed[i].FillRandom();
+    }
+  }
+  // Copies the embedding table. The array must be 2-D with at least 256 rows
+  // and 3 * hidden_dim columns; smaller arrays are rejected because the copy
+  // below is not bounds-checked.
+  void load_embed(fndarray embed_weights) {
+    auto a_embed = embed_weights.unchecked<2>();
+    if (a_embed.shape(0) < 256 || a_embed.shape(1) < 3 * hidden_dim) {
+      throw std::invalid_argument(
+          "load_embed: expected an embedding of shape (256, " +
+          std::to_string(3 * hidden_dim) + ")");
+    }
+    for (int i = 0; i < 256; i++) {
+      for (int j = 0; j < hidden_dim * 3; j++) embed[i][j] = a_embed(i, j);
+    }
+  }
+  // Validates one (weight, mask, bias) triple before its raw buffers are
+  // read: weight must be a C-contiguous (rows, cols) array, mask a
+  // C-contiguous array with the same number of elements, bias a 1-D array of
+  // length rows.
+  static void check_linear(const char* name, const fndarray& w,
+                           const indarray& mask, const fndarray& b, int rows,
+                           int cols) {
+    const std::string prefix = std::string("load_weights: ") + name;
+    if (w.ndim() != 2 || w.shape(0) != rows || w.shape(1) != cols) {
+      throw std::invalid_argument(prefix + " weight must have shape (" +
+                                  std::to_string(rows) + ", " +
+                                  std::to_string(cols) + ")");
+    }
+    if (mask.size() != w.size()) {
+      throw std::invalid_argument(
+          prefix + " mask must have as many elements as the weight");
+    }
+    if (b.ndim() != 1 || b.shape(0) != rows) {
+      throw std::invalid_argument(prefix + " bias must have length " +
+                                  std::to_string(rows));
+    }
+    if (!(w.flags() & py::array::c_style) ||
+        !(mask.flags() & py::array::c_style)) {
+      throw std::invalid_argument(prefix +
+                                  " weight and mask must be C-contiguous");
+    }
+  }
+  // Writes the bias (scaled by 1/4) into `bias` and returns the block-sparse
+  // matrix built from `w` and `mask`. The raw buffers of `w` and `mask` are
+  // read directly, so both must be dense and C-contiguous.
+  mat load_linear(vec& bias, fndarray w, indarray mask, fndarray b) {
+    auto w_ptr = static_cast<float*>(w.request().ptr);
+    auto mask_ptr = static_cast<int*>(mask.request().ptr);
+    auto rb = b.unchecked<1>();
+    // load bias, scale by 1/4
+    // NOTE(review): presumably compensates for how the sparse kernel applies
+    // the bias - confirm against the sparse_matmul library.
+    for (int i = 0; i < rb.shape(0); i++) bias[i] = rb(i) / 4;
+    // load weights
+    masked_mat mm(w.shape(0), w.shape(1), mask_ptr, w_ptr);
+    mat mmm(mm);
+    return mmm;
+  }
+  // Loads the GRU matrix and the two output layers. Expected weight shapes
+  // (rows, cols): m (3 * hidden_dim, hidden_dim), o1 (hidden_dim, hidden_dim),
+  // o2 (256, hidden_dim). Throws std::invalid_argument on any mismatch.
+  void load_weights(fndarray m, indarray m_mask, fndarray b,
+                    fndarray o1, indarray o1_mask,
+                    fndarray o1b, fndarray o2,
+                    indarray o2_mask, fndarray o2b) {
+    check_linear("m", m, m_mask, b, 3 * hidden_dim, hidden_dim);
+    check_linear("o1", o1, o1_mask, o1b, hidden_dim, hidden_dim);
+    check_linear("o2", o2, o2_mask, o2b, 256, hidden_dim);
+    this->m = load_linear(this->b, m, m_mask, b);
+    this->o1 = load_linear(this->o1b, o1, o1_mask, o1b);
+    this->o2 = load_linear(this->o2b, o2, o2_mask, o2b);
+  }
+  // Generates rows * repeat_factor sample values from the conditioning
+  // features `ft` (2-D, at least 3 * hidden_dim columns). Every feature row
+  // is reused for repeat_factor consecutive samples; generation starts from
+  // the value 127 with a zero hidden state.
+  std::vector<int> inference(fndarray ft, float temperature) {
+    auto rft = ft.unchecked<2>();
+    if (rft.shape(0) > 0 && rft.shape(1) < 3 * hidden_dim) {
+      throw std::invalid_argument(
+          "inference: features must have at least " +
+          std::to_string(3 * hidden_dim) + " columns");
+    }
+    int value = 127;
+    std::vector<int> signal(rft.shape(0) * repeat_factor);
+    const int num_samples = static_cast<int>(signal.size());
+    h.FillZero();
+    for (int index = 0; index < num_samples; index++) {
+      // Hidden-side gate pre-activations: zrh = m * h + b.
+      m.SpMM_bias(h, b, &zrh, false);
+      // Input-side pre-activations: previous-sample embedding + conditioning.
+      for (int i = 0; i < 3 * hidden_dim; i++) t[i] = embed[value][i] + rft(index / repeat_factor, i);
+      for (int i = 0; i < hidden_dim; i++) {
+        z[i] = zrh[i] + t[i];
+        r[i] = zrh[hidden_dim + i] + t[hidden_dim + i];
+      }
+      z.Sigmoid();
+      r.Sigmoid();
+      // Candidate state: r scales only the hidden-side part.
+      for (int i = 0; i < hidden_dim; i++) {
+        hh[i] = zrh[hidden_dim * 2 + i]  * r[i] + t[hidden_dim * 2 + i];
+      }
+      hh.Tanh();
+      for (int i = 0; i < hidden_dim; i++) {
+        h[i] = (1. - z[i]) * h[i] + z[i] * hh[i];
+      }
+      o1.SpMM_bias(h, o1b, &fco1, true);
+      o2.SpMM_bias(fco1, o2b, &fco2, false);
+      // auto max_logit = fco2[0];
+      // for (int i = 1; i <= 255; ++i) {
+      //   max_logit = max(max_logit, fco2[i]);
+      // }
+      // float total = 0.0;
+      // for (int i = 0; i <= 255; ++i) {
+      //   logits[i] = csrblocksparse::fast_exp(fco2[i] - max_logit);
+      //   total += logits[i];
+      // }
+      // for (int i = 0; i <= 255; ++i) {
+      //   if (logits[i] < total / 1024.0) fco2[i] = -1e9;
+      // }
+      value = fco2.Sample(temperature);
+      signal[index] = value;
+    }
+    return signal;
+  }
+};
+// Python bindings: exposes WaveGRU with its (int, int) constructor and the
+// load_embed, load_weights and inference methods.
+PYBIND11_MODULE(wavegru_mod, m) {
+  py::class_<WaveGRU> wavegru_class(m, "WaveGRU");
+  wavegru_class.def(py::init<int, int>());
+  wavegru_class.def("load_embed", &WaveGRU::load_embed);
+  wavegru_class.def("load_weights", &WaveGRU::load_weights);
+  wavegru_class.def("inference", &WaveGRU::inference);
+}