Spaces:

ntt123
/

WaveGRU-Text-To-Speech

Running

App Files Files Community

NTT123 committed on Mar 15, 2022

Commit

df1ad02

•

1 Parent(s): 73eaac3

a slow but working model

Browse files

Files changed (15) hide show

.gitattributes +2 -0
alphabet.txt +41 -0
app.py +30 -4
inference.py +82 -0
packages.txt +1 -0
pooch.py +10 -0
pretrained_model_ljs_500k.ckpt +3 -0
requirements.txt +10 -0
tacotron.py +446 -0
tacotron.toml +31 -0
text.py +87 -0
utils.py +74 -0
wavegru.py +234 -0
wavegru.yaml +14 -0
wavegru_vocoder_tpu_gta_preemphasis_pruning_v7_0040000.ckpt +3 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pretrained_model_ljs_500k.ckpt filter=lfs diff=lfs merge=lfs -text
+wavegru_vocoder_tpu_gta_preemphasis_pruning_v7_0040000.ckpt filter=lfs diff=lfs merge=lfs -text

alphabet.txt ADDED Viewed

	@@ -0,0 +1,41 @@

+_
+!
+"
+'
+(
+)
+,
+-
+.
+:
+;
+?
+[
+]
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z

app.py CHANGED Viewed

@@ -1,7 +1,33 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+from inference import load_tacotron_model, load_wavegru_net, text_to_mel, mel_to_wav
+alphabet, tacotron_net, tacotron_config = load_tacotron_model(
+    "./alphabet.txt", "./tacotron.toml", "./pretrained_model_ljs_500k.ckpt"
+)
+wavegru_config, wavegru_net = load_wavegru_net(
+    "./wavegru.yaml", "./wavegru_vocoder_tpu_gta_preemphasis_pruning_v7_0040000.ckpt"
+)
+def speak(text):
+    mel = text_to_mel(tacotron_net, text, alphabet, tacotron_config)
+    y = mel_to_wav(wavegru_net, mel, wavegru_config)
+    return 24_000, y
+title = "WaveGRU-TTS"
+description = "WaveGRU text-to-speech demo."
+gr.Interface(
+    fn=speak,
+    inputs="text",
+    outputs="audio",
+    title=title,
+    description=description,
+    theme="default",
+    allow_screenshot=False,
+    allow_flagging="never",
+).launch(debug=False)

inference.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import jax
+import jax.numpy as jnp
+import librosa
+import numpy as np
+import pax
+from text import english_cleaners
+from utils import (
+    create_tacotron_model,
+    load_tacotron_ckpt,
+    load_tacotron_config,
+    load_wavegru_ckpt,
+    load_wavegru_config,
+)
+from wavegru import WaveGRU
+def load_tacotron_model(alphabet_file, config_file, model_file):
+    """load tacotron model to memory"""
+    with open(alphabet_file, "r", encoding="utf-8") as f:
+        alphabet = f.read().split("\n")
+    config = load_tacotron_config(config_file)
+    net = create_tacotron_model(config)
+    _, net, _ = load_tacotron_ckpt(net, None, model_file)
+    net = net.eval()
+    net = jax.device_put(net)
+    return alphabet, net, config
+# Tacotron.inference wrapped with pax.pure; decoding is capped at 10000 steps.
+tacotron_inference_fn = pax.pure(lambda net, text: net.inference(text, max_len=10000))
+def text_to_mel(net, text, alphabet, config):
+    """convert text to mel spectrogram"""
+    text = english_cleaners(text)
+    text = text + config["PAD"] * (100 - (len(text) % 100))
+    tokens = [alphabet.index(c) for c in text]
+    tokens = jnp.array(tokens, dtype=jnp.int32)
+    mel = tacotron_inference_fn(net, tokens[None])
+    return mel
+def load_wavegru_net(config_file, model_file):
+    """load wavegru to memory"""
+    config = load_wavegru_config(config_file)
+    net = WaveGRU(
+        mel_dim=config["mel_dim"],
+        embed_dim=config["embed_dim"],
+        rnn_dim=config["rnn_dim"],
+        upsample_factors=config["upsample_factors"],
+    )
+    _, net, _ = load_wavegru_ckpt(net, None, model_file)
+    net = net.eval()
+    net = jax.device_put(net)
+    return config, net
+# WaveGRU.inference wrapped with pax.pure; no_gru=False runs the full sampling loop.
+wavegru_inference = pax.pure(lambda net, mel: net.inference(mel, no_gru=False))
+def mel_to_wav(net, mel, config):
+    """convert mel to wav"""
+    if len(mel.shape) == 2:
+        mel = mel[None]
+    pad = config["num_pad_frames"] // 2 + 4
+    mel = np.pad(
+        mel,
+        [(0, 0), (pad, pad), (0, 0)],
+        constant_values=np.log(config["mel_min"]),
+    )
+    x = wavegru_inference(net, mel)
+    x = jax.device_get(x)
+    wav = librosa.mu_expand(x - 127, mu=255)
+    wav = librosa.effects.deemphasis(wav, coef=0.86)
+    wav = wav * 2.0
+    wav = wav / max(1.0, np.max(np.abs(wav)))
+    wav = wav * 2**15
+    wav = np.clip(wav, a_min=-(2**15), a_max=(2**15) - 1)
+    wav = wav.astype(np.int16)
+    return wav

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libsndfile1-dev

pooch.py ADDED Viewed

	@@ -0,0 +1,10 @@

+def os_cache(x):
+    # Identity stub: returns its argument unchanged.
+    # NOTE(review): presumably this file shadows the third-party `pooch` package
+    # that librosa imports -- confirm.
+    return x
+def create(*args, **kwargs):
+    class T:
+        def load_registry(self, *args, **kwargs):
+            return None
+    return T()

pretrained_model_ljs_500k.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4eabdcac35cd016469d17678f9549bd25d1c9bf66c9089ea9f0632619ba91194
+size 53221435

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+jax==0.3.1
+jaxlib==0.3.0
+numpy==1.22.3
+librosa==0.9.1
+pax3==0.5.6
+gradio
+jinja2
+toml==0.10.2
+unidecode==1.3.4
+pyyaml==6.0

tacotron.py ADDED Viewed

	@@ -0,0 +1,446 @@

+"""
+Tacotron + stepwise monotonic attention
+"""
+import jax
+import jax.numpy as jnp
+import pax
+def conv_block(in_ft, out_ft, kernel_size, activation_fn, use_dropout):
+    """
+    Conv >> LayerNorm >> activation >> Dropout
+    Args:
+        in_ft: number of input features of the convolution.
+        out_ft: number of output features of the convolution.
+        kernel_size: convolution kernel size.
+        activation_fn: callable appended after the LayerNorm, or None to skip it.
+        use_dropout: if true, append a Dropout layer with rate 0.5.
+    Returns:
+        The assembled pax.Sequential module.
+    """
+    f = pax.Sequential(
+        pax.Conv1D(in_ft, out_ft, kernel_size, with_bias=False),
+        pax.LayerNorm(out_ft, -1, True, True),
+    )
+    if activation_fn is not None:
+        f >>= activation_fn
+    if use_dropout:
+        f >>= pax.Dropout(0.5)
+    return f
+class HighwayBlock(pax.Module):
+    """
+    Highway block
+    """
+    def __init__(self, dim: int) -> None:
+        super().__init__()
+        self.dim = dim
+        self.fc = pax.Linear(dim, 2 * dim)
+    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+        t, h = jnp.split(self.fc(x), 2, axis=-1)
+        t = jax.nn.sigmoid(t - 1.0)  # bias toward keeping x
+        h = jax.nn.relu(h)
+        x = x * (1.0 - t) + h * t
+        return x
+class BiGRU(pax.Module):
+    """
+    Bidirectional GRU
+    The forward and backward hidden sequences are concatenated on the last axis.
+    """
+    def __init__(self, dim):
+        super().__init__()
+        self.rnn_fwd = pax.GRU(dim, dim)
+        self.rnn_bwd = pax.GRU(dim, dim)
+    def __call__(self, x, reset_masks):
+        # x is indexed as (batch, time, ...): axis 0 is used as the batch size and
+        # axis 1 is flipped for the backward pass.
+        N = x.shape[0]
+        x_fwd = x
+        x_bwd = jnp.flip(x, axis=1)
+        x_fwd_states = self.rnn_fwd.initial_state(N)
+        x_bwd_states = self.rnn_bwd.initial_state(N)
+        x_fwd_states, x_fwd = pax.scan(
+            self.rnn_fwd, x_fwd_states, x_fwd, time_major=False
+        )
+        # The masks are flipped the same way as the backward input.
+        reset_masks = jnp.flip(reset_masks, axis=1)
+        x_bwd_states0 = x_bwd_states
+        def rnn_reset_core(prev, inputs):
+            # One backward step: wherever reset_mask is set, the new state is
+            # replaced by the initial state.
+            x, reset_mask = inputs
+            def reset_state(x0, xt):
+                return jnp.where(reset_mask, x0, xt)
+            state, _ = self.rnn_bwd(prev, x)
+            state = jax.tree_map(reset_state, x_bwd_states0, state)
+            return state, state.hidden
+        x_bwd_states, x_bwd = pax.scan(
+            rnn_reset_core, x_bwd_states, (x_bwd, reset_masks), time_major=False
+        )
+        # Restore the original time order before concatenating.
+        x_bwd = jnp.flip(x_bwd, axis=1)
+        x = jnp.concatenate((x_fwd, x_bwd), axis=-1)
+        return x
+class CBHG(pax.Module):
+    """
+    Conv Bank >> Highway net >> GRU
+    """
+    def __init__(self, dim):
+        super().__init__()
+        # Bank of 16 conv blocks with kernel sizes 1 to 16.
+        self.convs = [conv_block(dim, dim, i, jax.nn.relu, False) for i in range(1, 17)]
+        # Projects the 16 concatenated bank outputs back to `dim` features.
+        self.conv_projection_1 = conv_block(16 * dim, dim, 3, jax.nn.relu, False)
+        self.conv_projection_2 = conv_block(dim, dim, 3, None, False)
+        self.highway = pax.Sequential(
+            HighwayBlock(dim), HighwayBlock(dim), HighwayBlock(dim), HighwayBlock(dim)
+        )
+        self.rnn = BiGRU(dim)
+    def __call__(self, x, x_mask):
+        # x_mask multiplies the features before each conv stage and before the RNN,
+        # zeroing the positions where it is 0.
+        conv_input = x * x_mask
+        fts = [f(conv_input) for f in self.convs]
+        residual = jnp.concatenate(fts, axis=-1)
+        residual = pax.max_pool(residual, 2, 1, "SAME", -1)
+        residual = self.conv_projection_1(residual * x_mask)
+        residual = self.conv_projection_2(residual * x_mask)
+        # Residual connection around the conv bank and projections.
+        x = x + residual
+        x = self.highway(x)
+        # The backward GRU state is reset at the positions where x_mask is 0.
+        x = self.rnn(x * x_mask, reset_masks=1 - x_mask)
+        return x * x_mask
+class PreNet(pax.Module):
+    """
+    Linear >> relu >> dropout >> Linear >> relu >> dropout
+    """
+    def __init__(self, input_dim, hidden_dim, output_dim, always_dropout=True):
+        super().__init__()
+        self.fc1 = pax.Linear(input_dim, hidden_dim)
+        self.fc2 = pax.Linear(hidden_dim, output_dim)
+        self.rng_seq = pax.RngSeq()
+        self.always_dropout = always_dropout
+    def __call__(self, x, k1=None, k2=None):
+        x = self.fc1(x)
+        x = jax.nn.relu(x)
+        if self.always_dropout or self.training:
+            if k1 is None:
+                k1 = self.rng_seq.next_rng_key()
+            x = pax.dropout(k1, 0.5, x)
+        x = self.fc2(x)
+        x = jax.nn.relu(x)
+        if self.always_dropout or self.training:
+            if k2 is None:
+                k2 = self.rng_seq.next_rng_key()
+            x = pax.dropout(k2, 0.5, x)
+        return x
+class Tacotron(pax.Module):
+    """
+    Tacotron TTS model.
+    It uses stepwise monotonic attention for robust attention.
+    Entry points: `__call__` (teacher-forced pass over a given mel sequence) and
+    `inference` (autoregressive text-to-mel generation).
+    """
+    def __init__(
+        self,
+        mel_dim: int,
+        attn_bias,
+        rr,
+        max_rr,
+        mel_min,
+        sigmoid_noise,
+        pad_token,
+        prenet_dim,
+        attn_hidden_dim,
+        attn_rnn_dim,
+        rnn_dim,
+        postnet_dim,
+        text_dim,
+    ):
+        """
+        New Tacotron model
+        Args:
+            mel_dim (int): dimension of log mel-spectrogram features.
+            attn_bias (float): control how "slow" the attention will
+                move forward at initialization.
+            rr (int): the reduction factor.
+                Number of predicted frame at each time step.
+                (The signature defines no default; tacotron.toml uses 2.)
+            max_rr (int): max value of rr.
+            mel_min (float): the minimum value of mel features.
+                The <go> frame is filled by `log(mel_min)` values.
+            sigmoid_noise (float): the variance of gaussian noise added
+                to attention scores in training.
+            pad_token (int): the pad value at the end of text sequences.
+            prenet_dim (int): dimension of prenet output.
+            attn_hidden_dim (int): dimension of attention hidden vectors.
+            attn_rnn_dim (int): number of cells in the attention RNN.
+            rnn_dim (int): number of cells in the decoder RNNs.
+            postnet_dim (int): number of features in the postnet convolutions.
+            text_dim (int): dimension of text embedding vectors.
+        """
+        super().__init__()
+        self.text_dim = text_dim
+        assert rr <= max_rr
+        self.rr = rr
+        self.max_rr = max_rr
+        self.mel_dim = mel_dim
+        self.mel_min = mel_min
+        self.sigmoid_noise = sigmoid_noise
+        self.pad_token = pad_token
+        self.prenet_dim = prenet_dim
+        # encoder submodules
+        self.encoder_embed = pax.Embed(256, text_dim)
+        self.encoder_pre_net = PreNet(text_dim, 256, prenet_dim, always_dropout=True)
+        self.encoder_cbhg = CBHG(prenet_dim)
+        # random key generator
+        self.rng_seq = pax.RngSeq()
+        # pre-net
+        self.decoder_pre_net = PreNet(mel_dim, 256, prenet_dim, always_dropout=True)
+        # decoder submodules
+        self.attn_rnn = pax.LSTM(prenet_dim + prenet_dim * 2, attn_rnn_dim)
+        self.text_key_fc = pax.Linear(prenet_dim * 2, attn_hidden_dim, with_bias=True)
+        self.attn_query_fc = pax.Linear(attn_rnn_dim, attn_hidden_dim, with_bias=False)
+        self.attn_V = pax.Linear(attn_hidden_dim, 1, with_bias=False)
+        self.attn_V_weight_norm = jnp.array(1.0 / jnp.sqrt(attn_hidden_dim))
+        self.attn_V_bias = jnp.array(attn_bias)
+        self.attn_log = jnp.zeros((1,))
+        self.decoder_input = pax.Linear(attn_rnn_dim + 2 * prenet_dim, rnn_dim)
+        self.decoder_rnn1 = pax.LSTM(rnn_dim, rnn_dim)
+        self.decoder_rnn2 = pax.LSTM(rnn_dim, rnn_dim)
+        # mel + end-of-sequence token
+        self.output_fc = pax.Linear(rnn_dim, (mel_dim + 1) * max_rr, with_bias=True)
+        # post-net
+        self.post_net = pax.Sequential(
+            conv_block(mel_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, postnet_dim, 5, jax.nn.tanh, True),
+            conv_block(postnet_dim, mel_dim, 5, None, True),
+        )
+    # NOTE(review): presumably marks these two arrays as trainable parameters --
+    # confirm against the pax documentation.
+    parameters = pax.parameters_method("attn_V_weight_norm", "attn_V_bias")
+    def encode_text(self, text: jnp.ndarray) -> jnp.ndarray:
+        """
+        Encode text to a sequence of real vectors
+        `text` holds token ids with two axes; positions equal to `pad_token`
+        are masked out in the CBHG encoder.
+        """
+        N, L = text.shape
+        text_mask = (text != self.pad_token)[..., None]
+        x = self.encoder_embed(text)
+        x = self.encoder_pre_net(x)
+        x = self.encoder_cbhg(x, text_mask)
+        return x
+    def go_frame(self, batch_size: int) -> jnp.ndarray:
+        """
+        return the go frame
+        It has shape (batch_size, mel_dim) and is filled with log(mel_min).
+        """
+        return jnp.ones((batch_size, self.mel_dim)) * jnp.log(self.mel_min)
+    def decoder_initial_state(self, N: int, L: int):
+        """
+        setup decoder initial state
+        Returns (attn_state, decoder_rnn_states) for batch size N and text length L.
+        """
+        attn_context = jnp.zeros((N, self.prenet_dim * 2))
+        # All attention mass starts on the first text position.
+        attn_pr = jax.nn.one_hot(
+            jnp.zeros((N,), dtype=jnp.int32), num_classes=L, axis=-1
+        )
+        attn_state = (self.attn_rnn.initial_state(N), attn_context, attn_pr)
+        decoder_rnn_states = (
+            self.decoder_rnn1.initial_state(N),
+            self.decoder_rnn2.initial_state(N),
+        )
+        return attn_state, decoder_rnn_states
+    def monotonic_attention(self, prev_state, inputs, envs):
+        """
+        Stepwise monotonic attention
+        At each step the attention mass at a text position either stays there or
+        moves to the next position.
+        """
+        attn_rnn_state, attn_context, prev_attn_pr = prev_state
+        x, attn_rng_key = inputs
+        text, text_key = envs
+        attn_rnn_input = jnp.concatenate((x, attn_context), axis=-1)
+        attn_rnn_state, attn_rnn_output = self.attn_rnn(attn_rnn_state, attn_rnn_input)
+        attn_query_input = attn_rnn_output
+        attn_query = self.attn_query_fc(attn_query_input)
+        attn_hidden = jnp.tanh(attn_query[:, None, :] + text_key)
+        score = self.attn_V(attn_hidden)
+        score = jnp.squeeze(score, axis=-1)
+        # Rescale the score so that the effective norm of attn_V's weight equals
+        # attn_V_weight_norm.
+        weight_norm = jnp.linalg.norm(self.attn_V.weight)
+        score = score * (self.attn_V_weight_norm / weight_norm)
+        score = score + self.attn_V_bias
+        # The noise is added unconditionally, i.e. also when called from inference.
+        noise = jax.random.normal(attn_rng_key, score.shape) * self.sigmoid_noise
+        pr_stay = jax.nn.sigmoid(score + noise)
+        pr_move = 1.0 - pr_stay
+        pr_new_location = pr_move * prev_attn_pr
+        # Shift the moving mass one position to the right; mass leaving the last
+        # position is dropped.
+        pr_new_location = jnp.pad(
+            pr_new_location[:, :-1], ((0, 0), (1, 0)), constant_values=0
+        )
+        attn_pr = pr_stay * prev_attn_pr + pr_new_location
+        attn_context = jnp.einsum("NL,NLD->ND", attn_pr, text)
+        new_state = (attn_rnn_state, attn_context, attn_pr)
+        return new_state, attn_rnn_output
+    def zoneout_lstm(self, lstm_core, rng_key, zoneout_pr=0.1):
+        """
+        Return a zoneout lstm core.
+        It will zoneout the new hidden states and keep the new cell states unchanged.
+        """
+        def core(state, x):
+            new_state, _ = lstm_core(state, x)
+            h_old = state.hidden
+            h_new = new_state.hidden
+            # With probability zoneout_pr a hidden unit keeps its previous value.
+            mask = jax.random.bernoulli(rng_key, zoneout_pr, h_old.shape)
+            h_new = h_old * mask + h_new * (1.0 - mask)
+            return pax.LSTMState(h_new, new_state.cell), h_new
+        return core
+    def decoder_step(
+        self,
+        attn_state,
+        decoder_rnn_states,
+        rng_key,
+        mel,
+        text,
+        text_key,
+        call_pre_net=False,
+    ):
+        """
+        One decoder step
+        Returns (attn_state, decoder_rnn_states, next_rng_key, x, attn_pr[0]) where
+        x is the decoder feature vector fed to `output_fc` by the callers.
+        When `call_pre_net` is true, `mel` is first passed through the decoder pre-net.
+        """
+        if call_pre_net:
+            k1, k2, zk1, zk2, rng_key, rng_key_next = jax.random.split(rng_key, 6)
+            mel = self.decoder_pre_net(mel, k1, k2)
+        else:
+            zk1, zk2, rng_key, rng_key_next = jax.random.split(rng_key, 4)
+        attn_inputs = (mel, rng_key)
+        attn_envs = (text, text_key)
+        attn_state, attn_rnn_output = self.monotonic_attention(
+            attn_state, attn_inputs, attn_envs
+        )
+        (_, attn_context, attn_pr) = attn_state
+        (decoder_rnn_state1, decoder_rnn_state2) = decoder_rnn_states
+        decoder_rnn1_input = jnp.concatenate((attn_rnn_output, attn_context), axis=-1)
+        decoder_rnn1_input = self.decoder_input(decoder_rnn1_input)
+        decoder_rnn1 = self.zoneout_lstm(self.decoder_rnn1, zk1)
+        decoder_rnn_state1, decoder_rnn_output1 = decoder_rnn1(
+            decoder_rnn_state1, decoder_rnn1_input
+        )
+        # Residual connections around both decoder LSTMs.
+        decoder_rnn2_input = decoder_rnn1_input + decoder_rnn_output1
+        decoder_rnn2 = self.zoneout_lstm(self.decoder_rnn2, zk2)
+        decoder_rnn_state2, decoder_rnn_output2 = decoder_rnn2(
+            decoder_rnn_state2, decoder_rnn2_input
+        )
+        x = decoder_rnn1_input + decoder_rnn_output1 + decoder_rnn_output2
+        decoder_rnn_states = (decoder_rnn_state1, decoder_rnn_state2)
+        return attn_state, decoder_rnn_states, rng_key_next, x, attn_pr[0]
+    @jax.jit
+    def inference_step(
+        self, attn_state, decoder_rnn_states, rng_key, mel, text, text_key
+    ):
+        """one inference step"""
+        attn_state, decoder_rnn_states, rng_key, x, _ = self.decoder_step(
+            attn_state,
+            decoder_rnn_states,
+            rng_key,
+            mel,
+            text,
+            text_key,
+            call_pre_net=True,
+        )
+        x = self.output_fc(x)
+        N, D2 = x.shape
+        # Split the projection into max_rr frames and keep the first rr of them;
+        # the last feature of every frame is the end-of-sequence logit.
+        x = jnp.reshape(x, (N, self.max_rr, D2 // self.max_rr))
+        x = x[:, : self.rr, :]
+        x = jnp.reshape(x, (N, self.rr, -1))
+        mel = x[..., :-1]
+        eos = x[..., -1]
+        return attn_state, decoder_rnn_states, rng_key, (mel, eos)
+    def inference(self, text, seed=42, max_len=1000):
+        """
+        text to mel
+        Decodes step by step until the end-of-sequence logit of the first batch
+        item is positive or more than `max_len` steps have run, and returns the
+        generated frames concatenated along axis 1.
+        """
+        text = self.encode_text(text)
+        text_key = self.text_key_fc(text)
+        N, L, D = text.shape
+        mel = self.go_frame(N)
+        attn_state, decoder_rnn_states = self.decoder_initial_state(N, L)
+        rng_key = jax.random.PRNGKey(seed)
+        mels = []
+        count = 0
+        while True:
+            count = count + 1
+            attn_state, decoder_rnn_states, rng_key, (mel, eos) = self.inference_step(
+                attn_state, decoder_rnn_states, rng_key, mel, text, text_key
+            )
+            mels.append(mel)
+            # Only the first batch item decides when to stop.
+            if eos[0, -1].item() > 0 or count > max_len:
+                break
+            # The last generated frame is the input of the next step.
+            mel = mel[:, -1, :]
+        mels = jnp.concatenate(mels, axis=1)
+        # NOTE(review): the post-net is applied to the last step's `mel` and the
+        # result is never used; the returned `mels` are the raw decoder frames.
+        # Possibly `mels = mels + self.post_net(mels)` was intended -- confirm
+        # against what the vocoder was trained on before changing this.
+        mel = mel + self.post_net(mel)
+        return mels
+    def decode(self, mel, text):
+        """
+        Attention mechanism + Decoder
+        Scans `decoder_step` over the time axis of `mel` and returns (mel, eos).
+        Side effect: stores the attention log of the scan in `self.attn_log`.
+        """
+        text_key = self.text_key_fc(text)
+        def scan_fn(prev_states, inputs):
+            attn_state, decoder_rnn_states = prev_states
+            x, rng_key = inputs
+            attn_state, decoder_rnn_states, _, output, attn_pr = self.decoder_step(
+                attn_state, decoder_rnn_states, rng_key, x, text, text_key
+            )
+            states = (attn_state, decoder_rnn_states)
+            return states, (output, attn_pr)
+        N, L, D = text.shape
+        decoder_states = self.decoder_initial_state(N, L)
+        # One rng key per time step.
+        rng_keys = self.rng_seq.next_rng_key(mel.shape[1])
+        rng_keys = jnp.stack(rng_keys, axis=1)
+        decoder_states, (x, attn_log) = pax.scan(
+            scan_fn,
+            decoder_states,
+            (mel, rng_keys),
+            time_major=False,
+        )
+        self.attn_log = attn_log
+        del decoder_states
+        x = self.output_fc(x)
+        N, T2, D2 = x.shape
+        # Same frame unpacking as in inference_step, for all time steps at once.
+        x = jnp.reshape(x, (N, T2, self.max_rr, D2 // self.max_rr))
+        x = x[:, :, : self.rr, :]
+        x = jnp.reshape(x, (N, T2 * self.rr, -1))
+        mel = x[..., :-1]
+        eos = x[..., -1]
+        return mel, eos
+    def __call__(self, mel: jnp.ndarray, text: jnp.ndarray):
+        # Returns (decoder mel, post-net refined mel, eos logits).
+        text = self.encode_text(text)
+        mel = self.decoder_pre_net(mel)
+        mel, eos = self.decode(mel, text)
+        return mel, mel + self.post_net(mel), eos

tacotron.toml ADDED Viewed

	@@ -0,0 +1,31 @@

+[tacotron]
+# training
+BATCH_SIZE = 64
+LR=1024e-6 # learning rate
+MODEL_PREFIX = "mono_tts_cbhg_small"
+LOG_DIR = "./logs"
+CKPT_DIR = "./ckpts"
+USE_MP = false  # use mixed-precision training
+# data
+TF_DATA_DIR = "./tf_data" # tensorflow data directory
+TF_GTA_DATA_DIR = "./tf_gta_data" # tf gta data directory
+SAMPLE_RATE = 24000 # convert to this sample rate if needed
+MEL_DIM = 80 # the dimension of melspectrogram features
+MEL_MIN = 1e-5
+PAD = "_" # padding character
+PAD_TOKEN = 0
+TEST_DATA_SIZE = 1024
+# model
+RR = 2 # reduction factor
+MAX_RR=2
+ATTN_BIAS = 0.0 # control how slow the attention moves forward
+SIGMOID_NOISE = 2.0
+PRENET_DIM = 128
+TEXT_DIM = 256
+RNN_DIM = 512
+ATTN_RNN_DIM = 256
+ATTN_HIDDEN_DIM = 128
+POSTNET_DIM = 512

text.py ADDED Viewed

	@@ -0,0 +1,87 @@

+""" from https://github.com/keithito/tacotron """
+"""
+Cleaners are transformations that run over the input text at both training and eval time.
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+"""
+import re
+from unidecode import unidecode
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r"\s+")
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [
+    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+    for x in [
+        ("mrs", "misess"),
+        ("mr", "mister"),
+        ("dr", "doctor"),
+        ("st", "saint"),
+        ("co", "company"),
+        ("jr", "junior"),
+        ("maj", "major"),
+        ("gen", "general"),
+        ("drs", "doctors"),
+        ("rev", "reverend"),
+        ("lt", "lieutenant"),
+        ("hon", "honorable"),
+        ("sgt", "sergeant"),
+        ("capt", "captain"),
+        ("esq", "esquire"),
+        ("ltd", "limited"),
+        ("col", "colonel"),
+        ("ft", "fort"),
+    ]
+]
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+def lowercase(text: str) -> str:
+    """Return `text` converted to lowercase."""
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text)
+def convert_to_ascii(text):
+    """Transliterate `text` to ASCII with the unidecode library."""
+    return unidecode(text)
+def basic_cleaners(text):
+    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+def transliteration_cleaners(text):
+    """Pipeline for non-English text that transliterates to ASCII."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+def english_cleaners(text):
+    """
+    Pipeline for English text: ASCII transliteration, lowercasing,
+    abbreviation expansion and whitespace collapsing.
+    Numbers are NOT expanded by this pipeline; digits pass through unchanged.
+    """
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_abbreviations(text)
+    text = collapse_whitespace(text)
+    return text

utils.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""
+Utility functions
+"""
+import pickle
+from pathlib import Path
+import pax
+import toml
+import yaml
+from tacotron import Tacotron
+def load_tacotron_config(config_file=Path("tacotron.toml")):
+    """
+    Load the project configurations
+    """
+    return toml.load(config_file)["tacotron"]
+def load_tacotron_ckpt(net: pax.Module, optim: pax.Module, path):
+    """
+    load checkpoint from disk
+    """
+    with open(path, "rb") as f:
+        dic = pickle.load(f)
+    if net is not None:
+        net = net.load_state_dict(dic["model_state_dict"])
+    if optim is not None:
+        optim = optim.load_state_dict(dic["optim_state_dict"])
+    return dic["step"], net, optim
+def create_tacotron_model(config):
+    """
+    return a random initialized Tacotron model
+    """
+    return Tacotron(
+        mel_dim=config["MEL_DIM"],
+        attn_bias=config["ATTN_BIAS"],
+        rr=config["RR"],
+        max_rr=config["MAX_RR"],
+        mel_min=config["MEL_MIN"],
+        sigmoid_noise=config["SIGMOID_NOISE"],
+        pad_token=config["PAD_TOKEN"],
+        prenet_dim=config["PRENET_DIM"],
+        attn_hidden_dim=config["ATTN_HIDDEN_DIM"],
+        attn_rnn_dim=config["ATTN_RNN_DIM"],
+        rnn_dim=config["RNN_DIM"],
+        postnet_dim=config["POSTNET_DIM"],
+        text_dim=config["TEXT_DIM"],
+    )
+def load_wavegru_config(config_file):
+    """
+    Load project configurations
+    """
+    with open(config_file, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+def load_wavegru_ckpt(net, optim, ckpt_file):
+    """
+    load training checkpoint from file
+    """
+    with open(ckpt_file, "rb") as f:
+        dic = pickle.load(f)
+    if net is not None:
+        net = net.load_state_dict(dic["net_state_dict"])
+    if optim is not None:
+        optim = optim.load_state_dict(dic["optim_state_dict"])
+    return dic["step"], net, optim

wavegru.py ADDED Viewed

	@@ -0,0 +1,234 @@

+"""
+WaveGRU model: melspectrogram => mu-law encoded waveform
+"""
+import jax
+import jax.numpy as jnp
+import pax
+class ReLU(pax.Module):
+    """Module wrapper around jax.nn.relu."""
+    def __call__(self, x):
+        return jax.nn.relu(x)
+def dilated_residual_conv_block(dim, kernel, stride, dilation):
+    """
+    Use dilated convs to enlarge the receptive field
+    """
+    return pax.Sequential(
+        pax.Conv1D(dim, dim, kernel, stride, dilation, "VALID", with_bias=False),
+        pax.LayerNorm(dim, -1, True, True),
+        ReLU(),
+        pax.Conv1D(dim, dim, 1, 1, 1, "VALID", with_bias=False),
+        pax.LayerNorm(dim, -1, True, True),
+        ReLU(),
+    )
+def tile_1d(x, factor):
+    """
+    Tile tensor of shape N, L, D into N, L*factor, D
+    """
+    N, L, D = x.shape
+    x = x[:, :, None, :]
+    x = jnp.tile(x, (1, 1, factor, 1))
+    x = jnp.reshape(x, (N, L * factor, D))
+    return x
+def up_block(dim, factor):
+    """
+    Tile >> Conv >> LayerNorm >> ReLU
+    The input is first repeated `factor` times along its length axis with
+    `tile_1d`, then passed through a "VALID" convolution of kernel size 2 * factor.
+    """
+    return pax.Sequential(
+        lambda x: tile_1d(x, factor),
+        pax.Conv1D(dim, dim, 2 * factor, stride=1, padding="VALID", with_bias=False),
+        pax.LayerNorm(dim, -1, True, True),
+        ReLU(),
+    )
+class Upsample(pax.Module):
+    """
+    Upsample melspectrogram to match raw audio sample rate.
+    """
+    def __init__(self, input_dim, upsample_factors):
+        super().__init__()
+        self.input_conv = pax.Sequential(
+            pax.Conv1D(input_dim, 512, 1, with_bias=False),
+            pax.LayerNorm(512, -1, True, True),
+        )
+        self.upsample_factors = upsample_factors
+        # Five residual blocks with dilations 1, 2, 4, 8 and 16.
+        self.dilated_convs = [
+            dilated_residual_conv_block(512, 3, 1, 2**i) for i in range(5)
+        ]
+        # All factors but the last get a learned up_block; the last factor is
+        # applied as a plain tile in __call__.
+        self.up_factors = upsample_factors[:-1]
+        self.up_blocks = [up_block(512, x) for x in self.up_factors]
+        self.final_tile = upsample_factors[-1]
+    def __call__(self, x):
+        x = self.input_conv(x)
+        for residual in self.dilated_convs:
+            y = residual(x)
+            # The "VALID" convolutions shorten y; crop x symmetrically so the
+            # residual sum lines up.
+            pad = (x.shape[1] - y.shape[1]) // 2
+            x = x[:, pad:-pad, :] + y
+        for f in self.up_blocks:
+            x = f(x)
+        x = tile_1d(x, self.final_tile)
+        return x
+class Pruner(pax.Module):
+    """
+    Base class for pruners
+    """
+    def __init__(self, update_freq=500):
+        super().__init__()
+        self.update_freq = update_freq
+    def compute_sparsity(self, step):
+        """
+        Two-stages pruning
+        """
+        t = jnp.power(1 - (step * 1.0 - 1_000) / 300_000, 3)
+        z = 0.5 * jnp.clip(1.0 - t, a_min=0, a_max=1)
+        for i in range(4):
+            t = jnp.power(1 - (step * 1.0 - 1_000 - 400_000 - i * 200_000) / 100_000, 3)
+            z = z + 0.1 * jnp.clip(1 - t, a_min=0, a_max=1)
+        return z
+    def prune(self, step, weights):
+        """
+        Return a mask
+        """
+        z = self.compute_sparsity(step)
+        x = weights
+        H, W = x.shape
+        x = x.reshape(H // 4, 4, W // 4, 4)
+        x = jnp.abs(x)
+        x = jnp.sum(x, axis=(1, 3), keepdims=True)
+        q = jnp.quantile(jnp.reshape(x, (-1,)), z)
+        x = x >= q
+        x = jnp.tile(x, (1, 4, 1, 4))
+        x = jnp.reshape(x, (H, W))
+        return x
+class GRUPruner(Pruner):
+    """Pruner holding boolean masks for the two weight matrices of a pax.GRU."""
+    def __init__(self, gru, update_freq=500):
+        super().__init__(update_freq=update_freq)
+        # Start with all-True masks (nothing pruned).
+        self.xh_zr_fc_mask = jnp.ones_like(gru.xh_zr_fc.weight) == 1
+        self.xh_h_fc_mask = jnp.ones_like(gru.xh_h_fc.weight) == 1
+    def __call__(self, gru: pax.GRU):
+        """
+        Apply mask after an optimization step
+        Returns a GRU whose masked-out weights are set to 0.
+        """
+        zr_masked_weights = jnp.where(self.xh_zr_fc_mask, gru.xh_zr_fc.weight, 0)
+        gru = gru.replace_node(gru.xh_zr_fc.weight, zr_masked_weights)
+        h_masked_weights = jnp.where(self.xh_h_fc_mask, gru.xh_h_fc.weight, 0)
+        gru = gru.replace_node(gru.xh_h_fc.weight, h_masked_weights)
+        return gru
+    def update_mask(self, step, gru: pax.GRU):
+        """
+        Update internal masks
+        The two halves of the zr weight (split along axis 1) are pruned separately.
+        Masks are multiplied into the existing ones, so a pruned entry stays pruned.
+        """
+        xh_z_weight, xh_r_weight = jnp.split(gru.xh_zr_fc.weight, 2, axis=1)
+        xh_z_weight = self.prune(step, xh_z_weight)
+        xh_r_weight = self.prune(step, xh_r_weight)
+        self.xh_zr_fc_mask *= jnp.concatenate((xh_z_weight, xh_r_weight), axis=1)
+        self.xh_h_fc_mask *= self.prune(step, gru.xh_h_fc.weight)
+class LinearPruner(Pruner):
+    """Pruner holding a boolean mask for the weight of a pax.Linear layer."""
+    def __init__(self, linear, update_freq=500):
+        super().__init__(update_freq=update_freq)
+        # Start with an all-True mask (nothing pruned).
+        self.mask = jnp.ones_like(linear.weight) == 1
+    def __call__(self, linear: pax.Linear):
+        """
+        Apply mask after an optimization step
+        Returns a Linear layer whose masked-out weights are set to 0.
+        """
+        return linear.replace(weight=jnp.where(self.mask, linear.weight, 0))
+    def update_mask(self, step, linear: pax.Linear):
+        """
+        Update internal masks
+        The new mask is multiplied into the existing one, so a pruned entry stays pruned.
+        """
+        self.mask *= self.prune(step, linear.weight)
+class WaveGRU(pax.Module):
+    """
+    WaveGRU vocoder model
+    """
+    def __init__(
+        self, mel_dim=80, embed_dim=32, rnn_dim=512, upsample_factors=(5, 4, 3, 5)
+    ):
+        super().__init__()
+        self.embed = pax.Embed(256, embed_dim)
+        self.upsample = Upsample(input_dim=mel_dim, upsample_factors=upsample_factors)
+        self.rnn = pax.GRU(embed_dim + rnn_dim, rnn_dim)
+        self.o1 = pax.Linear(rnn_dim, rnn_dim)
+        self.o2 = pax.Linear(rnn_dim, 256)
+        self.gru_pruner = GRUPruner(self.rnn)
+        self.o1_pruner = LinearPruner(self.o1)
+        self.o2_pruner = LinearPruner(self.o2)
+    def output(self, x):
+        x = self.o1(x)
+        x = jax.nn.relu(x)
+        x = self.o2(x)
+        return x
+    @jax.jit
+    def inference_step(self, rnn_state, mel, rng_key, x):
+        """one inference step"""
+        x = self.embed(x)
+        x = jnp.concatenate((x, mel), axis=-1)
+        rnn_state, x = self.rnn(rnn_state, x)
+        x = self.output(x)
+        rng_key, next_rng_key = jax.random.split(rng_key, 2)
+        x = jax.random.categorical(rng_key, x, axis=-1)
+        return rnn_state, next_rng_key, x
+    def inference(self, mel, no_gru=False, seed=42):
+        """
+        generate waveform form melspectrogram
+        """
+        y = self.upsample(mel)
+        if no_gru:
+            return y
+        x = jnp.array([127], dtype=jnp.int32)
+        rnn_state = self.rnn.initial_state(1)
+        output = []
+        rng_key = jax.random.PRNGKey(seed)
+        for i in range(y.shape[1]):
+            rnn_state, rng_key, x = self.inference_step(rnn_state, y[:, i], rng_key, x)
+            output.append(x)
+        x = jnp.concatenate(output, axis=0)
+        return x
+    def __call__(self, mel, x):
+        x = self.embed(x)
+        y = self.upsample(mel)
+        pad_left = (x.shape[1] - y.shape[1]) // 2
+        pad_right = x.shape[1] - y.shape[1] - pad_left
+        x = x[:, pad_left:-pad_right]
+        x = jnp.concatenate((x, y), axis=-1)
+        _, x = pax.scan(
+            self.rnn,
+            self.rnn.initial_state(x.shape[0]),
+            x,
+            time_major=False,
+        )
+        x = self.output(x)
+        return x

wavegru.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+## dsp
+sample_rate : 24000
+window_length: 50.0 # ms
+hop_length: 12.5 # ms
+mel_min: 1.0e-5 ## need .0 to make it a float
+mel_dim: 80
+n_fft: 2048
+## wavegru
+embed_dim: 32
+rnn_dim: 512
+frames_per_sequence: 67
+num_pad_frames: 62
+upsample_factors: [5, 4, 3, 5]

wavegru_vocoder_tpu_gta_preemphasis_pruning_v7_0040000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c09ed822c5daac0afbd19e8ba4e0ded26dd5732e0efd13ce193c3f54c4e63f54
+size 56479599