RayeRen committed on
Commit 53fa903
1 Parent(s): 91c5bdb
checkpoints/diffsinger/config.yaml ADDED
@@ -0,0 +1,393 @@
+ K_step: 71
+ accumulate_grad_batches: 1
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - egs/egs_bases/tts/ds.yaml
+ - ./fs2_orig.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: true
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech_cwt
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ cwt_std_scale: 1.0
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decay_steps: 50000
+ decoder_rnn_dim: 0
+ decoder_type: fft
+ diff_decoder_type: wavenet
+ diff_loss_type: l1
+ dilation_cycle_length: 1
+ dropout: 0.0
+ ds_workers: 2
+ dur_predictor_kernel: 3
+ dur_predictor_layers: 2
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 9
+ enc_kernel_size: 5
+ enc_layers: 4
+ enc_post_net_kernel: 3
+ enc_pre_ln: true
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 600
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 1024
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 1
+ fs2_ckpt: checkpoints/fs2_exp/model_ckpt_steps_160000.ckpt
+ gen_dir_name: ''
+ griffin_lim_iters: 30
+ hidden_size: 256
+ hop_size: 256
+ infer: false
+ keep_bins: 80
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 1.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.001
+ max_beta: 0.06
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 128
+ max_tokens: 30000
+ max_updates: 160000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: cwt
+ predictor_dropout: 0.5
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ residual_channels: 256
+ residual_layers: 20
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ schedule_type: linear
+ scheduler: warmup
+ seed: 1234
+ sort_by_len: true
+ spec_max:
+ - -0.5982
+ - -0.0778
+ - 0.1205
+ - 0.2747
+ - 0.4657
+ - 0.5123
+ - 0.583
+ - 0.7093
+ - 0.6461
+ - 0.6101
+ - 0.7316
+ - 0.7715
+ - 0.7681
+ - 0.8349
+ - 0.7815
+ - 0.7591
+ - 0.791
+ - 0.7433
+ - 0.7352
+ - 0.6869
+ - 0.6854
+ - 0.6623
+ - 0.5353
+ - 0.6492
+ - 0.6909
+ - 0.6106
+ - 0.5761
+ - 0.5236
+ - 0.5638
+ - 0.4054
+ - 0.4545
+ - 0.3407
+ - 0.3037
+ - 0.338
+ - 0.1599
+ - 0.1603
+ - 0.2741
+ - 0.213
+ - 0.1569
+ - 0.1911
+ - 0.2324
+ - 0.1586
+ - 0.1221
+ - 0.0341
+ - -0.0558
+ - 0.0553
+ - -0.1153
+ - -0.0933
+ - -0.1171
+ - -0.005
+ - -0.1519
+ - -0.1629
+ - -0.0522
+ - -0.0739
+ - -0.2069
+ - -0.2405
+ - -0.1244
+ - -0.2582
+ - -0.1361
+ - -0.1575
+ - -0.1442
+ - 0.0513
+ - -0.1567
+ - -0.2
+ - 0.0086
+ - -0.0698
+ - 0.1385
+ - 0.0941
+ - 0.1864
+ - 0.1225
+ - 0.1389
+ - 0.1382
+ - 0.167
+ - 0.1007
+ - 0.1444
+ - 0.0888
+ - 0.1998
+ - 0.228
+ - 0.2932
+ - 0.3047
+ spec_min:
+ - -4.7574
+ - -4.6783
+ - -4.6431
+ - -4.5832
+ - -4.539
+ - -4.6771
+ - -4.8089
+ - -4.7672
+ - -4.5784
+ - -4.7755
+ - -4.715
+ - -4.8919
+ - -4.8271
+ - -4.7389
+ - -4.6047
+ - -4.7759
+ - -4.6799
+ - -4.8201
+ - -4.7823
+ - -4.8262
+ - -4.7857
+ - -4.7545
+ - -4.9358
+ - -4.9733
+ - -5.1134
+ - -5.1395
+ - -4.9016
+ - -4.8434
+ - -5.0189
+ - -4.846
+ - -5.0529
+ - -4.951
+ - -5.0217
+ - -5.0049
+ - -5.1831
+ - -5.1445
+ - -5.1015
+ - -5.0281
+ - -4.9887
+ - -4.9916
+ - -4.9785
+ - -4.9071
+ - -4.9488
+ - -5.0342
+ - -4.9332
+ - -5.065
+ - -4.8924
+ - -5.0875
+ - -5.0483
+ - -5.0848
+ - -5.0655
+ - -5.0279
+ - -5.0015
+ - -5.0792
+ - -5.0636
+ - -5.2413
+ - -5.1421
+ - -5.171
+ - -5.3256
+ - -5.0511
+ - -5.1186
+ - -5.0057
+ - -5.0446
+ - -5.1173
+ - -5.0325
+ - -5.1085
+ - -5.0053
+ - -5.0755
+ - -5.1176
+ - -5.1004
+ - -5.2153
+ - -5.2757
+ - -5.3025
+ - -5.2867
+ - -5.2918
+ - -5.3328
+ - -5.2731
+ - -5.2985
+ - -5.24
+ - -5.2211
+ task_cls: tasks.tts.diffspeech.DiffSpeechTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ timesteps: 100
+ train_set_name: train
+ train_sets: ''
+ use_energy_embed: true
+ use_gt_dur: false
+ use_gt_energy: false
+ use_gt_f0: false
+ use_pitch_embed: true
+ use_pos_embed: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_uv: true
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 4000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ work_dir: checkpoints/0209_ds_1
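
Note: this `config.yaml` is the flattened dump of the `base_config` chain listed at its top plus the experiment's overrides. A minimal sketch of how such a chain can be merged (a hypothetical `load_config` helper, not the repo's loader; the real one in `utils/commons/hparams.py` additionally resolves relative paths like `./fs2_orig.yaml` and applies CLI overrides):

```python
import yaml

def load_config(path):
    # Hypothetical sketch of a base_config merge: depth-first merge of the
    # listed bases, with the child config winning on key conflicts.
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    bases = cfg.pop('base_config', [])
    if isinstance(bases, str):
        bases = [bases]
    merged = {}
    for base in bases:
        merged.update(load_config(base))  # assumes paths relative to the repo root
    merged.update(cfg)
    return merged

hparams = load_config('egs/datasets/audio/lj/ds.yaml')
```
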
checkpoints/diffsinger/model_ckpt_steps_160000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:503f81009a75c02d868253b6fb4f1411aeaa32308b101d7804447bc583636b83
+ size 168816223
docs/diffspeech.md ADDED
@@ -0,0 +1,62 @@
+ # Run DiffSpeech
+
+ ## Quick Start
+
+ ### Install Dependencies
+
+ Install dependencies following [readme.md](../readme.md).
+
+ ### Set Config Path and Experiment Name
+
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/ds.yaml
+ export MY_EXP_NAME=ds_exp
+ ```
+
+ ### Preprocess and Binarize the Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ ### Prepare Vocoder
+
+ Prepare the vocoder following [prepare_vocoder.md](./prepare_vocoder.md).
+
+ ## Training
+
+ First, you need a pre-trained FastSpeech 2 checkpoint `checkpoints/fs2_exp/model_ckpt_steps_160000.ckpt`. To train a FastSpeech 2 model, run:
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/datasets/audio/lj/fs2_orig.yaml --exp_name fs2_exp --reset
+ ```
+
+ Then, run:
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ You can check the training and validation curves by opening TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/$MY_EXP_NAME
+ ```
+
+ ## Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ ## Citation
+
+ If you find this useful for your research, please cite the following:
+
+ ```bib
+ @article{liu2021diffsinger,
+   title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
+   author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
+   journal={arXiv preprint arXiv:2105.02446},
+   volume={2},
+   year={2021}
+ }
+ ```
docs/prepare_vocoder.md CHANGED
@@ -26,7 +26,7 @@ export MY_EXP_NAME=my_hifigan_exp
  Prepare dataset following [prepare_data.md](./prepare_data.md).

  If you have run the `prepare_data` step of the acoustic
- model (e.g., FastSpeech 2 and PortaSpeech), you only need to binarize the dataset for the vocoder training:
+ model (e.g., PortaSpeech and DiffSpeech), you only need to binarize the dataset for the vocoder training:

  ```bash
  python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
egs/datasets/audio/lj/ds.yaml ADDED
@@ -0,0 +1,29 @@
+ base_config:
+ - egs/egs_bases/tts/ds.yaml
+ - ./fs2_orig.yaml
+
+ fs2_ckpt: checkpoints/fs2_exp/model_ckpt_steps_160000.ckpt
+
+ # spec_min and spec_max are calculated on the training set.
+ spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672,
+             -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759,
+             -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733,
+             -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510,
+             -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916,
+             -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875,
+             -5.0483, -5.0848, -5.0655, -5.0279, -5.0015, -5.0792, -5.0636, -5.2413,
+             -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173,
+             -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757,
+             -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ]
+ spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5830, 0.7093,
+             0.6461, 0.6101, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591,
+             0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492,
+             0.6909, 0.6106, 0.5761, 0.5236, 0.5638, 0.4054, 0.4545, 0.3407,
+             0.3037, 0.3380, 0.1599, 0.1603, 0.2741, 0.2130, 0.1569, 0.1911,
+             0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933,
+             -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405,
+             -0.1244, -0.2582, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000,
+             0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.1389, 0.1382,
+             0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2280, 0.2932, 0.3047 ]
+
+ max_tokens: 30000
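
The comment above says `spec_min`/`spec_max` are computed on the training set; a hedged sketch of collecting such per-bin statistics (it mirrors the commented-out snippet inside `tasks/tts/diffspeech.py` later in this commit; `dataset` is assumed to yield dicts with a `[T, 80]` `'mel'` tensor):

```python
import torch
from tqdm import tqdm

def mel_min_max(dataset, num_bins=80):
    # Per-mel-bin min/max over the training set; GaussianDiffusion uses these
    # bounds to normalize mels into [-1, 1] before diffusion.
    v_min = torch.full([num_bins], 100.0)
    v_max = torch.full([num_bins], -100.0)
    for item in tqdm(dataset):
        mel = item['mel'].reshape(-1, num_bins)
        v_min = torch.minimum(mel.min(0).values, v_min)
        v_max = torch.maximum(mel.max(0).values, v_max)
    return v_min, v_max
```
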
egs/egs_bases/tts/ds.yaml ADDED
@@ -0,0 +1,32 @@
+ base_config: ./fs2_orig.yaml
+
+ # special configs for diffspeech
+ task_cls: tasks.tts.diffspeech.DiffSpeechTask
+ lr: 0.001
+ timesteps: 100
+ K_step: 71
+ diff_loss_type: l1
+ diff_decoder_type: 'wavenet'
+ schedule_type: 'linear'
+ max_beta: 0.06
+
+ ## model configs for diffspeech
+ dilation_cycle_length: 1
+ residual_layers: 20
+ residual_channels: 256
+ decay_steps: 50000
+ keep_bins: 80
+ #content_cond_steps: [ ] # [ 0, 10000 ]
+ #spk_cond_steps: [ ] # [ 0, 10000 ]
+ #gen_tgt_spk_id: -1
+
+
+
+ # training configs for diffspeech
+ #max_sentences: 48
+ #num_sanity_val_steps: 1
+ num_valid_plots: 10
+ use_gt_dur: false
+ use_gt_f0: false
+ #pitch_type: cwt
+ max_updates: 160000
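
With `timesteps: 100`, `K_step: 71`, and `max_beta: 0.06`, training samples a step `t` uniformly from the first 71 steps of a 100-step linear schedule, and inference denoises for only those 71 steps starting from the FastSpeech 2 mel (the shallow diffusion mechanism). A quick sketch of the schedule these three values define, matching `linear_beta_schedule` in `modules/tts/diffspeech/shallow_diffusion_tts.py` below:

```python
import numpy as np

timesteps, K_step, max_beta = 100, 71, 0.06
betas = np.linspace(1e-4, max_beta, timesteps)  # same as linear_beta_schedule
alphas_cumprod = np.cumprod(1.0 - betas)
# How much signal survives the forward process at the shallow start vs. the end:
print(f'alpha_bar[K_step - 1]    = {alphas_cumprod[K_step - 1]:.4f}')
print(f'alpha_bar[timesteps - 1] = {alphas_cumprod[-1]:.4f}')
```
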
inference/tts/ds.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ # from inference.tts.fs import FastSpeechInfer
+ # from modules.tts.fs2_orig import FastSpeech2Orig
+ from inference.tts.base_tts_infer import BaseTTSInfer
+ from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import hparams
+
+
+ class DiffSpeechInfer(BaseTTSInfer):
+     def build_model(self):
+         dict_size = len(self.ph_encoder)
+         model = GaussianDiffusion(dict_size, self.hparams)
+         model.eval()
+         load_ckpt(model, hparams['work_dir'], 'model')
+         return model
+
+     def forward_model(self, inp):
+         sample = self.input_to_batch(inp)
+         txt_tokens = sample['txt_tokens']  # [B, T_t]
+         spk_id = sample.get('spk_ids')
+         with torch.no_grad():
+             output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True)
+             mel_out = output['mel_out']
+             wav_out = self.run_vocoder(mel_out)
+         wav_out = wav_out.cpu().numpy()
+         return wav_out[0]
+
+ if __name__ == '__main__':
+     DiffSpeechInfer.example_run()
modules/tts/commons/align_ops.py CHANGED
@@ -13,9 +13,8 @@ def mel2ph_to_mel2word(mel2ph, ph2word):
  
  
  def clip_mel2token_to_multiple(mel2token, frames_multiple):
-     if mel2token.shape[1] % frames_multiple > 0:
-         max_frames = mel2token.shape[1] // frames_multiple * frames_multiple
-         mel2token = mel2token[:, :max_frames]
+     max_frames = mel2token.shape[1] // frames_multiple * frames_multiple
+     mel2token = mel2token[:, :max_frames]
      return mel2token
  
  
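
This simplification is behavior-preserving: when `mel2token.shape[1]` is already a multiple of `frames_multiple`, the floor division yields `max_frames == mel2token.shape[1]` and the slice is a no-op, so the old modulo guard was redundant. A minimal check (standalone copy of the new function):

```python
import torch

def clip_mel2token_to_multiple(mel2token, frames_multiple):
    # Trim the time axis down to the nearest multiple of frames_multiple.
    max_frames = mel2token.shape[1] // frames_multiple * frames_multiple
    return mel2token[:, :max_frames]

x = torch.arange(10)[None, :]                            # [1, 10]
assert clip_mel2token_to_multiple(x, 4).shape[1] == 8    # 10 -> 8, clipped
assert clip_mel2token_to_multiple(x, 5).shape[1] == 10   # exact multiple, unchanged
```
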
modules/tts/diffspeech/net.py ADDED
@@ -0,0 +1,110 @@
+ import math
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from math import sqrt
+
+ Linear = nn.Linear
+ ConvTranspose2d = nn.ConvTranspose2d
+
+
+ class Mish(nn.Module):
+     def forward(self, x):
+         return x * torch.tanh(F.softplus(x))
+
+
+ class SinusoidalPosEmb(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+
+     def forward(self, x):
+         device = x.device
+         half_dim = self.dim // 2
+         emb = math.log(10000) / (half_dim - 1)
+         emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+         emb = x[:, None] * emb[None, :]
+         emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+         return emb
+
+
+ def Conv1d(*args, **kwargs):
+     layer = nn.Conv1d(*args, **kwargs)
+     nn.init.kaiming_normal_(layer.weight)
+     return layer
+
+
+ class ResidualBlock(nn.Module):
+     def __init__(self, encoder_hidden, residual_channels, dilation):
+         super().__init__()
+         self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
+         self.diffusion_projection = Linear(residual_channels, residual_channels)
+         self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1)
+         self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)
+
+     def forward(self, x, conditioner, diffusion_step):
+         diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
+         conditioner = self.conditioner_projection(conditioner)
+         y = x + diffusion_step
+
+         y = self.dilated_conv(y) + conditioner
+
+         gate, filter = torch.chunk(y, 2, dim=1)
+         y = torch.sigmoid(gate) * torch.tanh(filter)
+
+         y = self.output_projection(y)
+         residual, skip = torch.chunk(y, 2, dim=1)
+         return (x + residual) / sqrt(2.0), skip
+
+
+ class DiffNet(nn.Module):
+     def __init__(self, hparams):
+         super().__init__()
+         in_dims = hparams['audio_num_mel_bins']
+         self.encoder_hidden = hparams['hidden_size']
+         self.residual_layers = hparams['residual_layers']
+         self.residual_channels = hparams['residual_channels']
+         self.dilation_cycle_length = hparams['dilation_cycle_length']
+
+         self.input_projection = Conv1d(in_dims, self.residual_channels, 1)
+         self.diffusion_embedding = SinusoidalPosEmb(self.residual_channels)
+         dim = self.residual_channels
+         self.mlp = nn.Sequential(
+             nn.Linear(dim, dim * 4),
+             Mish(),
+             nn.Linear(dim * 4, dim)
+         )
+         self.residual_layers = nn.ModuleList([
+             ResidualBlock(self.encoder_hidden, self.residual_channels, 2 ** (i % self.dilation_cycle_length))
+             for i in range(self.residual_layers)
+         ])
+         self.skip_projection = Conv1d(self.residual_channels, self.residual_channels, 1)
+         self.output_projection = Conv1d(self.residual_channels, in_dims, 1)
+         nn.init.zeros_(self.output_projection.weight)
+
+     def forward(self, spec, diffusion_step, cond):
+         """
+
+         :param spec: [B, 1, M, T]
+         :param diffusion_step: [B, 1]
+         :param cond: [B, M, T]
+         :return:
+         """
+         x = spec[:, 0]
+         x = self.input_projection(x)  # x [B, residual_channel, T]
+
+         x = F.relu(x)
+         diffusion_step = self.diffusion_embedding(diffusion_step)
+         diffusion_step = self.mlp(diffusion_step)
+         skip = []
+         for layer_id, layer in enumerate(self.residual_layers):
+             x, skip_connection = layer(x, cond, diffusion_step)
+             skip.append(skip_connection)
+
+         x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
+         x = self.skip_projection(x)
+         x = F.relu(x)
+         x = self.output_projection(x)  # [B, 80, T]
+         return x[:, None, :, :]
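
A hedged shape smoke test for `DiffNet.forward` above, with hyperparameter values taken from the `config.yaml` in this commit. Note that `p_sample` in `shallow_diffusion_tts.py` actually passes `diffusion_step` as a 1-D `[B]` tensor of step indices, and `cond` is the `[B, hidden, T]` transposed decoder input:

```python
import torch
from modules.tts.diffspeech.net import DiffNet

hp = {'audio_num_mel_bins': 80, 'hidden_size': 256,
      'residual_layers': 20, 'residual_channels': 256,
      'dilation_cycle_length': 1}
net = DiffNet(hp)
B, T = 2, 100
spec = torch.randn(B, 1, 80, T)        # noised mel x_t
step = torch.randint(0, 100, (B,))     # diffusion step t per batch item
cond = torch.randn(B, 256, T)          # decoder_inp.transpose(1, 2)
assert net(spec, step, cond).shape == (B, 1, 80, T)  # predicted noise
```
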
modules/tts/diffspeech/shallow_diffusion_tts.py ADDED
@@ -0,0 +1,281 @@
+ import math
+ import random
+ from functools import partial
+ from inspect import isfunction
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from tqdm import tqdm
+
+ from modules.tts.fs2_orig import FastSpeech2Orig
+ from modules.tts.diffspeech.net import DiffNet
+ from modules.tts.commons.align_ops import expand_states
+
+
+ def exists(x):
+     return x is not None
+
+
+ def default(val, d):
+     if exists(val):
+         return val
+     return d() if isfunction(d) else d
+
+
+ # gaussian diffusion trainer class
+
+ def extract(a, t, x_shape):
+     b, *_ = t.shape
+     out = a.gather(-1, t)
+     return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+ def noise_like(shape, device, repeat=False):
+     repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+     noise = lambda: torch.randn(shape, device=device)
+     return repeat_noise() if repeat else noise()
+
+
+ def linear_beta_schedule(timesteps, max_beta=0.01):
+     """
+     linear schedule
+     """
+     betas = np.linspace(1e-4, max_beta, timesteps)
+     return betas
+
+
+ def cosine_beta_schedule(timesteps, s=0.008):
+     """
+     cosine schedule
+     as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
+     """
+     steps = timesteps + 1
+     x = np.linspace(0, steps, steps)
+     alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
+     alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+     betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+     return np.clip(betas, a_min=0, a_max=0.999)
+
+
+ beta_schedule = {
+     "cosine": cosine_beta_schedule,
+     "linear": linear_beta_schedule,
+ }
+
+
+ DIFF_DECODERS = {
+     'wavenet': lambda hp: DiffNet(hp),
+ }
+
+
+ class AuxModel(FastSpeech2Orig):
+     def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None,
+                 f0=None, uv=None, energy=None, infer=False, **kwargs):
+         ret = {}
+         encoder_out = self.encoder(txt_tokens)  # [B, T, C]
+         src_nonpadding = (txt_tokens > 0).float()[:, :, None]
+         style_embed = self.forward_style_embed(spk_embed, spk_id)
+
+         # add dur
+         dur_inp = (encoder_out + style_embed) * src_nonpadding
+         mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret)
+         tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
+         decoder_inp = decoder_inp_ = expand_states(encoder_out, mel2ph)
+
+         # add pitch and energy embed
+         if self.hparams['use_pitch_embed']:
+             pitch_inp = (decoder_inp_ + style_embed) * tgt_nonpadding
+             decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out)
+
+         # add pitch and energy embed
+         if self.hparams['use_energy_embed']:
+             energy_inp = (decoder_inp_ + style_embed) * tgt_nonpadding
+             decoder_inp = decoder_inp + self.forward_energy(energy_inp, energy, ret)
+
+         # decoder input
+         ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding
+         if self.hparams['dec_inp_add_noise']:
+             B, T, _ = decoder_inp.shape
+             z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device)
+             ret['adv_z'] = z
+             decoder_inp = torch.cat([decoder_inp, z], -1)
+             decoder_inp = self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding
+         if kwargs['skip_decoder']:
+             return ret
+         ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)
+         return ret
+
+
+ class GaussianDiffusion(nn.Module):
+     def __init__(self, dict_size, hparams, out_dims=None):
+         super().__init__()
+         self.hparams = hparams
+         out_dims = hparams['audio_num_mel_bins']
+         denoise_fn = DIFF_DECODERS[hparams['diff_decoder_type']](hparams)
+         timesteps = hparams['timesteps']
+         K_step = hparams['K_step']
+         loss_type = hparams['diff_loss_type']
+         spec_min = hparams['spec_min']
+         spec_max = hparams['spec_max']
+
+         self.denoise_fn = denoise_fn
+         self.fs2 = AuxModel(dict_size, hparams)
+         self.mel_bins = out_dims
+
+         if hparams['schedule_type'] == 'linear':
+             betas = linear_beta_schedule(timesteps, hparams['max_beta'])
+         else:
+             betas = cosine_beta_schedule(timesteps)
+
+         alphas = 1. - betas
+         alphas_cumprod = np.cumprod(alphas, axis=0)
+         alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
+
+         timesteps, = betas.shape
+         self.num_timesteps = int(timesteps)
+         self.K_step = K_step
+         self.loss_type = loss_type
+
+         to_torch = partial(torch.tensor, dtype=torch.float32)
+
+         self.register_buffer('betas', to_torch(betas))
+         self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+         self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
+
+         # calculations for diffusion q(x_t | x_{t-1}) and others
+         self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
+         self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
+         self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
+         self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
+         self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
+
+         # calculations for posterior q(x_{t-1} | x_t, x_0)
+         posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
+         # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
+         self.register_buffer('posterior_variance', to_torch(posterior_variance))
+         # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+         self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
+         self.register_buffer('posterior_mean_coef1', to_torch(
+             betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
+         self.register_buffer('posterior_mean_coef2', to_torch(
+             (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
+
+         self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']])
+         self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']])
+
+     def q_mean_variance(self, x_start, t):
+         mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+         variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
+         log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+         return mean, variance, log_variance
+
+     def predict_start_from_noise(self, x_t, t, noise):
+         return (
+                 extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
+                 extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
+         )
+
+     def q_posterior(self, x_start, x_t, t):
+         posterior_mean = (
+                 extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
+                 extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
+         )
+         posterior_variance = extract(self.posterior_variance, t, x_t.shape)
+         posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
+         return posterior_mean, posterior_variance, posterior_log_variance_clipped
+
+     def p_mean_variance(self, x, t, cond, clip_denoised: bool):
+         noise_pred = self.denoise_fn(x, t, cond=cond)
+         x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
+
+         if clip_denoised:
+             x_recon.clamp_(-1., 1.)
+
+         model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+         return model_mean, posterior_variance, posterior_log_variance
+
+     @torch.no_grad()
+     def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
+         b, *_, device = *x.shape, x.device
+         model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond, clip_denoised=clip_denoised)
+         noise = noise_like(x.shape, device, repeat_noise)
+         # no noise when t == 0
+         nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
+         return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+
+     def q_sample(self, x_start, t, noise=None):
+         noise = default(noise, lambda: torch.randn_like(x_start))
+         return (
+                 extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
+                 extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
+         )
+
+     def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
+         noise = default(noise, lambda: torch.randn_like(x_start))
+
+         x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+         x_recon = self.denoise_fn(x_noisy, t, cond)
+
+         if self.loss_type == 'l1':
+             if nonpadding is not None:
+                 loss = ((noise - x_recon).abs() * nonpadding.unsqueeze(1)).mean()
+             else:
+                 # print('are you sure w/o nonpadding?')
+                 loss = (noise - x_recon).abs().mean()
+
+         elif self.loss_type == 'l2':
+             loss = F.mse_loss(noise, x_recon)
+         else:
+             raise NotImplementedError()
+
+         return loss
+
+     def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None,
+                 ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
+         b, *_, device = *txt_tokens.shape, txt_tokens.device
+         ret = self.fs2(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id,
+                        f0=f0, uv=uv, energy=energy, infer=infer, skip_decoder=(not infer), **kwargs)
+         # (txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
+         #  skip_decoder=(not infer), infer=infer, **kwargs)
+         cond = ret['decoder_inp'].transpose(1, 2)
+
+         if not infer:
+             t = torch.randint(0, self.K_step, (b,), device=device).long()
+             x = ref_mels
+             x = self.norm_spec(x)
+             x = x.transpose(1, 2)[:, None, :, :]  # [B, 1, M, T]
+             ret['diff_loss'] = self.p_losses(x, t, cond)
+             # nonpadding = (mel2ph != 0).float()
+             # ret['diff_loss'] = self.p_losses(x, t, cond, nonpadding=nonpadding)
+             ret['mel_out'] = None
+         else:
+             ret['fs2_mel'] = ret['mel_out']
+             fs2_mels = ret['mel_out']
+             t = self.K_step
+             fs2_mels = self.norm_spec(fs2_mels)
+             fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
+
+             x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
+             if self.hparams.get('gaussian_start') is not None and self.hparams['gaussian_start']:
+                 print('===> gaussian start.')
+                 shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
+                 x = torch.randn(shape, device=device)
+             for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
+                 x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
+             x = x[:, 0].transpose(1, 2)
+             ret['mel_out'] = self.denorm_spec(x)
+
+         return ret
+
+     def norm_spec(self, x):
+         return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
+
+     def denorm_spec(self, x):
+         return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
+
+     def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph):
+         return self.fs2.cwt2f0_norm(cwt_spec, mean, std, mel2ph)
+
+     def out2mel(self, x):
+         return x
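
The heart of the `infer` branch above is where the reverse process starts. A self-contained sketch of that choice, using the same `linear_beta_schedule` constants as the config (`timesteps=100`, `K_step=71`, `max_beta=0.06`); `fs2_mel` stands in for the normalized aux-decoder output:

```python
import numpy as np
import torch

timesteps, K_step, max_beta = 100, 71, 0.06
betas = torch.tensor(np.linspace(1e-4, max_beta, timesteps), dtype=torch.float32)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

fs2_mel = torch.randn(1, 1, 80, 120)  # stand-in for norm_spec(fs2 mel), [B, 1, M, T]
gaussian_start = False

if gaussian_start:
    # Plain DDPM start: pure noise (the reverse loop above still runs K_step steps).
    x = torch.randn_like(fs2_mel)
else:
    # Shallow start: q_sample the aux mel forward to step K_step - 1, so the
    # denoiser only has to undo the last K_step steps of the forward process.
    t = K_step - 1
    noise = torch.randn_like(fs2_mel)
    x = alphas_cumprod[t].sqrt() * fs2_mel + (1.0 - alphas_cumprod[t]).sqrt() * noise
```
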
tasks/tts/diffspeech.py ADDED
@@ -0,0 +1,111 @@
+ import torch
+
+ from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion
+ from tasks.tts.fs2_orig import FastSpeech2OrigTask
+
+ import utils
+ from utils.commons.hparams import hparams
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.audio.pitch.utils import denorm_f0
+
+
+ class DiffSpeechTask(FastSpeech2OrigTask):
+     def build_tts_model(self):
+         # get min and max
+         # import torch
+         # from tqdm import tqdm
+         # v_min = torch.ones([80]) * 100
+         # v_max = torch.ones([80]) * -100
+         # for i, ds in enumerate(tqdm(self.dataset_cls('train'))):
+         #     v_max = torch.max(torch.max(ds['mel'].reshape(-1, 80), 0)[0], v_max)
+         #     v_min = torch.min(torch.min(ds['mel'].reshape(-1, 80), 0)[0], v_min)
+         #     if i % 100 == 0:
+         #         print(i, v_min, v_max)
+         # print('final', v_min, v_max)
+         dict_size = len(self.token_encoder)
+         self.model = GaussianDiffusion(dict_size, hparams)
+         if hparams['fs2_ckpt'] != '':
+             load_ckpt(self.model.fs2, hparams['fs2_ckpt'], 'model', strict=True)
+             for k, v in self.model.fs2.named_parameters():
+                 if 'predictor' not in k:
+                     v.requires_grad = False
+         # or
+         # for k, v in self.model.fs2.named_parameters():
+         #     v.requires_grad = False
+
+     def build_optimizer(self, model):
+         self.optimizer = optimizer = torch.optim.AdamW(
+             filter(lambda p: p.requires_grad, model.parameters()),
+             lr=hparams['lr'],
+             betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']),
+             weight_decay=hparams['weight_decay'])
+         return optimizer
+
+     def build_scheduler(self, optimizer):
+         return torch.optim.lr_scheduler.StepLR(optimizer, hparams['decay_steps'], gamma=0.5)
+
+     def run_model(self, sample, infer=False, *args, **kwargs):
+         txt_tokens = sample['txt_tokens']  # [B, T_t]
+         spk_embed = sample.get('spk_embed')
+         spk_id = sample.get('spk_ids')
+         if not infer:
+             target = sample['mels']  # [B, T_s, 80]
+             mel2ph = sample['mel2ph']  # [B, T_s]
+             f0 = sample.get('f0')
+             uv = sample.get('uv')
+             output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id,
+                                 ref_mels=target, f0=f0, uv=uv, infer=False)
+             losses = {}
+             if 'diff_loss' in output:
+                 losses['mel'] = output['diff_loss']
+             self.add_dur_loss(output['dur'], mel2ph, txt_tokens, losses=losses)
+             if hparams['use_pitch_embed']:
+                 self.add_pitch_loss(output, sample, losses)
+             return losses, output
+         else:
+             use_gt_dur = kwargs.get('infer_use_gt_dur', hparams['use_gt_dur'])
+             use_gt_f0 = kwargs.get('infer_use_gt_f0', hparams['use_gt_f0'])
+             mel2ph, uv, f0 = None, None, None
+             if use_gt_dur:
+                 mel2ph = sample['mel2ph']
+             if use_gt_f0:
+                 f0 = sample['f0']
+                 uv = sample['uv']
+             output = self.model(txt_tokens, mel2ph=mel2ph, spk_embed=spk_embed, spk_id=spk_id,
+                                 ref_mels=None, f0=f0, uv=uv, infer=True)
+             return output
+
+     def save_valid_result(self, sample, batch_idx, model_out):
+         sr = hparams['audio_sample_rate']
+         f0_gt = None
+         # mel_out = model_out['mel_out']
+         if sample.get('f0') is not None:
+             f0_gt = denorm_f0(sample['f0'][0].cpu(), sample['uv'][0].cpu())
+         # self.plot_mel(batch_idx, sample['mels'], mel_out, f0s=f0_gt)
+         if self.global_step > 0:
+             # wav_pred = self.vocoder.spec2wav(mel_out[0].cpu(), f0=f0_gt)
+             # self.logger.add_audio(f'wav_val_{batch_idx}', wav_pred, self.global_step, sr)
+             # with gt duration
+             model_out = self.run_model(sample, infer=True, infer_use_gt_dur=True)
+             dur_info = self.get_plot_dur_info(sample, model_out)
+             del dur_info['dur_pred']
+             wav_pred = self.vocoder.spec2wav(model_out['mel_out'][0].cpu(), f0=f0_gt)
+             self.logger.add_audio(f'wav_gdur_{batch_idx}', wav_pred, self.global_step, sr)
+             self.plot_mel(batch_idx, sample['mels'], model_out['mel_out'][0], f'diffmel_gdur_{batch_idx}',
+                           dur_info=dur_info, f0s=f0_gt)
+             self.plot_mel(batch_idx, sample['mels'], model_out['fs2_mel'][0], f'fs2mel_gdur_{batch_idx}',
+                           dur_info=dur_info, f0s=f0_gt)  # gt mel vs. fs2 mel
+
+             # with pred duration
+             if not hparams['use_gt_dur']:
+                 model_out = self.run_model(sample, infer=True, infer_use_gt_dur=False)
+                 dur_info = self.get_plot_dur_info(sample, model_out)
+                 self.plot_mel(batch_idx, sample['mels'], model_out['mel_out'][0], f'mel_pdur_{batch_idx}',
+                               dur_info=dur_info, f0s=f0_gt)
+                 wav_pred = self.vocoder.spec2wav(model_out['mel_out'][0].cpu(), f0=f0_gt)
+                 self.logger.add_audio(f'wav_pdur_{batch_idx}', wav_pred, self.global_step, sr)
+         # gt wav
+         if self.global_step <= hparams['valid_infer_interval']:
+             mel_gt = sample['mels'][0].cpu()
+             wav_gt = self.vocoder.spec2wav(mel_gt, f0=f0_gt)
+             self.logger.add_audio(f'wav_gt_{batch_idx}', wav_gt, self.global_step, sr)
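
Because `build_tts_model` clears `requires_grad` on every `fs2` parameter whose name lacks `'predictor'`, the AdamW in `build_optimizer` (which filters on `requires_grad`) only updates the diffusion denoiser plus the duration/pitch predictors. A small hedged helper to confirm the split on a built model:

```python
def count_params(model):
    # Split parameter counts by requires_grad to verify the fs2 freeze above.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return trainable, frozen

# usage sketch: trainable, frozen = count_params(task.model)
```
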