Spaces:

Rongjiehuang
/

ProDiff

Runtime error

App Files Files Community

Rongjiehuang committed on Aug 26, 2022

Commit

64e7f2f

0 Parent(s):

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +32 -0
.gitignore +151 -0
LICENSE +21 -0
README.md +10 -0
checkpoints/FastDiff/config.yaml +149 -0
checkpoints/FastDiff/model_ckpt_steps_500000.ckpt +3 -0
checkpoints/ProDiff/config.yaml +205 -0
checkpoints/ProDiff/model_ckpt_steps_200000.ckpt +3 -0
checkpoints/ProDiff_Teacher/config.yaml +205 -0
checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt +3 -0
data/binary/LJSpeech/phone_set.json +1 -0
data/binary/LJSpeech/spk_map.json +1 -0
data/binary/LJSpeech/train_f0s_mean_std.npy +3 -0
data_gen/tts/base_binarizer.py +224 -0
data_gen/tts/base_preprocess.py +245 -0
data_gen/tts/bin/binarize.py +20 -0
data_gen/tts/data_gen_utils.py +352 -0
data_gen/tts/txt_processors/__init__.py +1 -0
data_gen/tts/txt_processors/base_text_processor.py +47 -0
data_gen/tts/txt_processors/en.py +77 -0
data_gen/tts/wav_processors/__init__.py +2 -0
data_gen/tts/wav_processors/base_processor.py +25 -0
data_gen/tts/wav_processors/common_processors.py +86 -0
egs/datasets/audio/libritts/base_text2mel.yaml +14 -0
egs/datasets/audio/libritts/fs2.yaml +3 -0
egs/datasets/audio/libritts/pre_align.py +18 -0
egs/datasets/audio/libritts/pwg.yaml +8 -0
egs/datasets/audio/lj/base_mel2wav.yaml +5 -0
egs/datasets/audio/lj/pre_align.py +13 -0
egs/datasets/audio/lj/pwg.yaml +3 -0
egs/datasets/audio/vctk/base_mel2wav.yaml +3 -0
egs/datasets/audio/vctk/fs2.yaml +12 -0
egs/datasets/audio/vctk/pre_align.py +22 -0
egs/datasets/audio/vctk/pwg.yaml +6 -0
egs/egs_bases/config_base.yaml +46 -0
egs/egs_bases/tts/base.yaml +112 -0
egs/egs_bases/tts/fs2.yaml +102 -0
egs/egs_bases/tts/vocoder/base.yaml +34 -0
egs/egs_bases/tts/vocoder/pwg.yaml +82 -0
inference/ProDiff.py +49 -0
inference/ProDiff_Teacher.py +41 -0
inference/base_tts_infer.py +167 -0
inference/gradio/gradio_settings.yaml +41 -0
inference/gradio/infer.py +69 -0
modules/FastDiff/config/FastDiff.yaml +7 -0
modules/FastDiff/config/FastDiff_libritts.yaml +7 -0
modules/FastDiff/config/FastDiff_sc09.yaml +25 -0
modules/FastDiff/config/FastDiff_tacotron.yaml +58 -0
modules/FastDiff/config/FastDiff_vctk.yaml +7 -0
modules/FastDiff/config/base.yaml +157 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,32 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,151 @@

+### Project ignore
+/ParallelWaveGAN
+/wavegan_pretrained*
+/pretrained_models
+rsync
+.idea
+.DS_Store
+bak
+tmp
+*.tar.gz
+# mfa and kaldi
+kaldi_align/exp
+mfa
+montreal-forced-aligner
+mos
+nbs
+/configs_usr/*
+!/configs_usr/.gitkeep
+/fast_transformers
+/rnnoise
+/usr/*
+!/usr/.gitkeep
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# Stray "git clean -n" output ("将删除" = "Would remove"), not an ignore rule: datasets/remi/test/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2021 Jinglin Liu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: ProDiff
+emoji: 🤗
+colorFrom: yellow
+colorTo: orange
+sdk: gradio
+app_file: "inference/gradio/infer.py"
+pinned: false
+---

checkpoints/FastDiff/config.yaml ADDED Viewed

	@@ -0,0 +1,149 @@

+N: ''
+T: 1000
+accumulate_grad_batches: 1
+amp: false
+audio_channels: 1
+audio_num_mel_bins: 80
+audio_sample_rate: 22050
+aux_context_window: 0
+beta_0: 1.0e-06
+beta_T: 0.01
+binarization_args:
+  reset_phone_dict: true
+  reset_word_dict: true
+  shuffle: false
+  trim_eos_bos: false
+  with_align: false
+  with_f0: false
+  with_f0cwt: false
+  with_linear: false
+  with_spk_embed: false
+  with_spk_id: true
+  with_txt: false
+  with_wav: true
+  with_word: false
+binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
+binary_data_dir: data/binary/LJSpeech
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+cond_channels: 80
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+diffusion_step_embed_dim_in: 128
+diffusion_step_embed_dim_mid: 512
+diffusion_step_embed_dim_out: 512
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+dropout: 0.0
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+eval_max_batches: -1
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+inner_channels: 32
+kpnet_conv_size: 3
+kpnet_hidden_channels: 64
+load_ckpt: ''
+loud_norm: false
+# Keep the '.' in the mantissa: YAML 1.1 loaders such as PyYAML read `2e-4` as the string '2e-4', not a float.
+lr: 2.0e-4
+lvc_kernel_size: 3
+lvc_layers_each_block: 4
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_samples: 25600
+max_sentences: 20
+max_tokens: 30000
+max_updates: 1000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: l1
+mel_vmax: 1.5
+mel_vmin: -6
+mfa_version: 2
+min_frames: 0
+min_level_db: -100
+noise_schedule: ''
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: -1
+num_spk: 400
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  nsample_per_mfa_group: 1000
+  sox_resample: false
+  sox_to_wav: false
+  trim_sil: false
+  txt_processor: en
+  use_tone: true
+pre_align_cls: egs.datasets.audio.pre_align.PreAlign
+print_nan_grads: false
+processed_data_dir: data/processed/LJSpeech
+profile_infer: false
+raw_data_dir: data/raw/LJSpeech-1.1
+ref_level_db: 20
+rename_tmux: true
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_mel_dir: ''
+test_num: 100
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_ratios:
+- 8
+- 8
+- 4
+use_pitch_embed: false
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_wav: true
+use_weight_norm: true
+use_word_input: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 1024
+window: hann
+word_size: 30000
+work_dir: checkpoints/FastDiff

checkpoints/FastDiff/model_ckpt_steps_500000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee7b6022e525c71a6025b41eeeafff9d6186b52cba76b580d6986bc8674902f3
+size 183951271

checkpoints/ProDiff/config.yaml ADDED Viewed

	@@ -0,0 +1,205 @@

+accumulate_grad_batches: 1
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 22050
+base_config:
+- ./base.yaml
+binarization_args:
+  reset_phone_dict: true
+  reset_word_dict: true
+  shuffle: false
+  trim_eos_bos: false
+  trim_sil: false
+  with_align: true
+  with_f0: true
+  with_f0cwt: false
+  with_linear: false
+  with_spk_embed: false
+  with_spk_id: true
+  with_txt: true
+  with_wav: false
+  with_word: true
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+binary_data_dir: data/binary/LJSpeech
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+conv_use_pos: false
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_dilations:
+- 1
+- 1
+- 1
+- 1
+dec_ffn_kernel_size: 9
+dec_inp_add_noise: false
+dec_kernel_size: 5
+dec_layers: 4
+dec_num_heads: 2
+decoder_rnn_dim: 0
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 1
+dropout: 0.1
+ds_workers: 2
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+enc_dec_norm: ln
+enc_dilations:
+- 1
+- 1
+- 1
+- 1
+enc_ffn_kernel_size: 9
+enc_kernel_size: 5
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_hidden_size: 1024
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+frames_multiple: 1
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+layers_in_block: 2
+load_ckpt: ''
+loud_norm: false
+lr: 1.0
+max_beta: 0.06
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 32000
+max_updates: 200000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 0
+min_level_db: -100
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: -1
+num_spk: 1
+num_test_samples: 0
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_ar: false
+pitch_embed_type: 0
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: standard
+pitch_ssim_win: 11
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  sox_resample: false
+  sox_to_wav: false
+  trim_sil: false
+  txt_processor: en
+  use_tone: true
+pre_align_cls: ''
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+pretrain_fs_ckpt: ''
+print_nan_grads: false
+processed_data_dir: data/processed/LJSpeech
+profile_infer: false
+raw_data_dir: data/raw/LJSpeech
+ref_hidden_stride_kernel:
+- 0,3,5
+- 0,3,5
+- 0,2,5
+- 0,2,5
+- 0,2,5
+ref_level_db: 20
+ref_norm_layer: bn
+rename_tmux: true
+residual_channels: 256
+residual_layers: 20
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: false
+save_gt: true
+schedule_type: vpsde
+scheduler: rsqrt
+seed: 1234
+sil_add_noise: false
+sort_by_len: true
+spec_max: []
+spec_min: []
+task_cls: modules.ProDiff.task.ProDiff_task.ProDiff_Task
+tb_log_interval: 100
+teacher_ckpt: checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
+test_ids: []
+test_input_dir: ''
+test_num: 100
+test_set_name: test
+timesteps: 4
+train_set_name: train
+train_sets: ''
+use_cond_disc: true
+use_energy_embed: true
+use_gt_dur: true
+use_gt_f0: true
+use_pitch_embed: true
+use_pos_embed: true
+use_ref_enc: false
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+var_enc_vq_codes: 64
+vocoder_denoise_c: 0.0
+warmup_updates: 2000
+weight_decay: 0
+win_size: 1024
+word_size: 30000
+work_dir: checkpoints/ProDiff

checkpoints/ProDiff/model_ckpt_steps_200000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8cc8aad355c297b010e2c362341f736b3477744af76e02f6c9965409a7e9113a
+size 349055740

checkpoints/ProDiff_Teacher/config.yaml ADDED Viewed

	@@ -0,0 +1,205 @@

+accumulate_grad_batches: 1
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 22050
+base_config:
+- ./base.yaml
+binarization_args:
+  reset_phone_dict: true
+  reset_word_dict: true
+  shuffle: false
+  trim_eos_bos: false
+  trim_sil: false
+  with_align: true
+  with_f0: true
+  with_f0cwt: false
+  with_linear: false
+  with_spk_embed: false
+  with_spk_id: true
+  with_txt: true
+  with_wav: false
+  with_word: true
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+binary_data_dir: data/binary/LJSpeech
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+conv_use_pos: false
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_dilations:
+- 1
+- 1
+- 1
+- 1
+dec_ffn_kernel_size: 9
+dec_inp_add_noise: false
+dec_kernel_size: 5
+dec_layers: 4
+dec_num_heads: 2
+decoder_rnn_dim: 0
+decoder_type: fft
+dict_dir: ''
+diff_decoder_type: wavenet
+diff_loss_type: l1
+dilation_cycle_length: 1
+dropout: 0.1
+ds_workers: 2
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+enc_dec_norm: ln
+enc_dilations:
+- 1
+- 1
+- 1
+- 1
+enc_ffn_kernel_size: 9
+enc_kernel_size: 5
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_hidden_size: 1024
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+frames_multiple: 1
+gen_dir_name: ''
+gen_tgt_spk_id: -1
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+keep_bins: 80
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+layers_in_block: 2
+load_ckpt: ''
+loud_norm: false
+lr: 1.0
+max_beta: 0.06
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_sentences: 48
+max_tokens: 32000
+max_updates: 200000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 0
+min_level_db: -100
+num_ckpt_keep: 3
+num_heads: 2
+num_sanity_val_steps: -1
+num_spk: 1
+num_test_samples: 20
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_ar: false
+pitch_embed_type: 0
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: standard
+pitch_ssim_win: 11
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  sox_resample: false
+  sox_to_wav: false
+  trim_sil: false
+  txt_processor: en
+  use_tone: true
+pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
+predictor_dropout: 0.5
+predictor_grad: 0.1
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+pretrain_fs_ckpt: ''
+print_nan_grads: false
+processed_data_dir: data/processed/LJSpeech
+profile_infer: false
+raw_data_dir: data/raw/LJSpeech
+ref_hidden_stride_kernel:
+- 0,3,5
+- 0,3,5
+- 0,2,5
+- 0,2,5
+- 0,2,5
+ref_level_db: 20
+ref_norm_layer: bn
+rename_tmux: true
+residual_channels: 256
+residual_layers: 20
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: false
+save_gt: true
+schedule_type: vpsde
+scheduler: rsqrt
+seed: 1234
+sil_add_noise: false
+sort_by_len: true
+spec_max: []
+spec_min: []
+task_cls: modules.ProDiff.task.ProDiff_teacher_task.ProDiff_teacher_Task
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 100
+test_set_name: test
+timescale: 1
+timesteps: 4
+train_set_name: train
+train_sets: ''
+use_cond_disc: true
+use_energy_embed: true
+use_gt_dur: true
+use_gt_f0: true
+use_pitch_embed: true
+use_pos_embed: true
+use_ref_enc: false
+use_spk_embed: false
+use_spk_id: false
+use_split_spk_id: false
+use_uv: true
+use_var_enc: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+var_enc_vq_codes: 64
+vocoder_denoise_c: 0.0
+warmup_updates: 2000
+weight_decay: 0
+win_size: 1024
+word_size: 30000
+# Matches the directory this config and its checkpoint are stored in.
+work_dir: checkpoints/ProDiff_Teacher

checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d3d02a215431c69dd54c1413b9a02cdc32795e2039ad9be857b12e85c470eea
+size 342252871

data/binary/LJSpeech/phone_set.json ADDED Viewed

	@@ -0,0 +1 @@

+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]

data/binary/LJSpeech/spk_map.json ADDED Viewed

	@@ -0,0 +1 @@


+ {"SPK1": 0}

data/binary/LJSpeech/train_f0s_mean_std.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8790d5a84d77143690ae71a1f1e7fc81359e69ead263dc440366f2164c739efd
+size 144

data_gen/tts/base_binarizer.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+from utils.multiprocess_utils import chunked_multiprocess_run
+import random
+import traceback
+import json
+from resemblyzer import VoiceEncoder
+from tqdm import tqdm
+from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
+from utils.hparams import set_hparams, hparams
+import numpy as np
+from utils.indexed_datasets import IndexedDatasetBuilder
+from vocoders.base_vocoder import VOCODERS
+import pandas as pd
+class BinarizationError(Exception):
+    """Raised when a single item cannot be binarized; `process_item` catches it and skips that item."""
+    pass
+class BaseBinarizer:
+    def __init__(self, processed_data_dir=None):
+        if processed_data_dir is None:
+            processed_data_dir = hparams['processed_data_dir']
+        self.processed_data_dirs = processed_data_dir.split(",")
+        self.binarization_args = hparams['binarization_args']
+        self.pre_align_args = hparams['pre_align_args']
+        self.forced_align = self.pre_align_args['forced_align']
+        tg_dir = None
+        if self.forced_align == 'mfa':
+            tg_dir = 'mfa_outputs'
+        if self.forced_align == 'kaldi':
+            tg_dir = 'kaldi_outputs'
+        self.item2txt = {}
+        self.item2ph = {}
+        self.item2wavfn = {}
+        self.item2tgfn = {}
+        self.item2spk = {}
+        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
+            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
+            for r_idx, r in self.meta_df.iterrows():
+                item_name = raw_item_name = r['item_name']
+                if len(self.processed_data_dirs) > 1:
+                    item_name = f'ds{ds_id}_{item_name}'
+                self.item2txt[item_name] = r['txt']
+                self.item2ph[item_name] = r['ph']
+                self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
+                self.item2spk[item_name] = r.get('spk', 'SPK1')
+                if len(self.processed_data_dirs) > 1:
+                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
+                if tg_dir is not None:
+                    self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
+        self.item_names = sorted(list(self.item2txt.keys()))
+        if self.binarization_args['shuffle']:
+            random.seed(1234)
+            random.shuffle(self.item_names)
+    @property
+    def train_item_names(self):
+        return self.item_names[hparams['test_num']+hparams['valid_num']:]
+    @property
+    def valid_item_names(self):
+        return self.item_names[0: hparams['test_num']+hparams['valid_num']]  #
+    @property
+    def test_item_names(self):
+        return self.item_names[0: hparams['test_num']]  # Audios for MOS testing are in 'test_ids'
+    def build_spk_map(self):
+        spk_map = set()
+        for item_name in self.item_names:
+            spk_name = self.item2spk[item_name]
+            spk_map.add(spk_name)
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        return spk_map
+    def item_name2spk_id(self, item_name):
+        return self.spk_map[self.item2spk[item_name]]
+    def _phone_encoder(self):
+        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
+        ph_set = []
+        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            for processed_data_dir in self.processed_data_dirs:
+                ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'))
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+        print("| phone set: ", ph_set)
+        return build_phone_encoder(hparams['binary_data_dir'])
+    def meta_data(self, prefix):
+        if prefix == 'valid':
+            item_names = self.valid_item_names
+        elif prefix == 'test':
+            item_names = self.test_item_names
+        else:
+            item_names = self.train_item_names
+        for item_name in item_names:
+            ph = self.item2ph[item_name]
+            txt = self.item2txt[item_name]
+            tg_fn = self.item2tgfn.get(item_name)
+            wav_fn = self.item2wavfn[item_name]
+            spk_id = self.item_name2spk_id(item_name)
+            yield item_name, ph, txt, tg_fn, wav_fn, spk_id
+    def process(self):
+        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+        self.spk_map = self.build_spk_map()
+        print("| spk_map: ", self.spk_map)
+        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
+        json.dump(self.spk_map, open(spk_map_fn, 'w'))
+        self.phone_encoder = self._phone_encoder()
+        self.process_data('valid')
+        self.process_data('test')
+        self.process_data('train')
+    def process_data(self, prefix):
+        data_dir = hparams['binary_data_dir']
+        args = []
+        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+        lengths = []
+        f0s = []
+        total_sec = 0
+        if self.binarization_args['with_spk_embed']:
+            voice_encoder = VoiceEncoder().cuda()
+        meta_data = list(self.meta_data(prefix))
+        for m in meta_data:
+            args.append(list(m) + [self.phone_encoder, self.binarization_args])
+        num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
+        for f_id, (_, item) in enumerate(
+                zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
+            if item is None:
+                continue
+            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
+                if self.binarization_args['with_spk_embed'] else None
+            if not self.binarization_args['with_wav'] and 'wav' in item:
+                print("del wav")
+                del item['wav']
+            builder.add_item(item)
+            lengths.append(item['len'])
+            total_sec += item['sec']
+            if item.get('f0') is not None:
+                f0s.append(item['f0'])
+        builder.finalize()
+        np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
+        if len(f0s) > 0:
+            f0s = np.concatenate(f0s, 0)
+            f0s = f0s[f0s != 0]
+            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
+        print(f"| {prefix} total duration: {total_sec:.3f}s")
+    @classmethod
+    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
+        if hparams['vocoder'] in VOCODERS:
+            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
+        else:
+            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
+        res = {
+            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
+            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
+        }
+        try:
+            if binarization_args['with_f0']:
+                cls.get_pitch(wav, mel, res)
+                if binarization_args['with_f0cwt']:
+                    cls.get_f0cwt(res['f0'], res)
+            if binarization_args['with_txt']:
+                try:
+                    phone_encoded = res['phone'] = encoder.encode(ph)
+                except:
+                    traceback.print_exc()
+                    raise BinarizationError(f"Empty phoneme")
+                if binarization_args['with_align']:
+                    cls.get_align(tg_fn, ph, mel, phone_encoded, res)
+        except BinarizationError as e:
+            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        return res
+    @staticmethod
+    def get_align(tg_fn, ph, mel, phone_encoded, res):
+        if tg_fn is not None and os.path.exists(tg_fn):
+            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+        else:
+            raise BinarizationError(f"Align not found")
+        if mel2ph.max() - 1 >= len(phone_encoded):
+            raise BinarizationError(
+                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
+        res['mel2ph'] = mel2ph
+        res['dur'] = dur
+    @staticmethod
+    def get_pitch(wav, mel, res):
+        f0, pitch_coarse = get_pitch(wav, mel, hparams)
+        if sum(f0) == 0:
+            raise BinarizationError("Empty f0")
+        res['f0'] = f0
+        res['pitch'] = pitch_coarse
+    @staticmethod
+    def get_f0cwt(f0, res):
+        from utils.cwt import get_cont_lf0, get_lf0_cwt
+        uv, cont_lf0_lpf = get_cont_lf0(f0)
+        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+        if np.any(np.isnan(Wavelet_lf0)):
+            raise BinarizationError("NaN CWT")
+        res['cwt_spec'] = Wavelet_lf0
+        res['cwt_scales'] = scales
+        res['f0_mean'] = logf0s_mean_org
+        res['f0_std'] = logf0s_std_org
+if __name__ == "__main__":
+    # Script entry point: set_hparams() is expected to populate the global `hparams`
+    # (TODO confirm), after which the default binarizer processes all splits.
+    set_hparams()
+    BaseBinarizer().process()

data_gen/tts/base_preprocess.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import json
+import os
+import random
+import re
+import traceback
+from collections import Counter
+from functools import partial
+import librosa
+from tqdm import tqdm
+from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
+from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
+from utils.hparams import hparams
+from utils.multiprocess_utils import multiprocess_run_tqdm
+from utils.os_utils import link_file, move_file, remove_file
+from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
+class BasePreprocessor:
+    def __init__(self):
+        self.preprocess_args = hparams['preprocess_args']
+        txt_processor = self.preprocess_args['txt_processor']
+        self.txt_processor = get_txt_processor_cls(txt_processor)
+        self.raw_data_dir = hparams['raw_data_dir']
+        self.processed_dir = hparams['processed_data_dir']
+        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
+    def meta_data(self):
+        """
+        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
+        """
+        raise NotImplementedError
+    def process(self):
+        processed_dir = self.processed_dir
+        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
+        remove_file(wav_processed_tmp_dir)
+        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
+        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
+        remove_file(wav_processed_dir)
+        os.makedirs(wav_processed_dir, exist_ok=True)
+        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
+        item_names = [d['item_name'] for d in meta_data]
+        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
+        # preprocess data
+        phone_list = []
+        word_list = []
+        spk_names = set()
+        process_item = partial(self.preprocess_first_pass,
+                               txt_processor=self.txt_processor,
+                               wav_processed_dir=wav_processed_dir,
+                               wav_processed_tmp=wav_processed_tmp_dir,
+                               preprocess_args=self.preprocess_args)
+        items = []
+        args = [{
+            'item_name': item_raw['item_name'],
+            'txt_raw': item_raw['txt'],
+            'wav_fn': item_raw['wav_fn'],
+            'txt_loader': item_raw.get('txt_loader'),
+            'others': item_raw.get('others', None)
+        } for item_raw in meta_data]
+        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
+            if item is not None:
+                item_.update(item)
+                item = item_
+                if 'txt_loader' in item:
+                    del item['txt_loader']
+                item['id'] = item_id
+                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
+                item['others'] = item.get('others', None)
+                phone_list += item['ph'].split(" ")
+                word_list += item['word'].split(" ")
+                spk_names.add(item['spk_name'])
+                items.append(item)
+        # add encoded tokens
+        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
+        spk_map = self.build_spk_map(spk_names)
+        args = [{
+            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
+            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
+        } for item in items]
+        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
+            items[idx].update(item_new_kv)
+        # build mfa data
+        if self.preprocess_args['use_mfa']:
+            mfa_dict = set()
+            mfa_input_dir = f'{processed_dir}/mfa_inputs'
+            remove_file(mfa_input_dir)
+            # group MFA inputs for better parallelism
+            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
+            if self.preprocess_args['mfa_group_shuffle']:
+                random.seed(hparams['seed'])
+                random.shuffle(mfa_groups)
+            args = [{
+                'item': item, 'mfa_input_dir': mfa_input_dir,
+                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
+                'preprocess_args': self.preprocess_args
+            } for item, mfa_group in zip(items, mfa_groups)]
+            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
+                    self.build_mfa_inputs, args, desc='Build MFA data'):
+                items[i]['wav_align_fn'] = new_wav_align_fn
+                for w in ph_gb_word_nosil.split(" "):
+                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
+            mfa_dict = sorted(mfa_dict)
+            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
+                f.writelines([f'{l}\n' for l in mfa_dict])
+        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
+            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
+        remove_file(wav_processed_tmp_dir)
+    @classmethod
+    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
+                              wav_fn, wav_processed_dir, wav_processed_tmp,
+                              preprocess_args, txt_loader=None, others=None):
+        try:
+            if txt_loader is not None:
+                txt_raw = txt_loader(txt_raw)
+            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
+            wav_fn, wav_align_fn = cls.process_wav(
+                item_name, wav_fn,
+                hparams['processed_data_dir'],
+                wav_processed_tmp, preprocess_args)
+            # wav for binarization
+            ext = os.path.splitext(wav_fn)[1]
+            os.makedirs(wav_processed_dir, exist_ok=True)
+            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
+            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
+            move_link_func(wav_fn, new_wav_fn)
+            return {
+                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
+                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
+                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
+                'others': others
+            }
+        except:
+            traceback.print_exc()
+            print(f"| Error is caught. item_name: {item_name}.")
+            return None
+    @staticmethod
+    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
+        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
+        ph = [p for w in txt_struct for p in w[1]]
+        return " ".join(ph), txt
+    @staticmethod
+    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
+        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
+        processors = [k() for k in processors if k is not None]
+        if len(processors) >= 1:
+            sr_file = librosa.core.get_samplerate(wav_fn)
+            output_fn_for_align = None
+            ext = os.path.splitext(wav_fn)[1]
+            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
+            link_file(wav_fn, input_fn)
+            for p in processors:
+                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
+                if len(outputs) == 3:
+                    input_fn, sr, output_fn_for_align = outputs
+                else:
+                    input_fn, sr = outputs
+            return input_fn, output_fn_for_align
+        else:
+            return wav_fn, wav_fn
+    def _phone_encoder(self, ph_set):
+        ph_set_fn = f"{self.processed_dir}/phone_set.json"
+        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
+            print("| Build phone set: ", ph_set)
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+            print("| Load phone set: ", ph_set)
+        return build_token_encoder(ph_set_fn)
+    def _word_encoder(self, word_set):
+        word_set_fn = f"{self.processed_dir}/word_set.json"
+        if self.preprocess_args['reset_word_dict']:
+            word_set = Counter(word_set)
+            total_words = sum(word_set.values())
+            word_set = word_set.most_common(hparams['word_dict_size'])
+            num_unk_words = total_words - sum([x[1] for x in word_set])
+            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
+            word_set = sorted(set(word_set))
+            json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
+            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+        else:
+            word_set = json.load(open(word_set_fn, 'r'))
+            print("| Load word set. Size: ", len(word_set), word_set[:10])
+        return build_token_encoder(word_set_fn)
+    @classmethod
+    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
+        word_token = word_encoder.encode(word)
+        ph_token = ph_encoder.encode(ph)
+        spk_id = spk_map[spk_name]
+        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
+    def build_spk_map(self, spk_names):
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
+        json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
+        return spk_map
+    @classmethod
+    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
+        item_name = item['item_name']
+        wav_align_fn = item['wav_align_fn']
+        ph_gb_word = item['ph_gb_word']
+        ext = os.path.splitext(wav_align_fn)[1]
+        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
+        os.makedirs(mfa_input_group_dir, exist_ok=True)
+        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
+        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
+        move_link_func(wav_align_fn, new_wav_align_fn)
+        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
+                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
+        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
+            f_txt.write(ph_gb_word_nosil)
+        return ph_gb_word_nosil, new_wav_align_fn
+    def load_spk_map(self, base_dir):
+        spk_map_fn = f"{base_dir}/spk_map.json"
+        spk_map = json.load(open(spk_map_fn, 'r'))
+        return spk_map
+    def load_dict(self, base_dir):
+        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
+        return ph_encoder
+    @property
+    def meta_csv_filename(self):
+        return 'metadata'
+    @property
+    def wav_processed_dirname(self):
+        return 'wav_processed'

data_gen/tts/bin/binarize.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+import importlib
+from utils.hparams import set_hparams, hparams
+def binarize():
+    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
+    pkg = ".".join(binarizer_cls.split(".")[:-1])
+    cls_name = binarizer_cls.split(".")[-1]
+    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
+    print("| Binarizer: ", binarizer_cls)
+    binarizer_cls().process()
+# Script entry point: hparams are loaded first, then the configured binarizer is run.
+if __name__ == '__main__':
+    set_hparams()
+    binarize()

data_gen/tts/data_gen_utils.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import warnings
+warnings.filterwarnings("ignore")
+# import parselmouth
+import os
+import torch
+from skimage.transform import resize
+from utils.text_encoder import TokenTextEncoder
+from utils.pitch_utils import f0_to_coarse
+import struct
+import webrtcvad
+from scipy.ndimage.morphology import binary_dilation
+import librosa
+import numpy as np
+from utils import audio
+import pyloudnorm as pyln
+import re
+import json
+from collections import OrderedDict
+# Punctuation characters recognised by the text front-end.
+PUNCS = '!,.?;:'
+# Largest positive signed 16-bit value; used to scale float audio to int16 PCM.
+int16_max = (2 ** 15) - 1
+def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in params.py.
+    :param wav: the raw waveform as a numpy array of floats
+    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    ## Voice Activation Detection
+    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+    # This sets the granularity of the VAD. Should not need to be changed.
+    sampling_rate = 16000
+    wav_raw, sr = librosa.core.load(path, sr=sr)
+    if norm:
+        meter = pyln.Meter(sr)  # create BS.1770 meter
+        loudness = meter.integrated_loudness(wav_raw)
+        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
+        if np.abs(wav_raw).max() > 1.0:
+            wav_raw = wav_raw / np.abs(wav_raw).max()
+    wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
+    vad_window_length = 30  # In milliseconds
+    # Number of frames to average together when performing the moving average smoothing.
+    # The larger this value, the larger the VAD variations must be to not get smoothed out.
+    vad_moving_average_width = 8
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool)
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
+    if return_raw_wav:
+        return wav_raw, audio_mask, sr
+    return wav_raw[audio_mask], audio_mask, sr
+def process_utterance(wav_path,
+                      fft_size=1024,
+                      hop_size=256,
+                      win_length=1024,
+                      window="hann",
+                      num_mels=80,
+                      fmin=80,
+                      fmax=7600,
+                      eps=1e-6,
+                      sample_rate=22050,
+                      loud_norm=False,
+                      min_level_db=-100,
+                      return_linear=False,
+                      trim_long_sil=False, vocoder='pwg'):
+    if isinstance(wav_path, str):
+        if trim_long_sil:
+            wav, _, _ = trim_long_silences(wav_path, sample_rate)
+        else:
+            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
+    else:
+        wav = wav_path
+    if loud_norm:
+        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
+        loudness = meter.integrated_loudness(wav)
+        wav = pyln.normalize.loudness(wav, loudness, -22.0)
+        if np.abs(wav).max() > 1:
+            wav = wav / np.abs(wav).max()
+    # get amplitude spectrogram
+    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
+                          win_length=win_length, window=window, pad_mode="constant")
+    spc = np.abs(x_stft)  # (n_bins, T)
+    # get mel basis
+    fmin = 0 if fmin == -1 else fmin
+    fmax = sample_rate / 2 if fmax == -1 else fmax
+    mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
+    mel = mel_basis @ spc
+    if vocoder == 'pwg':
+        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
+    else:
+        assert False, f'"{vocoder}" is not in ["pwg"].'
+    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
+    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
+    wav = wav[:mel.shape[1] * hop_size]
+    if not return_linear:
+        return wav, mel
+    else:
+        spc = audio.amp_to_db(spc)
+        spc = audio.normalize(spc, {'min_level_db': min_level_db})
+        return wav, mel, spc
+def get_pitch(wav_data, mel, hparams):
+    """
+    :param wav_data: [T]
+    :param mel: [T, 80]
+    :param hparams:
+    :return:
+    """
+    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
+    f0_min = 80
+    f0_max = 750
+    if hparams['hop_size'] == 128:
+        pad_size = 4
+    elif hparams['hop_size'] == 256:
+        pad_size = 2
+    else:
+        assert False
+    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
+        time_step=time_step / 1000, voicing_threshold=0.6,
+        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+    lpad = pad_size * 2
+    rpad = len(mel) - len(f0) - lpad
+    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
+    # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
+    # Attention: we find that new version of some libraries could cause ``rpad'' to be a negetive value...
+    # Just to be sure, we recommend users to set up the same environments as them in requirements_auto.txt (by Anaconda)
+    delta_l = len(mel) - len(f0)
+    assert np.abs(delta_l) <= 8
+    if delta_l > 0:
+        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
+    f0 = f0[:len(mel)]
+    pitch_coarse = f0_to_coarse(f0)
+    return f0, pitch_coarse
+def remove_empty_lines(text):
+    """remove empty lines"""
+    assert (len(text) > 0)
+    assert (isinstance(text, list))
+    text = [t.strip() for t in text]
+    if "" in text:
+        text.remove("")
+    return text
+class TextGrid(object):
+    """
+    Minimal line-oriented TextGrid parser.
+    The constructor consumes the lines in order: file type, time interval, tier count, then each tier.
+    `self.line_count` is the cursor into `self.text`; every `_extract_pattern` call advances it.
+    All extracted values are kept as strings, except `self.size`, which is converted to int.
+    """
+    def __init__(self, text):
+        # `text` is a list of lines; they are stripped and filtered by remove_empty_lines before parsing.
+        text = remove_empty_lines(text)
+        self.text = text
+        self.line_count = 0
+        self._get_type()
+        self._get_time_intval()
+        self._get_size()
+        self.tier_list = []
+        self._get_item_list()
+    def _extract_pattern(self, pattern, inc):
+        """
+        Match `pattern` at the start of the current line and advance the cursor.
+        Parameters
+        ----------
+        pattern : regex to extract pattern
+        inc : increment of line count after extraction
+        Returns
+        -------
+        group : extracted info
+        Raises
+        ------
+        ValueError : when the current line does not match `pattern`
+        """
+        try:
+            # re.match returns None on a mismatch, so `.group` raises AttributeError.
+            group = re.match(pattern, self.text[self.line_count]).group(1)
+            self.line_count += inc
+        except AttributeError:
+            raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
+        return group
+    def _get_type(self):
+        # inc=2 also skips the line that follows the file-type line.
+        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
+    def _get_time_intval(self):
+        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
+        # inc=2 also skips the line that follows xmax.
+        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
+    def _get_size(self):
+        # Number of tiers; inc=2 also skips the line that follows the size line.
+        self.size = int(self._extract_pattern(r"size = (.*)", 2))
+    def _get_item_list(self):
+        """Only supports IntervalTier currently"""
+        for itemIdx in range(1, self.size + 1):
+            tier = OrderedDict()
+            item_list = []
+            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
+            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
+            if tier_class != "IntervalTier":
+                raise NotImplementedError("Only IntervalTier class is supported currently")
+            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
+            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
+            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
+            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
+            # Each interval spans four lines: index, xmin, xmax, text.
+            for i in range(int(tier_size)):
+                item = OrderedDict()
+                item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
+                item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
+                item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
+                item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
+                item_list.append(item)
+            tier["idx"] = tier_idx
+            tier["class"] = tier_class
+            tier["name"] = tier_name
+            tier["xmin"] = tier_xmin
+            tier["xmax"] = tier_xmax
+            tier["size"] = tier_size
+            tier["items"] = item_list
+            self.tier_list.append(tier)
+    def toJson(self):
+        """Serialise the parsed grid (file type, time range, tier count and tiers) to a JSON string."""
+        _json = OrderedDict()
+        _json["file_type"] = self.file_type
+        _json["xmin"] = self.xmin
+        _json["xmax"] = self.xmax
+        _json["size"] = self.size
+        _json["tiers"] = self.tier_list
+        return json.dumps(_json, ensure_ascii=False, indent=2)
+def get_mel2ph(tg_fn, ph, mel, hparams):
+    ph_list = ph.split(" ")
+    with open(tg_fn, "r") as f:
+        tg = f.readlines()
+    tg = remove_empty_lines(tg)
+    tg = TextGrid(tg)
+    tg = json.loads(tg.toJson())
+    split = np.ones(len(ph_list) + 1, np.float) * -1
+    tg_idx = 0
+    ph_idx = 0
+    tg_align = [x for x in tg['tiers'][-1]['items']]
+    tg_align_ = []
+    for x in tg_align:
+        x['xmin'] = float(x['xmin'])
+        x['xmax'] = float(x['xmax'])
+        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
+            x['text'] = ''
+            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
+                tg_align_[-1]['xmax'] = x['xmax']
+                continue
+        tg_align_.append(x)
+    tg_align = tg_align_
+    tg_len = len([x for x in tg_align if x['text'] != ''])
+    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
+    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
+    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
+        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
+            split[ph_idx] = 1e8
+            ph_idx += 1
+            continue
+        x = tg_align[tg_idx]
+        if x['text'] == '' and ph_idx == len(ph_list):
+            tg_idx += 1
+            continue
+        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
+        ph = ph_list[ph_idx]
+        if x['text'] == '' and not is_sil_phoneme(ph):
+            assert False, (ph_list, tg_align)
+        if x['text'] != '' and is_sil_phoneme(ph):
+            ph_idx += 1
+        else:
+            assert (x['text'] == '' and is_sil_phoneme(ph)) \
+                   or x['text'].lower() == ph.lower() \
+                   or x['text'].lower() == 'sil', (x['text'], ph)
+            split[ph_idx] = x['xmin']
+            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
+                split[ph_idx - 1] = split[ph_idx]
+            ph_idx += 1
+            tg_idx += 1
+    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
+    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
+    mel2ph = np.zeros([mel.shape[0]], np.int)
+    split[0] = 0
+    split[-1] = 1e8
+    for i in range(len(split) - 1):
+        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
+    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
+    for ph_idx in range(len(ph_list)):
+        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
+    mel2ph_torch = torch.from_numpy(mel2ph)
+    T_t = len(ph_list)
+    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
+    dur = dur[1:].numpy()
+    return mel2ph, dur
+def build_phone_encoder(data_dir):
+    phone_list_file = os.path.join(data_dir, 'phone_set.json')
+    phone_list = json.load(open(phone_list_file))
+    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
+def is_sil_phoneme(p):
+    return not p[0].isalpha()
+def build_token_encoder(token_list_file):
+    token_list = json.load(open(token_list_file))
+    return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')

data_gen/tts/txt_processors/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from . import en

data_gen/tts/txt_processors/base_text_processor.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from data_gen.tts.data_gen_utils import is_sil_phoneme
+REGISTERED_TEXT_PROCESSORS = {}
+def register_txt_processors(name):
+    def _f(cls):
+        REGISTERED_TEXT_PROCESSORS[name] = cls
+        return cls
+    return _f
+def get_txt_processor_cls(name):
+    return REGISTERED_TEXT_PROCESSORS.get(name, None)
+class BaseTxtProcessor:
+    @staticmethod
+    def sp_phonemes():
+        return ['|']
+    @classmethod
+    def process(cls, txt, preprocess_args):
+        raise NotImplementedError
+    @classmethod
+    def postprocess(cls, txt_struct, preprocess_args):
+        # remove sil phoneme in head and tail
+        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
+            txt_struct = txt_struct[1:]
+        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
+            txt_struct = txt_struct[:-1]
+        if preprocess_args['with_phsep']:
+            txt_struct = cls.add_bdr(txt_struct)
+        if preprocess_args['add_eos_bos']:
+            txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
+        return txt_struct
+    @classmethod
+    def add_bdr(cls, txt_struct):
+        txt_struct_ = []
+        for i, ts in enumerate(txt_struct):
+            txt_struct_.append(ts)
+            if i != len(txt_struct) - 1 and \
+                    not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
+                txt_struct_.append(['|', ['|']])
+        return txt_struct_

data_gen/tts/txt_processors/en.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import re
+import unicodedata
+from g2p_en import G2p
+from g2p_en.expand import normalize_numbers
+from nltk import pos_tag
+from nltk.tokenize import TweetTokenizer
+from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
+from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
+class EnG2p(G2p):
+    word_tokenize = TweetTokenizer().tokenize
+    def __call__(self, text):
+        # preprocessing
+        words = EnG2p.word_tokenize(text)
+        tokens = pos_tag(words)  # tuples of (word, tag)
+        # steps
+        prons = []
+        for word, pos in tokens:
+            if re.search("[a-z]", word) is None:
+                pron = [word]
+            elif word in self.homograph2features:  # Check homograph
+                pron1, pron2, pos1 = self.homograph2features[word]
+                if pos.startswith(pos1):
+                    pron = pron1
+                else:
+                    pron = pron2
+            elif word in self.cmu:  # lookup CMU dict
+                pron = self.cmu[word][0]
+            else:  # predict for oov
+                pron = self.predict(word)
+            prons.extend(pron)
+            prons.extend([" "])
+        return prons[:-1]
+@register_txt_processors('en')
+class TxtProcessor(BaseTxtProcessor):
+    g2p = EnG2p()
+    @staticmethod
+    def preprocess_text(text):
+        text = normalize_numbers(text)
+        text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                       if unicodedata.category(char) != 'Mn')  # Strip accents
+        text = text.lower()
+        text = re.sub("[\'\"()]+", "", text)
+        text = re.sub("[-]+", " ", text)
+        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
+        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # !! -> !
+        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
+        text = text.replace("i.e.", "that is")
+        text = text.replace("i.e.", "that is")
+        text = text.replace("etc.", "etc")
+        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
+        text = re.sub(rf"\s+", r" ", text)
+        return text
+    @classmethod
+    def process(cls, txt, preprocess_args):
+        txt = cls.preprocess_text(txt).strip()
+        phs = cls.g2p(txt)
+        txt_struct = [[w, []] for w in txt.split(" ")]
+        i_word = 0
+        for p in phs:
+            if p == ' ':
+                i_word += 1
+            else:
+                txt_struct[i_word][1].append(p)
+        txt_struct = cls.postprocess(txt_struct, preprocess_args)
+        return txt_struct, txt

data_gen/tts/wav_processors/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import base_processor
2	+ from . import common_processors

data_gen/tts/wav_processors/base_processor.py ADDED Viewed

	@@ -0,0 +1,25 @@

+REGISTERED_WAV_PROCESSORS = {}
+def register_wav_processors(name):
+    def _f(cls):
+        REGISTERED_WAV_PROCESSORS[name] = cls
+        return cls
+    return _f
+def get_wav_processor_cls(name):
+    return REGISTERED_WAV_PROCESSORS.get(name, None)
+class BaseWavProcessor:
+    @property
+    def name(self):
+        raise NotImplementedError
+    def output_fn(self, input_fn):
+        return f'{input_fn[:-4]}_{self.name}.wav'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        raise NotImplementedError

data_gen/tts/wav_processors/common_processors.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+import subprocess
+import librosa
+import numpy as np
+from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
+from data_gen.tts.data_gen_utils import trim_long_silences
+from utils.audio import save_wav
+from utils.rnnoise import rnnoise
+from utils.hparams import hparams
+@register_wav_processors(name='sox_to_wav')
+class ConvertToWavProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'ToWav'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        if input_fn[-4:] == '.wav':
+            return input_fn, sr
+        else:
+            output_fn = self.output_fn(input_fn)
+            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True)
+            return output_fn, sr
+@register_wav_processors(name='sox_resample')
+class ResampleProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'Resample'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        sr_file = librosa.core.get_samplerate(input_fn)
+        if sr != sr_file:
+            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True)
+            y, _ = librosa.core.load(input_fn, sr=sr)
+            y, _ = librosa.effects.trim(y)
+            save_wav(y, output_fn, sr)
+            return output_fn, sr
+        else:
+            return input_fn, sr
+@register_wav_processors(name='trim_sil')
+class TrimSILProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'TrimSIL'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        y, _ = librosa.core.load(input_fn, sr=sr)
+        y, _ = librosa.effects.trim(y)
+        save_wav(y, output_fn, sr)
+        return output_fn
+@register_wav_processors(name='trim_all_sil')
+class TrimAllSILProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'TrimSIL'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        y, audio_mask, _ = trim_long_silences(
+            input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12))
+        save_wav(y, output_fn, sr)
+        if preprocess_args['save_sil_mask']:
+            os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
+            np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
+        return output_fn, sr
+@register_wav_processors(name='denoise')
+class DenoiseProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'Denoise'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        rnnoise(input_fn, output_fn, out_sample_rate=sr)
+        return output_fn, sr

egs/datasets/audio/libritts/base_text2mel.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts'
+pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
+binarization_args:
+  shuffle: true
+use_spk_id: true
+test_num: 200
+num_spk: 2320
+pitch_type: frame
+min_frames: 128
+num_test_samples: 30
+mel_loss: "ssim:0.5|l1:0.5"
+vocoder_ckpt: ''

egs/datasets/audio/libritts/fs2.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - egs/egs_bases/tts/fs2.yaml
+  - ./base_text2mel.yaml

egs/datasets/audio/libritts/pre_align.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import os
+from data_gen.tts.base_pre_align import BasePreAlign
+import glob
+class LibrittsPreAlign(BasePreAlign):
+    def meta_data(self):
+        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+            spk = item_name.split("_")[0]
+            yield item_name, wav_fn, (self.load_txt, txt_fn), spk
+if __name__ == "__main__":
+    LibrittsPreAlign().process()

egs/datasets/audio/libritts/pwg.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+base_config: egs/egs_bases/tts/vocoder/pwg.yaml
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts_wav'
+generator_params:
+  kernel_size: 5
+num_spk: 400
+max_samples: 20480

egs/datasets/audio/lj/base_mel2wav.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'
+binarization_args:
+  with_spk_embed: false

egs/datasets/audio/lj/pre_align.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from data_gen.tts.base_preprocess import BasePreprocessor
+class LJPreAlign(BasePreprocessor):
+    def meta_data(self):
+        for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+            item_name, _, txt = l.strip().split("|")
+            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+            yield item_name, wav_fn, txt, 'SPK1'
+if __name__ == "__main__":
+    LJPreAlign().process()

egs/datasets/audio/lj/pwg.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - egs/egs_bases/tts/vocoder/pwg.yaml
+  - ./base_mel2wav.yaml

egs/datasets/audio/vctk/base_mel2wav.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+raw_data_dir: 'data/raw/VCTK-Corpus'
+processed_data_dir: 'data/processed/vctk'
+binary_data_dir: 'data/binary/vctk_wav'

egs/datasets/audio/vctk/fs2.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+base_config:
+  - egs/egs_bases/tts/fs2.yaml
+raw_data_dir: 'data/raw/VCTK-Corpus'
+processed_data_dir: 'data/processed/vctk'
+binary_data_dir: 'data/binary/vctk'
+pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign
+use_spk_id: true
+test_num: 200
+num_spk: 400
+binarization_args:
+  shuffle: true
+  trim_eos_bos: true

egs/datasets/audio/vctk/pre_align.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import os
+from data_gen.tts.base_pre_align import BasePreAlign
+import glob
+class VCTKPreAlign(BasePreAlign):
+    def meta_data(self):
+        wav_fns = glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav')
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            spk = item_name.split("_")[0]
+            txt_fn = wav_fn.split("/")
+            txt_fn[-1] = f'{item_name}.txt'
+            txt_fn[-3] = f'txt'
+            txt_fn = "/".join(txt_fn)
+            if os.path.exists(txt_fn) and os.path.exists(wav_fn):
+                yield item_name, wav_fn, (self.load_txt, txt_fn), spk
+if __name__ == "__main__":
+    VCTKPreAlign().process()

egs/datasets/audio/vctk/pwg.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+base_config:
+  - egs/egs_bases/tts/vocoder/pwg.yaml
+  - ./base_mel2wav.yaml
+num_spk: 400
+max_samples: 20480

egs/egs_bases/config_base.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+# task
+binary_data_dir: ''
+work_dir: '' # experiment directory.
+infer: false # inference
+amp: false
+seed: 1234
+debug: false
+save_codes: []
+#  - configs
+#  - modules
+#  - tasks
+#  - utils
+#  - usr
+#############
+# dataset
+#############
+ds_workers: 1
+test_num: 100
+endless_ds: false
+sort_by_len: true
+#########
+# train and eval
+#########
+print_nan_grads: false
+load_ckpt: ''
+save_best: true
+num_ckpt_keep: 3
+clip_grad_norm: 0
+accumulate_grad_batches: 1
+tb_log_interval: 100
+num_sanity_val_steps: 5  # steps of validation at the beginning
+check_val_every_n_epoch: 10
+val_check_interval: 2000
+valid_monitor_key: 'val_loss'
+valid_monitor_mode: 'min'
+max_epochs: 1000
+max_updates: 1000000
+max_tokens: 31250
+max_sentences: 100000
+max_valid_tokens: -1
+max_valid_sentences: -1
+test_input_dir: ''
+resume_from_checkpoint: 0
+rename_tmux: true

egs/egs_bases/tts/base.yaml ADDED Viewed

	@@ -0,0 +1,112 @@

+# task
+base_config: ../config_base.yaml
+task_cls: ''
+#############
+# dataset
+#############
+raw_data_dir: ''
+processed_data_dir: ''
+binary_data_dir: ''
+dict_dir: ''
+pre_align_cls: ''
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+pre_align_args:
+  txt_processor: en
+  use_tone: true # for ZH
+  sox_resample: false
+  sox_to_wav: false
+  allow_no_txt: false
+  trim_sil: false
+  denoise: false
+binarization_args:
+  shuffle: false
+  with_txt: true
+  with_wav: false
+  with_align: true
+  with_spk_embed: false
+  with_spk_id: true
+  with_f0: true
+  with_f0cwt: false
+  with_linear: false
+  with_word: true
+  trim_sil: false
+  trim_eos_bos: false
+  reset_phone_dict: true
+  reset_word_dict: true
+word_size: 30000
+pitch_extractor: parselmouth
+loud_norm: false
+endless_ds: true
+test_num: 100
+min_frames: 0
+max_frames: 1548
+frames_multiple: 1
+max_input_tokens: 1550
+audio_num_mel_bins: 80
+audio_sample_rate: 22050
+hop_size: 256  # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
+win_size: 1024  # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
+fmin: 80  # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
+fmax: 7600  # To be increased/reduced depending on data.
+fft_size: 1024  # Extra window size is filled with 0 paddings to match this parameter
+min_level_db: -100
+ref_level_db: 20
+griffin_lim_iters: 60
+num_spk: 1
+mel_vmin: -6
+mel_vmax: 1.5
+ds_workers: 1
+#########
+# model
+#########
+dropout: 0.1
+enc_layers: 4
+dec_layers: 4
+hidden_size: 256
+num_heads: 2
+enc_ffn_kernel_size: 9
+dec_ffn_kernel_size: 9
+ffn_act: gelu
+ffn_padding: 'SAME'
+use_spk_id: false
+use_split_spk_id: false
+use_spk_embed: false
+###########
+# optimization
+###########
+lr: 2.0
+scheduler: rsqrt # rsqrt|none
+warmup_updates: 8000
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+weight_decay: 0
+clip_grad_norm: 1
+clip_grad_value: 0
+###########
+# train and eval
+###########
+max_tokens: 30000
+max_sentences: 100000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+valid_infer_interval: 10000
+train_set_name: 'train'
+train_sets: ''
+valid_set_name: 'valid'
+test_set_name: 'test'
+num_test_samples: 0
+num_valid_plots: 10
+test_ids: [ ]
+vocoder_denoise_c: 0.0
+profile_infer: false
+out_wav_norm: false
+save_gt: true
+save_f0: false
+gen_dir_name: ''

egs/egs_bases/tts/fs2.yaml ADDED Viewed

	@@ -0,0 +1,102 @@

+base_config: ./base.yaml
+task_cls: tasks.tts.fs2.FastSpeech2Task
+# model
+hidden_size: 256
+dropout: 0.1
+encoder_type: fft # rel_fft|fft|tacotron|tacotron2|conformer
+decoder_type: fft # fft|rnn|conv|conformer|wn
+# rnn enc/dec
+encoder_K: 8
+decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
+# fft enc/dec
+use_pos_embed: true
+dec_num_heads: 2
+dec_layers: 4
+ffn_hidden_size: 1024
+enc_ffn_kernel_size: 9
+dec_ffn_kernel_size: 9
+# conv enc/dec
+enc_dec_norm: ln
+conv_use_pos: false
+layers_in_block: 2
+enc_dilations: [ 1, 1, 1, 1 ]
+enc_kernel_size: 5
+dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
+dec_kernel_size: 5
+dur_loss: mse # huber|mol
+# duration
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+predictor_dropout: 0.5
+# pitch and energy
+pitch_norm: standard # standard|log
+use_pitch_embed: true
+pitch_type: frame # frame|ph|cwt
+use_uv: true
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_add_f0_loss: false
+cwt_std_scale: 0.8
+pitch_ar: false
+pitch_embed_type: 0
+pitch_loss: 'l1' # l1|l2|ssim
+pitch_ssim_win: 11
+use_energy_embed: false
+# reference encoder and speaker embedding
+use_ref_enc: false
+use_var_enc: false
+lambda_commit: 0.25
+var_enc_vq_codes: 64
+ref_norm_layer: bn
+dec_inp_add_noise: false
+sil_add_noise: false
+ref_hidden_stride_kernel:
+  - 0,3,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+  - 0,3,5
+  - 0,2,5
+  - 0,2,5
+  - 0,2,5
+pitch_enc_hidden_stride_kernel:
+  - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+  - 0,2,5
+  - 0,2,5
+dur_enc_hidden_stride_kernel:
+  - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+  - 0,2,3
+  - 0,1,3
+# mel
+mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+# loss lambda
+lambda_f0: 1.0
+lambda_uv: 1.0
+lambda_energy: 0.1
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_word_dur: 1.0
+predictor_grad: 0.1
+# train and eval
+pretrain_fs_ckpt: ''
+warmup_updates: 2000
+max_tokens: 32000
+max_sentences: 100000
+max_valid_sentences: 1
+max_updates: 120000
+use_gt_dur: false
+use_gt_f0: false
+ds_workers: 2
+lr: 1.0

egs/egs_bases/tts/vocoder/base.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+base_config: ../base.yaml
+binarization_args:
+  with_wav: true
+  with_spk_embed: false
+  with_align: false
+  with_word: false
+  with_txt: false
+###########
+# train and eval
+###########
+max_samples: 25600
+max_sentences: 5
+max_valid_sentences: 1
+max_updates: 1000000
+val_check_interval: 2000
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+fft_size: 1024           # FFT size.
+hop_size: 256            # Hop size.
+win_length: null         # Window length.
+# If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+num_mels: 80             # Number of mel basis.
+fmin: 80                 # Minimum freq in mel basis calculation.
+fmax: 7600               # Maximum frequency in mel basis calculation.
+aux_context_window: 0 # Context window size for auxiliary feature.
+use_pitch_embed: false
+generator_grad_norm: 10    # Generator's gradient norm.
+discriminator_grad_norm: 1 # Discriminator's gradient norm.
+disc_start_steps: 40000 # Number of steps to start to train discriminator.

egs/egs_bases/tts/vocoder/pwg.yaml ADDED Viewed

	@@ -0,0 +1,82 @@

+base_config: ./base.yaml
+task_cls: tasks.vocoder.pwg.PwgTask
+aux_context_window: 2 # Context window size for auxiliary feature.
+use_pitch_embed: false
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+generator_params:
+  in_channels: 1        # Number of input channels.
+  out_channels: 1       # Number of output channels.
+  kernel_size: 3        # Kernel size of dilated convolution.
+  layers: 30            # Number of residual block layers.
+  stacks: 3             # Number of stacks i.e., dilation cycles.
+  residual_channels: 64 # Number of channels in residual conv.
+  gate_channels: 128    # Number of channels in gated conv.
+  skip_channels: 64     # Number of channels in skip conv.
+  aux_channels: 80      # Number of channels for auxiliary feature conv.
+  # Must be the same as num_mels.
+  # If set to 2, previous 2 and future 2 frames will be considered.
+  dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
+  use_weight_norm: true # Whether to use weight norm.
+  # If set to true, it will be applied to all of the conv layers.
+  upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
+  upsample_params:                      # Upsampling network parameters.
+    upsample_scales: [4, 4, 4, 4]     # Upsampling scales. Product of these must be the same as hop size.
+  use_pitch_embed: false
+  use_nsf: false
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+discriminator_params:
+  in_channels: 1        # Number of input channels.
+  out_channels: 1       # Number of output channels.
+  kernel_size: 3        # Kernel size of conv layers.
+  layers: 10            # Number of conv layers.
+  conv_channels: 64     # Number of channels in conv layers.
+  bias: true            # Whether to use bias parameter in conv.
+  use_weight_norm: true # Whether to use weight norm.
+  # If set to true, it will be applied to all of the conv layers.
+  nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
+  nonlinear_activation_params:      # Nonlinear function parameters
+    negative_slope: 0.2           # Alpha in LeakyReLU.
+rerun_gen: true
+###########################################################
+#                   STFT LOSS SETTING                     #
+###########################################################
+stft_loss_params:
+  fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
+  hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
+  win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+  window: "hann_window"         # Window function for STFT-based loss
+use_mel_loss: false
+###########################################################
+#               ADVERSARIAL LOSS SETTING                  #
+###########################################################
+lambda_adv: 4.0  # Loss balancing coefficient.
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+generator_optimizer_params:
+  lr: 0.0001             # Generator's learning rate.
+  eps: 1.0e-6            # Generator's epsilon.
+  weight_decay: 0.0      # Generator's weight decay coefficient.
+generator_scheduler_params:
+  step_size: 200000      # Generator's scheduler step size.
+  gamma: 0.5             # Generator's scheduler gamma.
+  # At each step size, lr will be multiplied by this parameter.
+generator_grad_norm: 10    # Generator's gradient norm.
+discriminator_optimizer_params:
+  lr: 0.00005            # Discriminator's learning rate.
+  eps: 1.0e-6            # Discriminator's epsilon.
+  weight_decay: 0.0      # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+  step_size: 200000      # Discriminator's scheduler step size.
+  gamma: 0.5             # Discriminator's scheduler gamma.
+  # At each step size, lr will be multiplied by this parameter.
+discriminator_grad_norm: 1 # Discriminator's gradient norm.
+disc_start_steps: 40000 # Number of steps to start to train discriminator.

inference/ProDiff.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+from inference.base_tts_infer import BaseTTSInfer
+from utils.ckpt_utils import load_ckpt, get_last_checkpoint
+from utils.hparams import hparams
+from modules.ProDiff.model.ProDiff import GaussianDiffusion
+from usr.diff.net import DiffNet
+import os
+import numpy as np
+from functools import partial
+class ProDiffInfer(BaseTTSInfer):
+    """Inference wrapper that builds a ProDiff ``GaussianDiffusion`` model and turns text into a waveform."""
+    def build_model(self):
+        """Construct the diffusion model, set its step buffers from the teacher checkpoint and load weights.
+        Reads ``binary_data_dir``, ``audio_num_mel_bins``, ``timesteps``, ``diff_loss_type``,
+        ``spec_min``, ``spec_max``, ``teacher_ckpt`` and ``work_dir`` from the global ``hparams``.
+        :return: the model in eval mode with the checkpoint from ``work_dir`` loaded.
+        """
+        f0_stats_fn = f'{hparams["binary_data_dir"]}/train_f0s_mean_std.npy'
+        # Optional F0 statistics: only loaded when the file exists, then stored as plain floats.
+        if os.path.exists(f0_stats_fn):
+            hparams['f0_mean'], hparams['f0_std'] = np.load(f0_stats_fn)
+            hparams['f0_mean'] = float(hparams['f0_mean'])
+            hparams['f0_std'] = float(hparams['f0_std'])
+        # NOTE(review): out_dims is hard-coded to 80 while DiffNet takes audio_num_mel_bins - confirm they must match.
+        model = GaussianDiffusion(
+            phone_encoder=self.ph_encoder,
+            out_dims=80, denoise_fn=DiffNet(hparams['audio_num_mel_bins']),
+            timesteps=hparams['timesteps'],
+            loss_type=hparams['diff_loss_type'],
+            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
+        )
+        # Only the 'timesteps' and 'timescale' entries of the teacher checkpoint are used here.
+        checkpoint = torch.load(hparams['teacher_ckpt'], map_location='cpu')["state_dict"]['model']
+        teacher_timesteps = int(checkpoint['timesteps'].item())
+        teacher_timescales = int(checkpoint['timescale'].item())
+        # The student runs half as many steps as the teacher, with the timescale doubled.
+        student_timesteps = teacher_timesteps // 2
+        student_timescales = teacher_timescales * 2
+        to_torch = partial(torch.tensor, dtype=torch.float32)
+        model.register_buffer('timesteps', to_torch(student_timesteps))      # student step count
+        model.register_buffer('timescale', to_torch(student_timescales))      # student timescale
+        model.eval()
+        # Weights are loaded after the buffers are registered; presumably the checkpoint
+        # carries buffers of the same names - TODO confirm.
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+    def forward_model(self, inp):
+        """Synthesize a waveform for one preprocessed item.
+        :param inp: item passed to ``self.input_to_batch``.
+        :return: numpy array produced by squeezing the vocoder output and moving it to CPU.
+        """
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        # Model and vocoder both run without gradient tracking.
+        with torch.no_grad():
+            output = self.model(txt_tokens, infer=True)
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.squeeze().cpu().numpy()
+        return wav_out
+if __name__ == '__main__':
+    ProDiffInfer.example_run()

inference/ProDiff_Teacher.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import torch
+from inference.base_tts_infer import BaseTTSInfer
+from utils.ckpt_utils import load_ckpt, get_last_checkpoint
+from utils.hparams import hparams
+from modules.ProDiff.model.ProDiff_teacher import GaussianDiffusion
+from usr.diff.net import DiffNet
+import os
+import numpy as np
+class ProDiffTeacherInfer(BaseTTSInfer):
+    def build_model(self):
+        f0_stats_fn = f'{hparams["binary_data_dir"]}/train_f0s_mean_std.npy'
+        if os.path.exists(f0_stats_fn):
+            hparams['f0_mean'], hparams['f0_std'] = np.load(f0_stats_fn)
+            hparams['f0_mean'] = float(hparams['f0_mean'])
+            hparams['f0_std'] = float(hparams['f0_std'])
+        model = GaussianDiffusion(
+            phone_encoder=self.ph_encoder,
+            out_dims=80, denoise_fn=DiffNet(hparams['audio_num_mel_bins']),
+            timesteps=hparams['timesteps'],
+            loss_type=hparams['diff_loss_type'],
+            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
+        )
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        with torch.no_grad():
+            output = self.model(txt_tokens, infer=True)
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.squeeze().cpu().numpy()
+        return wav_out
+if __name__ == '__main__':
+    ProDiffTeacherInfer.example_run()

inference/base_tts_infer.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import os
+import torch
+from tasks.tts.dataset_utils import FastSpeechWordDataset
+from tasks.tts.tts_utils import load_data_preprocessor
+import numpy as np
+from modules.FastDiff.module.util import compute_hyperparams_given_schedule, sampling_given_noise_schedule
+import os
+import torch
+from modules.FastDiff.module.FastDiff_model import FastDiff
+from utils.ckpt_utils import load_ckpt
+from utils.hparams import set_hparams
+class BaseTTSInfer:
+    def __init__(self, hparams, device=None):
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.hparams = hparams
+        self.device = device
+        self.data_dir = hparams['binary_data_dir']
+        self.preprocessor, self.preprocess_args = load_data_preprocessor()
+        self.ph_encoder = self.preprocessor.load_dict(self.data_dir)
+        self.spk_map = self.preprocessor.load_spk_map(self.data_dir)
+        self.ds_cls = FastSpeechWordDataset
+        self.model = self.build_model()
+        self.model.eval()
+        self.model.to(self.device)
+        self.vocoder, self.diffusion_hyperparams, self.noise_schedule = self.build_vocoder()
+        self.vocoder.eval()
+        self.vocoder.to(self.device)
+    def build_model(self):
+        raise NotImplementedError
+    def forward_model(self, inp):
+        raise NotImplementedError
+    def build_vocoder(self):
+        base_dir = self.hparams['vocoder_ckpt']
+        config_path = f'{base_dir}/config.yaml'
+        config = set_hparams(config_path, global_hparams=False)
+        vocoder = FastDiff(audio_channels=config['audio_channels'],
+                 inner_channels=config['inner_channels'],
+                 cond_channels=config['cond_channels'],
+                 upsample_ratios=config['upsample_ratios'],
+                 lvc_layers_each_block=config['lvc_layers_each_block'],
+                 lvc_kernel_size=config['lvc_kernel_size'],
+                 kpnet_hidden_channels=config['kpnet_hidden_channels'],
+                 kpnet_conv_size=config['kpnet_conv_size'],
+                 dropout=config['dropout'],
+                 diffusion_step_embed_dim_in=config['diffusion_step_embed_dim_in'],
+                 diffusion_step_embed_dim_mid=config['diffusion_step_embed_dim_mid'],
+                 diffusion_step_embed_dim_out=config['diffusion_step_embed_dim_out'],
+                 use_weight_norm=config['use_weight_norm'])
+        load_ckpt(vocoder, base_dir, 'model')
+        # Init hyperparameters by linear schedule
+        noise_schedule = torch.linspace(float(config["beta_0"]), float(config["beta_T"]), int(config["T"]))
+        diffusion_hyperparams = compute_hyperparams_given_schedule(noise_schedule)
+        if config['noise_schedule'] != '':
+            noise_schedule = config['noise_schedule']
+            if isinstance(noise_schedule, list):
+                noise_schedule = torch.FloatTensor(noise_schedule)
+        else:
+            # Select Schedule
+            try:
+                reverse_step = int(self.hparams.get('N'))
+            except:
+                print(
+                    'Please specify $N (the number of revere iterations) in config file. Now denoise with 4 iterations.')
+                reverse_step = 4
+            if reverse_step == 1000:
+                noise_schedule = torch.linspace(0.000001, 0.01, 1000)
+            elif reverse_step == 200:
+                noise_schedule = torch.linspace(0.0001, 0.02, 200)
+            # Below are schedules derived by Noise Predictor.
+            # We will release codes of noise predictor training process & noise scheduling process soon. Please Stay Tuned!
+            elif reverse_step == 8:
+                noise_schedule = [6.689325005027058e-07, 1.0033881153503899e-05, 0.00015496854030061513,
+                                  0.002387222135439515, 0.035597629845142365, 0.3681158423423767, 0.4735414385795593,
+                                  0.5]
+            elif reverse_step == 6:
+                noise_schedule = [1.7838445955931093e-06, 2.7984189728158526e-05, 0.00043231004383414984,
+                                  0.006634317338466644, 0.09357017278671265, 0.6000000238418579]
+            elif reverse_step == 4:
+                noise_schedule = [3.2176e-04, 2.5743e-03, 2.5376e-02, 7.0414e-01]
+            elif reverse_step == 3:
+                noise_schedule = [9.0000e-05, 9.0000e-03, 6.0000e-01]
+            else:
+                raise NotImplementedError
+        if isinstance(noise_schedule, list):
+            noise_schedule = torch.FloatTensor(noise_schedule)
+        return vocoder, diffusion_hyperparams, noise_schedule
+    def run_vocoder(self, c):
+        c = c.transpose(2, 1)
+        audio_length = c.shape[-1] * self.hparams["hop_size"]
+        y = sampling_given_noise_schedule(
+            self.vocoder, (1, 1, audio_length), self.diffusion_hyperparams, self.noise_schedule, condition=c, ddim=False, return_sequence=False)
+        return y
+    def preprocess_input(self, inp):
+        """
+        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
+        :return:
+        """
+        preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
+        text_raw = inp['text']
+        item_name = inp.get('item_name', '<ITEM_NAME>')
+        spk_name = inp.get('spk_name', 'SPK1')
+        ph, txt = preprocessor.txt_to_ph(
+            preprocessor.txt_processor, text_raw, preprocess_args)
+        ph_token = self.ph_encoder.encode(ph)
+        spk_id = self.spk_map[spk_name]
+        item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id, 'ph_token': ph_token}
+        item['ph_len'] = len(item['ph_token'])
+        return item
+    def input_to_batch(self, item):
+        item_names = [item['item_name']]
+        text = [item['text']]
+        ph = [item['ph']]
+        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+        batch = {
+            'item_name': item_names,
+            'text': text,
+            'ph': ph,
+            'txt_tokens': txt_tokens,
+            'txt_lengths': txt_lengths,
+            'spk_ids': spk_ids,
+        }
+        return batch
+    def postprocess_output(self, output):
+        return output
+    def infer_once(self, inp):
+        inp = self.preprocess_input(inp)
+        output = self.forward_model(inp)
+        output = self.postprocess_output(output)
+        return output
+    @classmethod
+    def example_run(cls):
+        from utils.hparams import set_hparams
+        from utils.hparams import hparams as hp
+        from utils.audio import save_wav
+        set_hparams()
+        inp = {
+            'text': hp['text']
+        }
+        infer_ins = cls(hp)
+        out = infer_ins.infer_once(inp)
+        os.makedirs('infer_out', exist_ok=True)
+        save_wav(out, f'infer_out/{hp["text"]}.wav', hp['audio_sample_rate'])

inference/gradio/gradio_settings.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+title: 'Extremely-Fast diffusion text-to-speech synthesis pipeline with ProDiff and FastDiff'
+description: |
+  Gradio demo for **2-iter** ProDiff and **4-iter** FastDiff. To use it, simply enter your text, or click one of the examples to load them. **This space is running on CPU, inference will be slower.**
+  ## Key Features
+  - **Extremely-Fast** diffusion text-to-speech synthesis pipeline for potential **industrial deployment**.
+  - **Tutorial and code base** for speech diffusion models.
+  - More **supported diffusion mechanism** (e.g., guided diffusion) will be available.
+article: |
+  ## Reference
+  Link to <a href='https://github.com/Rongjiehuang/ProDiff' style='color:blue;' target='_blank'>ProDiff Github REPO</a>
+  If you find this code useful in your research, please cite our work:
+  ```
+    @inproceedings{huang2022prodiff,
+      title={ProDiff: Progressive Fast Diffusion Model For High-Quality Text-to-Speech},
+      author={Huang, Rongjie and Zhao, Zhou and Liu, Huadai and Liu, Jinglin and Cui, Chenye and Ren, Yi},
+      booktitle={Proceedings of the 30th ACM International Conference on Multimedia},
+      year={2022}
+    }
+    @inproceedings{huang2022fastdiff,
+      title={FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis},
+      author={Huang, Rongjie and Lam, Max WY and Wang, Jun and Su, Dan and Yu, Dong and Ren, Yi and Zhao, Zhou},
+      booktitle = {Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}},
+      year={2022}
+    }
+  ```
+  ## Disclaimer
+  Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
+example_inputs:
+  - |-
+    the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
+  - |-
+    Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.
+inference_cls: inference.ProDiff.ProDiffInfer
+exp_name: ProDiff
+config: modules/ProDiff/config/prodiff.yaml

inference/gradio/infer.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import importlib
+import re
+import gradio as gr
+import yaml
+from gradio.inputs import Textbox
+from inference.base_tts_infer import BaseTTSInfer
+from utils.hparams import set_hparams
+from utils.hparams import hparams as hp
+import numpy as np
+from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
+class GradioInfer:
+    """Gradio front-end that feeds text from a textbox to a TTS inference class and returns audio."""
+    def __init__(self, exp_name, config, inference_cls, title, description, article, example_inputs):
+        """Store the UI settings and resolve ``inference_cls`` to a class object.
+        :param exp_name: forwarded to ``set_hparams`` in ``run``.
+        :param config: forwarded to ``set_hparams`` in ``run``.
+        :param inference_cls: dotted path ``package.module.ClassName`` of the inference class.
+        :param title: passed to ``gr.Interface``.
+        :param description: passed to ``gr.Interface``.
+        :param article: passed to ``gr.Interface``.
+        :param example_inputs: example texts; the first one is the textbox default.
+        """
+        self.exp_name = exp_name
+        self.config = config
+        self.title = title
+        self.description = description
+        self.article = article
+        self.example_inputs = example_inputs
+        # Everything before the last dot is the module to import, the last part is the class name.
+        pkg = ".".join(inference_cls.split(".")[:-1])
+        cls_name = inference_cls.split(".")[-1]
+        self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
+    def greet(self, text):
+        """Synthesize ``text`` in chunks and return ``(sample_rate, int16 audio)``.
+        Newlines are replaced by commas and the text is split on the characters in ``PUNCS``.
+        Sentences are accumulated until the chunk reaches 400 characters or the input ends;
+        each chunk is synthesized separately and followed by 0.3 s of silence.
+        """
+        # The capturing group keeps the punctuation marks, so sents alternates text, punctuation, text, ...
+        sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
+        if sents[-1] not in list(PUNCS):
+            sents = sents + ['.']
+        audio_outs = []
+        s = ""
+        # Step by two: sents[i] is a text piece and sents[i + 1] the punctuation that follows it.
+        for i in range(0, len(sents), 2):
+            if len(sents[i]) > 0:
+                s += sents[i] + sents[i + 1]
+            # Flush when the chunk is long enough, or at the last pair if anything is pending.
+            if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
+                audio_out = self.infer_ins.infer_once({
+                    'text': s
+                })
+                # Scale to the int16 range; assumes the model output lies in [-1, 1] - TODO confirm.
+                audio_out = audio_out * 32767
+                audio_out = audio_out.astype(np.int16)
+                audio_outs.append(audio_out)
+                audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
+                s = ""
+        # NOTE(review): if no chunk was synthesized (e.g. empty text) audio_outs is empty and
+        # np.concatenate raises ValueError.
+        audio_outs = np.concatenate(audio_outs)
+        return hp['audio_sample_rate'], audio_outs
+    def run(self):
+        """Load hparams, instantiate the inference class and launch the Gradio interface."""
+        set_hparams(exp_name=self.exp_name, config=self.config)
+        infer_cls = self.inference_cls
+        self.infer_ins: BaseTTSInfer = infer_cls(hp)
+        example_inputs = self.example_inputs
+        iface = gr.Interface(fn=self.greet,
+                             inputs=Textbox(
+                                 lines=10, placeholder=None, default=example_inputs[0], label="input text"),
+                             outputs="audio",
+                             allow_flagging="never",
+                             title=self.title,
+                             description=self.description,
+                             article=self.article,
+                             examples=example_inputs,
+                             enable_queue=True)
+        iface.launch(share=True,cache_examples=True)
+if __name__ == '__main__':
+    # The top-level keys of the settings file are passed as keyword arguments to GradioInfer.
+    # NOTE(review): the file object returned by open() is never closed explicitly.
+    gradio_config = yaml.safe_load(open('inference/gradio/gradio_settings.yaml'))
+    g = GradioInfer(**gradio_config)
+    g.run()

modules/FastDiff/config/FastDiff.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+base_config:
+  - ./base.yaml
+audio_sample_rate: 22050
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/LJSpeech'
+binary_data_dir: 'data/binary/LJSpeech'

modules/FastDiff/config/FastDiff_libritts.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+base_config:
+  - ./base.yaml
+audio_sample_rate: 22050
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/LibriTTS'
+binary_data_dir: 'data/binary/LibriTTS'

modules/FastDiff/config/FastDiff_sc09.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+# FastDiff config for the SC09 data directories (audio_sample_rate 16000).
+base_config:
+  - egs/egs_bases/tts/vocoder/base.yaml
+  - egs/datasets/audio/lj/base_mel2wav.yaml
+  - ./base.yaml
+# Audio and clip settings.
+audio_sample_rate: 16000
+max_samples: 12800
+loud_norm: true
+# Dataset directories.
+# NOTE(review): raw_data_dir is an absolute path under a personal home
+# directory - it has to be adapted before running on another machine.
+raw_data_dir: '/home1/huangrongjie/Project/AdaGrad/data/raw/SC09/'
+processed_data_dir: 'data/processed/SC09_ten_processed'
+binary_data_dir: 'data/binary/SC09_ten_processed'
+# Alternative dataset directories (disabled):
+#raw_data_dir: '/home1/huangrongjie/dataset/sc09/data/'
+#processed_data_dir: 'data/processed/SC09'
+#binary_data_dir: 'data/binary/SC09'
+# Pre-alignment stage.
+pre_align_cls: egs.datasets.audio.sc09.pre_align.Sc09PreAlign
+pre_align_args:
+  allow_no_txt: true
+  trim_sil: true
+  denoise: true
+  sox_resample: false
+  sox_to_wav: false

modules/FastDiff/config/FastDiff_tacotron.yaml ADDED Viewed

	@@ -0,0 +1,58 @@

+# FastDiff "tacotron" config variant; raw/processed directories point at LJSpeech.
+# NOTE(review): base_config entries are presumably loaded first and then
+# overridden by the keys below - confirm against the hparams loader.
+base_config:
+  - egs/egs_bases/tts/vocoder/pwg.yaml
+  - egs/egs_bases/tts/base_mel2wav.yaml
+  - egs/datasets/audio/lj/pwg.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/LJSpeech_FastDiff'
+# NOTE(review): the active binary_data_dir is an absolute, cluster-specific
+# path; the repository-relative alternative is kept commented out.
+#binary_data_dir: 'data/binary/LJSpeech_Taco'
+binary_data_dir: /apdcephfs/private_nlphuang/preprocess/AdaGrad/data/binary/LJSpeech_Taco
+binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
+pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
+task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
+binarization_args:
+  with_wav: true
+  with_spk_embed: false
+  with_align: false
+  with_word: false
+  with_txt: false
+  with_f0: false
+# data
+num_spk: 400
+max_samples: 25600
+aux_context_window: 0
+max_sentences: 20
+test_input_dir: ''  # 'wavs' # wav->wav inference
+test_mel_dir: ''  # 'mels' # mel->wav inference
+use_wav: true  # mel->wav inference
+# training
+num_sanity_val_steps: -1
+max_updates: 1000000
+# Written with a decimal point: YAML 1.1 loaders such as PyYAML resolve the
+# bare form 2e-4 to the string '2e-4' rather than to a float.
+lr: 2.0e-4
+weight_decay: 0
+# FastDiff
+audio_channels: 1
+inner_channels: 32
+cond_channels: 80
+upsample_ratios: [8, 8, 4]
+lvc_layers_each_block: 4
+lvc_kernel_size: 3
+kpnet_hidden_channels: 64
+kpnet_conv_size: 3
+dropout: 0.0
+diffusion_step_embed_dim_in: 128
+diffusion_step_embed_dim_mid: 512
+diffusion_step_embed_dim_out: 512
+use_weight_norm: true
+# Diffusion
+T: 1000
+beta_0: 0.000001
+beta_T: 0.01
+# Both left as empty strings in this config.
+noise_schedule: ''
+N: ''

modules/FastDiff/config/FastDiff_vctk.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# FastDiff config whose data directories point at VCTK.
+# NOTE(review): base_config entries are presumably loaded first and then
+# overridden by the keys below - confirm against the hparams loader.
+base_config:
+  - ./base.yaml
+# Dataset directories.
+raw_data_dir: 'data/raw/VCTK'
+processed_data_dir: 'data/processed/VCTK'
+binary_data_dir: 'data/binary/VCTK'
+# Audio.
+audio_sample_rate: 22050

modules/FastDiff/config/base.yaml ADDED Viewed

	@@ -0,0 +1,157 @@

+#############
+# Custom dataset preprocess
+#############
+audio_sample_rate: 22050
+num_spk: 1  # number of speakers
+# Framing. With the values below at 22050 Hz: hop 256 ~= 11.6 ms and
+# window 1024 ~= 46.4 ms. Rule of thumb kept from the earlier comments:
+# hop ~= 0.0125 * sample_rate (275), window ~= 0.05 * sample_rate (1100).
+fft_size: 1024  # Extra window size is filled with 0 paddings to match this parameter
+win_size: 1024  # If None, win_size: fft_size
+hop_size: 256
+# Mel settings.
+audio_num_mel_bins: 80
+# fmin: set this to 55 if your speaker is male! If female, 95 should help
+# taking off noise. (To test depending on dataset. Pitch info:
+# male~[65, 260], female~[100, 525])
+fmin: 80
+fmax: 7600  # To be increased/reduced depending on data.
+mel_vmin: -6
+mel_vmax: 1.5
+# Level settings and Griffin-Lim iterations.
+min_level_db: -100
+ref_level_db: 20
+griffin_lim_iters: 60
+#############
+# FastDiff Model
+#############
+audio_channels: 1
+inner_channels: 32
+cond_channels: 80  # same number as audio_num_mel_bins in this file
+upsample_ratios: [8, 8, 4]  # 8 * 8 * 4 = 256, the same number as hop_size in this file
+dropout: 0.0
+use_weight_norm: true
+# lvc_* and kpnet_* settings.
+lvc_layers_each_block: 4
+lvc_kernel_size: 3
+kpnet_hidden_channels: 64
+kpnet_conv_size: 3
+# Diffusion-step embedding sizes (in -> mid -> out).
+diffusion_step_embed_dim_in: 128
+diffusion_step_embed_dim_mid: 512
+diffusion_step_embed_dim_out: 512
+###########
+# Diffusion
+###########
+T: 1000
+beta_0: 0.000001
+beta_T: 0.01
+# Both left as empty strings in this base config.
+noise_schedule: ''
+N: ''
+###########
+# train and eval
+###########
+task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
+# Training limits.
+max_updates: 1000000  # max training steps
+max_samples: 25600  # audio length in training
+max_sentences: 20  # max batch size in training
+# Validation.
+num_sanity_val_steps: -1
+val_check_interval: 2000
+valid_infer_interval: 10000
+max_valid_sentences: 1
+num_valid_plots: 10
+# Test.
+num_test_samples: 0
+#############
+# Stage 1 of data processing
+#############
+pre_align_cls: egs.datasets.audio.pre_align.PreAlign
+pre_align_args:
+  # Text-side options.
+  txt_processor: en
+  use_tone: true  # for ZH
+  allow_no_txt: true
+  nsample_per_mfa_group: 1000
+  # Audio-side switches, all off in this base config.
+  sox_resample: false
+  sox_to_wav: false
+  trim_sil: false
+  denoise: false
+#############
+# Stage 2 of data processing
+#############
+binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
+binarization_args:
+  # Switched on in this base config.
+  with_wav: true
+  with_spk_id: true
+  reset_phone_dict: true
+  reset_word_dict: true
+  # Switched off in this base config.
+  shuffle: false
+  with_spk_embed: false
+  with_align: false
+  with_word: false
+  with_txt: false
+  with_f0: false
+  with_f0cwt: false
+  with_linear: false
+  trim_eos_bos: false
+###########
+# optimization
+###########
+# Learning rate. Written with a decimal point: YAML 1.1 loaders such as PyYAML
+# resolve the bare form 2e-4 to the string '2e-4' rather than to a float.
+lr: 2.0e-4
+weight_decay: 0
+scheduler: rsqrt  # rsqrt|none
+# Adam betas.
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+# Gradient clipping.
+clip_grad_norm: 1
+clip_grad_value: 0
+#############
+# Setting for this Pytorch framework
+#############
+max_input_tokens: 1550
+frames_multiple: 1
+use_word_input: false
+# Vocoder.
+vocoder: FastDiff
+vocoder_ckpt: checkpoints/FastDiff
+vocoder_denoise_c: 0.0
+# Token limits.
+max_tokens: 30000
+max_valid_tokens: 60000
+# Inference / output switches.
+test_ids: []
+profile_infer: false
+out_wav_norm: false
+save_gt: true
+save_f0: false
+aux_context_window: 0
+test_input_dir: ''  # 'wavs' # wav->wav inference
+test_mel_dir: ''  # 'mels' # mel->wav inference
+use_wav: true  # mel->wav inference
+pitch_extractor: parselmouth
+loud_norm: false
+# Dataset handling.
+endless_ds: true
+test_num: 100
+min_frames: 0
+max_frames: 1548
+ds_workers: 1
+gen_dir_name: ''
+# Run control and logging.
+accumulate_grad_batches: 1
+tb_log_interval: 100
+print_nan_grads: false
+work_dir: ''  # experiment directory.
+infer: false  # inference
+amp: false
+debug: false
+save_codes: []
+# Checkpoints and epochs.
+save_best: true
+num_ckpt_keep: 3
+sort_by_len: true
+load_ckpt: ''
+check_val_every_n_epoch: 10
+max_epochs: 1000
+eval_max_batches: -1
+resume_from_checkpoint: 0
+rename_tmux: true
+# Validation monitor.
+valid_monitor_key: 'val_loss'
+valid_monitor_mode: 'min'
+# Split names.
+train_set_name: 'train'
+train_sets: ''
+valid_set_name: 'valid'
+test_set_name: 'test'
+seed: 1234