RayeRen committed
Commit d1b91e7 · 0 Parent(s)
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +148 -0
  3. README.md +9 -0
  4. checkpoints/fs2_exp/config.yaml +219 -0
  5. checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt +3 -0
  6. checkpoints/hifi_lj/config.yaml +207 -0
  7. checkpoints/hifi_lj/model_ckpt_steps_2076000.ckpt +3 -0
  8. checkpoints/ps_normal_exp/config.yaml +258 -0
  9. checkpoints/ps_normal_exp/model_ckpt_steps_278000.ckpt +3 -0
  10. checkpoints/ps_small_exp/config.yaml +258 -0
  11. checkpoints/ps_small_exp/model_ckpt_steps_410000.ckpt +3 -0
  12. data/binary/ljspeech/phone_set.json +1 -0
  13. data/binary/ljspeech/spk_map.json +1 -0
  14. data/binary/ljspeech/word_set.json +0 -0
  15. data/binary/ljspeech_cwt/phone_set.json +1 -0
  16. data/binary/ljspeech_cwt/spk_map.json +1 -0
  17. data/binary/ljspeech_cwt/word_set.json +0 -0
  18. data_gen/tts/base_binarizer.py +225 -0
  19. data_gen/tts/base_preprocess.py +251 -0
  20. data_gen/tts/binarizer_zh.py +25 -0
  21. data_gen/tts/runs/adapt_mfa_align.py +18 -0
  22. data_gen/tts/runs/align_and_binarize.py +12 -0
  23. data_gen/tts/runs/binarize.py +17 -0
  24. data_gen/tts/runs/preprocess.py +17 -0
  25. data_gen/tts/runs/train_mfa_align.py +46 -0
  26. data_gen/tts/txt_processors/__init__.py +1 -0
  27. data_gen/tts/txt_processors/base_text_processor.py +48 -0
  28. data_gen/tts/txt_processors/en.py +78 -0
  29. data_gen/tts/wav_processors/__init__.py +2 -0
  30. data_gen/tts/wav_processors/base_processor.py +25 -0
  31. data_gen/tts/wav_processors/common_processors.py +86 -0
  32. docs/fastspeech2.md +53 -0
  33. docs/framework.md +106 -0
  34. docs/portaspeech.md +61 -0
  35. docs/prepare_data.md +25 -0
  36. docs/prepare_vocoder.md +49 -0
  37. egs/datasets/audio/lj/base_mel2wav.yaml +4 -0
  38. egs/datasets/audio/lj/base_text2mel.yaml +16 -0
  39. egs/datasets/audio/lj/fs.yaml +3 -0
  40. egs/datasets/audio/lj/fs2_orig.yaml +4 -0
  41. egs/datasets/audio/lj/hifigan.yaml +3 -0
  42. egs/datasets/audio/lj/preprocess.py +9 -0
  43. egs/datasets/audio/lj/ps_flow.yaml +3 -0
  44. egs/datasets/audio/lj/ps_flow_nips2021.yaml +11 -0
  45. egs/datasets/audio/lj/ps_flow_small.yaml +3 -0
  46. egs/datasets/audio/lj/ps_flow_small_nips2021.yaml +11 -0
  47. egs/egs_bases/config_base.yaml +41 -0
  48. egs/egs_bases/tts/base.yaml +56 -0
  49. egs/egs_bases/tts/dataset_params.yaml +52 -0
  50. egs/egs_bases/tts/fs.yaml +75 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,148 @@
+ ### Project ignore
+
+ infer_out
+ flagged
+ rsync
+ .idea
+ .DS_Store
+ bak
+ tmp
+ *.tar.gz
+ mos
+ nbs
+ /configs_usr/*
+ !/configs_usr/.gitkeep
+ /egs_usr/*
+ !/egs_usr/.gitkeep
+ /rnnoise
+ #/usr/*
+ #!/usr/.gitkeep
+ scripts_usr
+
+ # Created by .ignore support plugin (hsz.mobi)
+ ### Python template
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+ datasets/remi/test/
README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ title: FastSpeech2
+ emoji: 🤗
+ colorFrom: yellow
+ colorTo: orange
+ sdk: gradio
+ app_file: "inference/tts/gradio/infer.py"
+ pinned: false
+ ---
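The YAML front matter above configures this repo as a Hugging Face Space: `sdk: gradio` makes the Space launch `inference/tts/gradio/infer.py` as a gradio app. For orientation, a minimal sketch of such an entry point with the TTS call stubbed out; `tts_stub` is illustrative, not this repo's inference API (the real app would run the FastSpeech 2 checkpoint through the HifiGAN vocoder below):

# A minimal gradio entry point in the shape the `app_file` field expects.
# The synthesis call is a placeholder sine tone, not the repo's inference code.
import numpy as np
import gradio as gr


def tts_stub(text: str):
    sr = 22050  # matches audio_sample_rate in the configs below
    t = np.linspace(0, 1.0, sr, endpoint=False)
    wav = 0.1 * np.sin(2 * np.pi * 220.0 * t)  # stand-in for model output
    return sr, wav.astype(np.float32)


demo = gr.Interface(fn=tts_stub, inputs='text', outputs='audio', title='FastSpeech2')

if __name__ == '__main__':
    demo.launch()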
checkpoints/fs2_exp/config.yaml ADDED
@@ -0,0 +1,219 @@
+ accumulate_grad_batches: 1
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - egs/egs_bases/tts/fs2_orig.yaml
+ - ./base_text2mel.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: true
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech_cwt
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ cwt_std_scale: 1.0
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decoder_rnn_dim: 0
+ decoder_type: fft
+ dropout: 0.0
+ ds_workers: 2
+ dur_predictor_kernel: 3
+ dur_predictor_layers: 2
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 9
+ enc_kernel_size: 5
+ enc_layers: 4
+ enc_post_net_kernel: 3
+ enc_pre_ln: true
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 800
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 1024
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 1
+ gen_dir_name: ''
+ griffin_lim_iters: 30
+ hidden_size: 256
+ hop_size: 256
+ infer: false
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 1.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.0005
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 128
+ max_tokens: 40000
+ max_updates: 160000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: cwt
+ predictor_dropout: 0.5
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ scheduler: warmup
+ seed: 1234
+ sort_by_len: true
+ task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ train_set_name: train
+ train_sets: ''
+ use_energy_embed: true
+ use_gt_dur: false
+ use_gt_energy: false
+ use_gt_f0: false
+ use_pitch_embed: true
+ use_pos_embed: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_uv: true
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 4000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ work_dir: checkpoints/fs2_exp
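This config is a flat snapshot of the experiment's hyperparameters; `base_config` records the YAML files it was composed from. A minimal sketch of reading it back with plain PyYAML (the repo's own `set_hparams` in utils/commons/hparams.py additionally handles base-config merging and CLI overrides, which this skips):

# Load the saved experiment config as a plain dict; a sketch, not the
# repo's hparams machinery.
import yaml

with open('checkpoints/fs2_exp/config.yaml') as f:
    config = yaml.safe_load(f)

print(config['task_cls'])         # tasks.tts.fs2_orig.FastSpeech2OrigTask
print(config['binary_data_dir'])  # data/binary/ljspeech_cwt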
checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d4f450bb3115e04b4ea93eed8c9318f08d01582bed1dd86886b32d50601dc58
+ size 108423039
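This three-line entry (and the other .ckpt entries below) is a Git LFS pointer, not the checkpoint itself; the binary weights arrive only after `git lfs pull`, as set up by the .gitattributes rule above. A quick check for which form is on disk:

# Distinguish a git-lfs pointer stub from the real binary checkpoint.
def is_lfs_pointer(path: str) -> bool:
    with open(path, 'rb') as f:
        return f.read(12) == b'version http'


print(is_lfs_pointer('checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt'))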
checkpoints/hifi_lj/config.yaml ADDED
@@ -0,0 +1,207 @@
+ accumulate_grad_batches: 1
+ adam_b1: 0.8
+ adam_b2: 0.99
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - configs/tts/hifigan.yaml
+ - configs/tts/lj/base_mel2wav.yaml
+ binarization_args:
+   shuffle: false
+   trim_eos_bos: false
+   trim_sil: false
+   with_align: false
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   with_spk_embed: false
+   with_txt: true
+   with_wav: true
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech_wav
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ debug: false
+ dec_ffn_kernel_size: 9
+ dec_layers: 4
+ dict_dir: ''
+ disc_start_steps: 40000
+ discriminator_grad_norm: 1
+ discriminator_optimizer_params:
+   eps: 1.0e-06
+   lr: 0.0002
+   weight_decay: 0.0
+ discriminator_params:
+   bias: true
+   conv_channels: 64
+   in_channels: 1
+   kernel_size: 3
+   layers: 10
+   nonlinear_activation: LeakyReLU
+   nonlinear_activation_params:
+     negative_slope: 0.2
+   out_channels: 1
+   use_weight_norm: true
+ discriminator_scheduler_params:
+   gamma: 0.999
+   step_size: 600
+ dropout: 0.1
+ ds_workers: 1
+ enc_ffn_kernel_size: 9
+ enc_layers: 4
+ endless_ds: true
+ ffn_act: gelu
+ ffn_padding: SAME
+ fft_size: 1024
+ fm_loss: false
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 1
+ gen_dir_name: ''
+ generator_grad_norm: 10
+ generator_optimizer_params:
+   eps: 1.0e-06
+   lr: 0.0002
+   weight_decay: 0.0
+ generator_params:
+   aux_channels: 80
+   aux_context_window: 0
+   dropout: 0.0
+   gate_channels: 128
+   in_channels: 1
+   kernel_size: 3
+   layers: 30
+   out_channels: 1
+   residual_channels: 64
+   skip_channels: 64
+   stacks: 3
+   upsample_net: ConvInUpsampleNetwork
+   upsample_params:
+     upsample_scales:
+     - 4
+     - 4
+     - 4
+     - 4
+   use_nsf: false
+   use_pitch_embed: false
+   use_weight_norm: true
+ generator_scheduler_params:
+   gamma: 0.999
+   step_size: 600
+ griffin_lim_iters: 60
+ hidden_size: 256
+ hop_size: 256
+ infer: false
+ lambda_adv: 4.0
+ lambda_mel: 45.0
+ load_ckpt: ''
+ loud_norm: false
+ lr: 2.0
+ max_epochs: 1000
+ max_eval_sentences: 1
+ max_eval_tokens: 60000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_samples: 8192
+ max_sentences: 24
+ max_tokens: 30000
+ max_updates: 3000000
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_level_db: -100
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_mels: 80
+ num_sanity_val_steps: 5
+ num_spk: 1
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pre_align_args:
+   allow_no_txt: false
+   denoise: false
+   forced_align: mfa
+   sox_resample: false
+   trim_sil: false
+   txt_processor: en
+   use_tone: true
+ pre_align_cls: ''
+ print_nan_grads: false
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_level_db: 20
+ rerun_gen: true
+ resblock: '1'
+ resblock_dilation_sizes:
+ - - 1
+   - 3
+   - 5
+ - - 1
+   - 3
+   - 5
+ - - 1
+   - 3
+   - 5
+ resblock_kernel_sizes:
+ - 3
+ - 7
+ - 11
+ reset_phone_dict: true
+ resume_from_checkpoint: 0
+ sampling_rate: 22050
+ save_best: true
+ save_codes: []
+ save_f0: false
+ save_gt: true
+ seed: 1234
+ sort_by_len: true
+ stft_loss_params:
+   fft_sizes:
+   - 1024
+   - 2048
+   - 512
+   hop_sizes:
+   - 120
+   - 240
+   - 50
+   win_lengths:
+   - 600
+   - 1200
+   - 240
+   window: hann_window
+ stop_token_weight: 5.0
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
+ tb_log_interval: 100
+ test_input_dir: ''
+ test_num: 100
+ test_set_name: test
+ train_set_name: train
+ upsample_initial_channel: 512
+ upsample_kernel_sizes:
+ - 16
+ - 16
+ - 4
+ - 4
+ upsample_rates:
+ - 8
+ - 8
+ - 2
+ - 2
+ use_mel_loss: false
+ use_pitch_embed: false
+ val_check_interval: 2000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: pwg
+ vocoder_ckpt: ''
+ warmup_updates: 8000
+ weight_decay: 0
+ win_length: null
+ win_size: 1024
+ window: hann
+ work_dir: checkpoints/0414_hifi_lj_1
checkpoints/hifi_lj/model_ckpt_steps_2076000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bbc40f0471a92394f6bf057820cf66a1f50d29db22c997341448bd496a0792d
+ size 55786088
checkpoints/ps_normal_exp/config.yaml ADDED
@@ -0,0 +1,258 @@
+ accumulate_grad_batches: 1
+ add_word_pos: true
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - ./ps_flow.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decoder_rnn_dim: 0
+ decoder_type: conv
+ detach_postflow_input: true
+ dropout: 0.0
+ ds_workers: 2
+ dur_level: word
+ dur_predictor_kernel: 5
+ dur_predictor_layers: 3
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 5
+ enc_kernel_size: 5
+ enc_layers: 4
+ enc_post_net_kernel: 3
+ enc_pre_ln: false
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: rel_fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 800
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 768
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 4
+ fvae_dec_n_layers: 4
+ fvae_decoder_type: wn
+ fvae_enc_dec_hidden: 192
+ fvae_enc_n_layers: 8
+ fvae_encoder_type: wn
+ fvae_kernel_size: 5
+ fvae_noise_scale: 1.0
+ fvae_strides: 4
+ gen_dir_name: ''
+ glow_kernel_size: 3
+ griffin_lim_iters: 30
+ hidden_size: 192
+ hop_size: 256
+ infer: false
+ infer_post_glow: true
+ kl_min: 0.0
+ kl_start_steps: 10000
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_kl: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 0.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ latent_size: 16
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.0002
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 64
+ max_tokens: 40000
+ max_updates: 480000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ noise_scale: 0.8
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: frame
+ post_decoder: false
+ post_decoder_detach_ling: false
+ post_flow_lr: 0.001
+ post_glow_hidden: 192
+ post_glow_kernel_size: 3
+ post_glow_n_block_layers: 3
+ post_glow_n_blocks: 12
+ post_glow_training_start: 160000
+ post_share_cond_layers: false
+ posterior_start_steps: 0
+ predictor_dropout: 0.2
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ prior_glow_hidden: 64
+ prior_glow_n_blocks: 4
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ scheduler: warmup
+ seed: 1234
+ share_wn_layers: 4
+ sigmoid_scale: false
+ sort_by_len: true
+ task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ text_encoder_postnet: false
+ train_set_name: train
+ train_sets: ''
+ two_stage: true
+ use_cond_proj: false
+ use_fvae: true
+ use_gt_dur: false
+ use_gt_f0: false
+ use_latent_cond: false
+ use_pitch_embed: false
+ use_pos_embed: true
+ use_post_flow: true
+ use_prior_flow: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_txt_cond: true
+ use_uv: true
+ use_word_encoder: false
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 8000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ word_enc_layers: 4
+ word_encoder_type: rel_fft
+ work_dir: checkpoints/ps_normal_exp
checkpoints/ps_normal_exp/model_ckpt_steps_278000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13a51035b84c2a385d05ce695f6dca0b5095e7bd7ea3b1d34a22aed4d9c9b5fc
+ size 104081102
checkpoints/ps_small_exp/config.yaml ADDED
@@ -0,0 +1,258 @@
+ accumulate_grad_batches: 1
+ add_word_pos: true
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - ./ps_flow_small.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decoder_rnn_dim: 0
+ decoder_type: conv
+ detach_postflow_input: true
+ dropout: 0.0
+ ds_workers: 2
+ dur_level: word
+ dur_predictor_kernel: 5
+ dur_predictor_layers: 3
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 3
+ enc_kernel_size: 5
+ enc_layers: 3
+ enc_post_net_kernel: 3
+ enc_pre_ln: false
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: rel_fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 800
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 512
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 4
+ fvae_dec_n_layers: 3
+ fvae_decoder_type: wn
+ fvae_enc_dec_hidden: 128
+ fvae_enc_n_layers: 8
+ fvae_encoder_type: wn
+ fvae_kernel_size: 3
+ fvae_noise_scale: 1.0
+ fvae_strides: 4
+ gen_dir_name: ''
+ glow_kernel_size: 3
+ griffin_lim_iters: 30
+ hidden_size: 128
+ hop_size: 256
+ infer: false
+ infer_post_glow: true
+ kl_min: 0.0
+ kl_start_steps: 10000
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_kl: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 0.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ latent_size: 16
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.0002
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 128
+ max_tokens: 40000
+ max_updates: 480000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ noise_scale: 0.6
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: frame
+ post_decoder: false
+ post_decoder_detach_ling: false
+ post_flow_lr: 0.001
+ post_glow_hidden: 128
+ post_glow_kernel_size: 3
+ post_glow_n_block_layers: 3
+ post_glow_n_blocks: 8
+ post_glow_training_start: 160000
+ post_share_cond_layers: false
+ posterior_start_steps: 0
+ predictor_dropout: 0.2
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ prior_glow_hidden: 32
+ prior_glow_n_blocks: 3
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ scheduler: warmup
+ seed: 1234
+ share_wn_layers: 4
+ sigmoid_scale: false
+ sort_by_len: true
+ task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ text_encoder_postnet: false
+ train_set_name: train
+ train_sets: ''
+ two_stage: true
+ use_cond_proj: false
+ use_fvae: true
+ use_gt_dur: false
+ use_gt_f0: false
+ use_latent_cond: false
+ use_pitch_embed: false
+ use_pos_embed: true
+ use_post_flow: true
+ use_prior_flow: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_txt_cond: true
+ use_uv: true
+ use_word_encoder: false
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 8000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ word_enc_layers: 3
+ word_encoder_type: rel_fft
+ work_dir: checkpoints/ps_small_exp
checkpoints/ps_small_exp/model_ckpt_steps_410000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6905d8969febca192f2239a99e833d9084b2e07cb6894a63e286901ab1d16553
+ size 32754716
data/binary/ljspeech/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data/binary/ljspeech/spk_map.json ADDED
@@ -0,0 +1 @@
+ {"<SINGLE_SPK>": 0}
data/binary/ljspeech/word_set.json ADDED
The diff for this file is too large to render.
data/binary/ljspeech_cwt/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data/binary/ljspeech_cwt/spk_map.json ADDED
@@ -0,0 +1 @@
+ {"<SINGLE_SPK>": 0}
data/binary/ljspeech_cwt/word_set.json ADDED
The diff for this file is too large to render.
data_gen/tts/base_binarizer.py ADDED
@@ -0,0 +1,225 @@
+ import json
+ import os
+ import random
+ import traceback
+ from functools import partial
+
+ import numpy as np
+ from resemblyzer import VoiceEncoder
+ from tqdm import tqdm
+
+ import utils.commons.single_thread_env  # NOQA
+ from utils.audio import librosa_wav2spec
+ from utils.audio.align import get_mel2ph, mel2token_to_dur
+ from utils.audio.cwt import get_lf0_cwt, get_cont_lf0
+ from utils.audio.pitch.utils import f0_to_coarse
+ from utils.audio.pitch_extractors import extract_pitch_simple
+ from utils.commons.hparams import hparams
+ from utils.commons.indexed_datasets import IndexedDatasetBuilder
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
+ from utils.os_utils import remove_file, copy_file
+
+ np.seterr(divide='ignore', invalid='ignore')
+
+
+ class BinarizationError(Exception):
+     pass
+
+
+ class BaseBinarizer:
+     def __init__(self, processed_data_dir=None):
+         if processed_data_dir is None:
+             processed_data_dir = hparams['processed_data_dir']
+         self.processed_data_dir = processed_data_dir
+         self.binarization_args = hparams['binarization_args']
+         self.items = {}
+         self.item_names = []
+
+     def load_meta_data(self):
+         processed_data_dir = self.processed_data_dir
+         items_list = json.load(open(f"{processed_data_dir}/metadata.json"))
+         for r in tqdm(items_list, desc='Loading meta data.'):
+             item_name = r['item_name']
+             self.items[item_name] = r
+             self.item_names.append(item_name)
+         if self.binarization_args['shuffle']:
+             random.seed(1234)
+             random.shuffle(self.item_names)
+
+     @property
+     def train_item_names(self):
+         range_ = self._convert_range(self.binarization_args['train_range'])
+         return self.item_names[range_[0]:range_[1]]
+
+     @property
+     def valid_item_names(self):
+         range_ = self._convert_range(self.binarization_args['valid_range'])
+         return self.item_names[range_[0]:range_[1]]
+
+     @property
+     def test_item_names(self):
+         range_ = self._convert_range(self.binarization_args['test_range'])
+         return self.item_names[range_[0]:range_[1]]
+
+     def _convert_range(self, range_):
+         if range_[1] == -1:
+             range_[1] = len(self.item_names)
+         return range_
+
+     def meta_data(self, prefix):
+         if prefix == 'valid':
+             item_names = self.valid_item_names
+         elif prefix == 'test':
+             item_names = self.test_item_names
+         else:
+             item_names = self.train_item_names
+         for item_name in item_names:
+             yield self.items[item_name]
+
+     def process(self):
+         self.load_meta_data()
+         os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+         for fn in ['phone_set.json', 'word_set.json', 'spk_map.json']:
+             remove_file(f"{hparams['binary_data_dir']}/{fn}")
+             copy_file(f"{hparams['processed_data_dir']}/{fn}", f"{hparams['binary_data_dir']}/{fn}")
+         self.process_data('valid')
+         self.process_data('test')
+         self.process_data('train')
+
+     def process_data(self, prefix):
+         data_dir = hparams['binary_data_dir']
+         builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+         meta_data = list(self.meta_data(prefix))
+         process_item = partial(self.process_item, binarization_args=self.binarization_args)
+         ph_lengths = []
+         mel_lengths = []
+         total_sec = 0
+         items = []
+         args = [{'item': item} for item in meta_data]
+         for item_id, item in multiprocess_run_tqdm(process_item, args, desc='Processing data'):
+             if item is not None:
+                 items.append(item)
+         if self.binarization_args['with_spk_embed']:
+             args = [{'wav': item['wav']} for item in items]
+             for item_id, spk_embed in multiprocess_run_tqdm(
+                     self.get_spk_embed, args,
+                     init_ctx_func=lambda wid: {'voice_encoder': VoiceEncoder().cuda()}, num_workers=4,
+                     desc='Extracting spk embed'):
+                 items[item_id]['spk_embed'] = spk_embed
+
+         for item in items:
+             if not self.binarization_args['with_wav'] and 'wav' in item:
+                 del item['wav']
+             builder.add_item(item)
+             mel_lengths.append(item['len'])
+             assert item['len'] > 0, (item['item_name'], item['txt'], item['mel2ph'])
+             if 'ph_len' in item:
+                 ph_lengths.append(item['ph_len'])
+             total_sec += item['sec']
+         builder.finalize()
+         np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
+         if len(ph_lengths) > 0:
+             np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
+         print(f"| {prefix} total duration: {total_sec:.3f}s")
+
+     @classmethod
+     def process_item(cls, item, binarization_args):
+         item['ph_len'] = len(item['ph_token'])
+         item_name = item['item_name']
+         wav_fn = item['wav_fn']
+         wav, mel = cls.process_audio(wav_fn, item, binarization_args)
+         try:
+             n_bos_frames, n_eos_frames = 0, 0
+             if binarization_args['with_align']:
+                 tg_fn = f"{hparams['processed_data_dir']}/mfa_outputs/{item_name}.TextGrid"
+                 item['tg_fn'] = tg_fn
+                 cls.process_align(tg_fn, item)
+                 if binarization_args['trim_eos_bos']:
+                     n_bos_frames = item['dur'][0]
+                     n_eos_frames = item['dur'][-1]
+                     T = len(mel)
+                     item['mel'] = mel[n_bos_frames:T - n_eos_frames]
+                     item['mel2ph'] = item['mel2ph'][n_bos_frames:T - n_eos_frames]
+                     item['mel2word'] = item['mel2word'][n_bos_frames:T - n_eos_frames]
+                     item['dur'] = item['dur'][1:-1]
+                     item['dur_word'] = item['dur_word'][1:-1]
+                     item['len'] = item['mel'].shape[0]
+                     item['wav'] = wav[n_bos_frames * hparams['hop_size']:len(wav) - n_eos_frames * hparams['hop_size']]
+             if binarization_args['with_f0']:
+                 cls.process_pitch(item, n_bos_frames, n_eos_frames)
+         except BinarizationError as e:
+             print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+             return None
+         except Exception as e:
+             traceback.print_exc()
+             print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
+             return None
+         return item
+
+     @classmethod
+     def process_audio(cls, wav_fn, res, binarization_args):
+         wav2spec_dict = librosa_wav2spec(
+             wav_fn,
+             fft_size=hparams['fft_size'],
+             hop_size=hparams['hop_size'],
+             win_length=hparams['win_size'],
+             num_mels=hparams['audio_num_mel_bins'],
+             fmin=hparams['fmin'],
+             fmax=hparams['fmax'],
+             sample_rate=hparams['audio_sample_rate'],
+             loud_norm=hparams['loud_norm'])
+         mel = wav2spec_dict['mel']
+         wav = wav2spec_dict['wav'].astype(np.float16)
+         if binarization_args['with_linear']:
+             res['linear'] = wav2spec_dict['linear']
+         res.update({'mel': mel, 'wav': wav, 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
+         return wav, mel
+
+     @staticmethod
+     def process_align(tg_fn, item):
+         ph = item['ph']
+         mel = item['mel']
+         ph_token = item['ph_token']
+         if tg_fn is not None and os.path.exists(tg_fn):
+             mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams['hop_size'], hparams['audio_sample_rate'],
+                                      hparams['binarization_args']['min_sil_duration'])
+         else:
+             raise BinarizationError(f"Align not found")
+         if np.array(mel2ph).max() - 1 >= len(ph_token):
+             raise BinarizationError(
+                 f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(ph_token)}")
+         item['mel2ph'] = mel2ph
+         item['dur'] = dur
+
+         ph2word = item['ph2word']
+         mel2word = [ph2word[p - 1] for p in item['mel2ph']]
+         item['mel2word'] = mel2word  # [T_mel]
+         dur_word = mel2token_to_dur(mel2word, len(item['word_token']))
+         item['dur_word'] = dur_word.tolist()  # [T_word]
+
+     @staticmethod
+     def process_pitch(item, n_bos_frames, n_eos_frames):
+         wav, mel = item['wav'], item['mel']
+         f0 = extract_pitch_simple(item['wav'])
+         if sum(f0) == 0:
+             raise BinarizationError("Empty f0")
+         assert len(mel) == len(f0), (len(mel), len(f0))
+         pitch_coarse = f0_to_coarse(f0)
+         item['f0'] = f0
+         item['pitch'] = pitch_coarse
+         if hparams['binarization_args']['with_f0cwt']:
+             uv, cont_lf0_lpf = get_cont_lf0(f0)
+             logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+             cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+             cwt_spec, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+             item['cwt_spec'] = cwt_spec
+             item['cwt_mean'] = logf0s_mean_org
+             item['cwt_std'] = logf0s_std_org
+
+     @staticmethod
+     def get_spk_embed(wav, ctx):
+         return ctx['voice_encoder'].embed_utterance(wav.astype(float))
+
+     @property
+     def num_workers(self):
+         return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
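The `*_item_names` properties above are what consume the `test_range`/`valid_range`/`train_range` values in the configs (test = items 0-522, valid = 523-870, train = 871 onward). A self-contained illustration of that slicing, assuming LJSpeech's 13,100 clips:

# How BaseBinarizer's range slicing partitions the item list.
item_names = [f'item_{i}' for i in range(13100)]  # LJSpeech has 13,100 clips


def convert_range(r, n_items):
    lo, hi = r
    return lo, n_items if hi == -1 else hi


for split, r in [('test', [0, 523]), ('valid', [523, 871]), ('train', [871, -1])]:
    lo, hi = convert_range(r, len(item_names))
    print(split, hi - lo)  # test 523, valid 348, train 12229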
data_gen/tts/base_preprocess.py ADDED
@@ -0,0 +1,251 @@
+ import json
+ import os
+ import random
+ import re
+ import traceback
+ from collections import Counter
+ from functools import partial
+
+ import librosa
+ from tqdm import tqdm
+ from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
+ from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
+ from utils.commons.hparams import hparams
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
+ from utils.os_utils import link_file, move_file, remove_file
+ from utils.text.text_encoder import is_sil_phoneme, build_token_encoder
+
+
+ class BasePreprocessor:
+     def __init__(self):
+         self.preprocess_args = hparams['preprocess_args']
+         txt_processor = self.preprocess_args['txt_processor']
+         self.txt_processor = get_txt_processor_cls(txt_processor)
+         self.raw_data_dir = hparams['raw_data_dir']
+         self.processed_dir = hparams['processed_data_dir']
+         self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
+
+     def meta_data(self):
+         """
+
+         :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
+         """
+         raise NotImplementedError
+
+     def process(self):
+         processed_dir = self.processed_dir
+         wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
+         remove_file(wav_processed_tmp_dir)
+         os.makedirs(wav_processed_tmp_dir, exist_ok=True)
+         wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
+         remove_file(wav_processed_dir)
+         os.makedirs(wav_processed_dir, exist_ok=True)
+
+         meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
+         item_names = [d['item_name'] for d in meta_data]
+         assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
+
+         # preprocess data
+         phone_list = []
+         word_list = []
+         spk_names = set()
+         process_item = partial(self.preprocess_first_pass,
+                                txt_processor=self.txt_processor,
+                                wav_processed_dir=wav_processed_dir,
+                                wav_processed_tmp=wav_processed_tmp_dir,
+                                preprocess_args=self.preprocess_args)
+         items = []
+         args = [{
+             'item_name': item_raw['item_name'],
+             'txt_raw': item_raw['txt'],
+             'wav_fn': item_raw['wav_fn'],
+             'txt_loader': item_raw.get('txt_loader'),
+             'others': item_raw.get('others', None)
+         } for item_raw in meta_data]
+         for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
+             if item is not None:
+                 item_.update(item)
+                 item = item_
+                 if 'txt_loader' in item:
+                     del item['txt_loader']
+                 item['id'] = item_id
+                 item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
+                 item['others'] = item.get('others', None)
+                 phone_list += item['ph'].split(" ")
+                 word_list += item['word'].split(" ")
+                 spk_names.add(item['spk_name'])
+                 items.append(item)
+
+         # add encoded tokens
+         ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
+         spk_map = self.build_spk_map(spk_names)
+         args = [{
+             'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
+             'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
+         } for item in items]
+         for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
+             items[idx].update(item_new_kv)
+
+         # build mfa data
+         if self.preprocess_args['use_mfa']:
+             mfa_dict = set()
+             mfa_input_dir = f'{processed_dir}/mfa_inputs'
+             remove_file(mfa_input_dir)
+             # group MFA inputs for better parallelism
+             mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
+             if self.preprocess_args['mfa_group_shuffle']:
+                 random.seed(hparams['seed'])
+                 random.shuffle(mfa_groups)
+             args = [{
+                 'item': item, 'mfa_input_dir': mfa_input_dir,
+                 'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
+                 'preprocess_args': self.preprocess_args
+             } for item, mfa_group in zip(items, mfa_groups)]
+             for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
+                     self.build_mfa_inputs, args, desc='Build MFA data'):
+                 items[i]['wav_align_fn'] = new_wav_align_fn
+                 for w in ph_gb_word_nosil.split(" "):
+                     mfa_dict.add(f"{w} {w.replace('_', ' ')}")
+             mfa_dict = sorted(mfa_dict)
+             with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
+                 f.writelines([f'{l}\n' for l in mfa_dict])
+         with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
+             f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
+         remove_file(wav_processed_tmp_dir)
+
+     @classmethod
+     def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
+                               wav_fn, wav_processed_dir, wav_processed_tmp,
+                               preprocess_args, txt_loader=None, others=None):
+         try:
+             if txt_loader is not None:
+                 txt_raw = txt_loader(txt_raw)
+             ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
+             wav_fn, wav_align_fn = cls.process_wav(
+                 item_name, wav_fn,
+                 hparams['processed_data_dir'],
+                 wav_processed_tmp, preprocess_args)
+
+             # wav for binarization
+             ext = os.path.splitext(wav_fn)[1]
+             os.makedirs(wav_processed_dir, exist_ok=True)
+             new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
+             move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
+             move_link_func(wav_fn, new_wav_fn)
+             return {
+                 'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
+                 'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
+                 'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
+                 'others': others
+             }
+         except:
+             traceback.print_exc()
+             print(f"| Error is caught. item_name: {item_name}.")
+             return None
+
+     @staticmethod
+     def txt_to_ph(txt_processor, txt_raw, preprocess_args):
+         txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
+         ph = [p for w in txt_struct for p in w[1]]
+         ph_gb_word = ["_".join(w[1]) for w in txt_struct]
+         words = [w[0] for w in txt_struct]
+         # word_id=0 is reserved for padding
+         ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
+         return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)
+
+     @staticmethod
+     def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
+         processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
+         processors = [k() for k in processors if k is not None]
+         if len(processors) >= 1:
+             sr_file = librosa.core.get_samplerate(wav_fn)
+             output_fn_for_align = None
+             ext = os.path.splitext(wav_fn)[1]
+             input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
+             link_file(wav_fn, input_fn)
+             for p in processors:
+                 outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
+                 if len(outputs) == 3:
+                     input_fn, sr, output_fn_for_align = outputs
+                 else:
+                     input_fn, sr = outputs
+             return input_fn, output_fn_for_align
+         else:
+             return wav_fn, wav_fn
+
+     def _phone_encoder(self, ph_set):
+         ph_set_fn = f"{self.processed_dir}/phone_set.json"
+         if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+             ph_set = sorted(set(ph_set))
+             json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
+             print("| Build phone set: ", ph_set)
+         else:
+             ph_set = json.load(open(ph_set_fn, 'r'))
+             print("| Load phone set: ", ph_set)
+         return build_token_encoder(ph_set_fn)
+
+     def _word_encoder(self, word_set):
+         word_set_fn = f"{self.processed_dir}/word_set.json"
+         if self.preprocess_args['reset_word_dict']:
+             word_set = Counter(word_set)
+             total_words = sum(word_set.values())
+             word_set = word_set.most_common(hparams['word_dict_size'])
+             num_unk_words = total_words - sum([x[1] for x in word_set])
+             word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
+             word_set = sorted(set(word_set))
+             json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
+             print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                   f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+         else:
+             word_set = json.load(open(word_set_fn, 'r'))
+             print("| Load word set. Size: ", len(word_set), word_set[:10])
+         return build_token_encoder(word_set_fn)
+
+     @classmethod
+     def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
+         word_token = word_encoder.encode(word)
+         ph_token = ph_encoder.encode(ph)
+         spk_id = spk_map[spk_name]
+         return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
+
+     def build_spk_map(self, spk_names):
+         spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
+         assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+         print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
+         json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
+         return spk_map
+
+     @classmethod
+     def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
+         item_name = item['item_name']
+         wav_align_fn = item['wav_align_fn']
+         ph_gb_word = item['ph_gb_word']
+         ext = os.path.splitext(wav_align_fn)[1]
+         mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
+         os.makedirs(mfa_input_group_dir, exist_ok=True)
+         new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
+         move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
+         move_link_func(wav_align_fn, new_wav_align_fn)
+         ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
+                                      for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
+         with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
+             f_txt.write(ph_gb_word_nosil)
+         return ph_gb_word_nosil, new_wav_align_fn
+
+     def load_spk_map(self, base_dir):
+         spk_map_fn = f"{base_dir}/spk_map.json"
+         spk_map = json.load(open(spk_map_fn, 'r'))
+         return spk_map
+
+     def load_dict(self, base_dir):
+         ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
+         word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
+         return ph_encoder, word_encoder
+
+     @property
+     def meta_csv_filename(self):
+         return 'metadata'
+
+     @property
+     def wav_processed_dirname(self):
+         return 'wav_processed'
@@ -0,0 +1,25 @@
+ import numpy as np
+ from data_gen.tts.base_binarizer import BaseBinarizer
+
+
+ class ZhBinarizer(BaseBinarizer):
+     @staticmethod
+     def process_align(tg_fn, item):
+         BaseBinarizer.process_align(tg_fn, item)
+         # char-level pitch
+         if 'f0' in item:
+             ph_list = item['ph'].split(" ")
+             item['f0_ph'] = np.array([0 for _ in item['f0']], dtype=float)
+             char_start_idx = 0
+             f0s_char = []
+             for idx, (f0_, ph_idx) in enumerate(zip(item['f0'], item['mel2ph'])):
+                 is_pinyin = ph_list[ph_idx - 1][0].isalpha()
+                 if not is_pinyin or ph_idx - item['mel2ph'][idx - 1] > 1:
+                     if len(f0s_char) > 0:
+                         item['f0_ph'][char_start_idx:idx] = sum(f0s_char) / len(f0s_char)
+                     f0s_char = []
+                     char_start_idx = idx
+                     if not is_pinyin:
+                         char_start_idx += 1
+                 if f0_ > 0:
+                     f0s_char.append(f0_)
data_gen/tts/runs/adapt_mfa_align.py ADDED
@@ -0,0 +1,18 @@
+ import utils.commons.single_thread_env  # NOQA
+ import os
+ import subprocess
+ from utils.commons.hparams import hparams, set_hparams
+
+
+ def adapt_mfa_align():
+     CORPUS = hparams['processed_data_dir'].split("/")[-1]
+     print(f"| Run MFA for {CORPUS}.")
+     NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
+     subprocess.check_call(
+         f'CORPUS={CORPUS} NUM_JOB={NUM_JOB} bash scripts/run_mfa_adapt.sh',
+         shell=True)
+
+
+ if __name__ == '__main__':
+     set_hparams(print_hparams=False)
+     adapt_mfa_align()
data_gen/tts/runs/align_and_binarize.py ADDED
@@ -0,0 +1,12 @@
+ import utils.commons.single_thread_env  # NOQA
+ from utils.commons.hparams import set_hparams, hparams
+ from data_gen.tts.runs.binarize import binarize
+ from data_gen.tts.runs.preprocess import preprocess
+ from data_gen.tts.runs.train_mfa_align import train_mfa_align
+
+ if __name__ == '__main__':
+     set_hparams()
+     preprocess()
+     if hparams['preprocess_args']['use_mfa']:
+         train_mfa_align()
+     binarize()
data_gen/tts/runs/binarize.py ADDED
@@ -0,0 +1,17 @@
+ import utils.commons.single_thread_env  # NOQA
+ from utils.commons.hparams import hparams, set_hparams
+ import importlib
+
+
+ def binarize():
+     binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
+     pkg = ".".join(binarizer_cls.split(".")[:-1])
+     cls_name = binarizer_cls.split(".")[-1]
+     binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
+     print("| Binarizer: ", binarizer_cls)
+     binarizer_cls().process()
+
+
+ if __name__ == '__main__':
+     set_hparams()
+     binarize()
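The dotted-path lookup above is what lets a config swap binarizers (e.g. `data_gen.tts.binarizer_zh.ZhBinarizer`) without code changes. The same pattern in isolation, demonstrated with a stdlib class so it runs anywhere:

# Resolve a dotted class path, as binarize() does with hparams['binarizer_cls'].
import importlib


def resolve_cls(dotted: str):
    pkg, cls_name = dotted.rsplit('.', 1)
    return getattr(importlib.import_module(pkg), cls_name)


print(resolve_cls('collections.OrderedDict'))  # <class 'collections.OrderedDict'>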
data_gen/tts/runs/preprocess.py ADDED
@@ -0,0 +1,17 @@
+ import utils.commons.single_thread_env  # NOQA
+ from utils.commons.hparams import hparams, set_hparams
+ import importlib
+
+
+ def preprocess():
+     assert hparams['preprocess_cls'] != ''
+
+     pkg = ".".join(hparams["preprocess_cls"].split(".")[:-1])
+     cls_name = hparams["preprocess_cls"].split(".")[-1]
+     process_cls = getattr(importlib.import_module(pkg), cls_name)
+     process_cls().process()
+
+
+ if __name__ == '__main__':
+     set_hparams()
+     preprocess()
data_gen/tts/runs/train_mfa_align.py ADDED
@@ -0,0 +1,46 @@
+ import utils.commons.single_thread_env  # NOQA
+ import glob
+ import subprocess
+ from textgrid import TextGrid
+ import os
+ from utils.commons.hparams import hparams, set_hparams
+
+
+ def train_mfa_align(mfa_outputs="mfa_outputs",
+                     mfa_inputs="mfa_inputs",
+                     model_name=None, pretrain_model_name=None,
+                     mfa_cmd='train'):
+     CORPUS = hparams['processed_data_dir'].split("/")[-1]
+     NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
+     env_vars = [f'CORPUS={CORPUS}', f'NUM_JOB={NUM_JOB}']
+     if mfa_outputs is not None:
+         env_vars.append(f'MFA_OUTPUTS={mfa_outputs}')
+     if mfa_inputs is not None:
+         env_vars.append(f'MFA_INPUTS={mfa_inputs}')
+     if model_name is not None:
+         env_vars.append(f'MODEL_NAME={model_name}')
+     if pretrain_model_name is not None:
+         env_vars.append(f'PRETRAIN_MODEL_NAME={pretrain_model_name}')
+     if mfa_cmd is not None:
+         env_vars.append(f'MFA_CMD={mfa_cmd}')
+     env_str = ' '.join(env_vars)
+     print(f"| Run MFA for {CORPUS}. Env vars: {env_str}")
+     subprocess.check_call(f'{env_str} bash mfa_usr/run_mfa_train_align.sh', shell=True)
+     mfa_offset = hparams['preprocess_args']['mfa_offset']
+     if mfa_offset > 0:
+         for tg_fn in glob.glob(f'{hparams["processed_data_dir"]}/{mfa_outputs}/*.TextGrid'):
+             tg = TextGrid.fromFile(tg_fn)
+             max_time = tg.maxTime
+             for tier in tg.tiers:
+                 for interval in tier.intervals:
+                     interval.maxTime = min(interval.maxTime + mfa_offset, max_time)
+                     interval.minTime = min(interval.minTime + mfa_offset, max_time)
+                 tier.intervals[0].minTime = 0
+                 tier.maxTime = min(tier.maxTime + mfa_offset, max_time)
+             tg.write(tg_fn)
+             TextGrid.fromFile(tg_fn)
+
+
+ if __name__ == '__main__':
+     set_hparams(print_hparams=False)
+     train_mfa_align()
data_gen/tts/txt_processors/__init__.py ADDED
@@ -0,0 +1 @@
+ from . import en
data_gen/tts/txt_processors/base_text_processor.py ADDED
@@ -0,0 +1,48 @@
+ from utils.text.text_encoder import is_sil_phoneme
+
+ REGISTERED_TEXT_PROCESSORS = {}
+
+
+ def register_txt_processors(name):
+     def _f(cls):
+         REGISTERED_TEXT_PROCESSORS[name] = cls
+         return cls
+
+     return _f
+
+
+ def get_txt_processor_cls(name):
+     return REGISTERED_TEXT_PROCESSORS.get(name, None)
+
+
+ class BaseTxtProcessor:
+     @staticmethod
+     def sp_phonemes():
+         return ['|']
+
+     @classmethod
+     def process(cls, txt, preprocess_args):
+         raise NotImplementedError
+
+     @classmethod
+     def postprocess(cls, txt_struct, preprocess_args):
+         # remove silence phonemes at the head and tail
+         while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
+             txt_struct = txt_struct[1:]
+         while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
+             txt_struct = txt_struct[:-1]
+         if preprocess_args['with_phsep']:
+             txt_struct = cls.add_bdr(txt_struct)
+         if preprocess_args['add_eos_bos']:
+             txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
+         return txt_struct
+
+     @classmethod
+     def add_bdr(cls, txt_struct):
+         # insert a word-boundary token '|' between adjacent non-silence words
+         txt_struct_ = []
+         for i, ts in enumerate(txt_struct):
+             txt_struct_.append(ts)
+             if i != len(txt_struct) - 1 and \
+                     not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
+                 txt_struct_.append(['|', ['|']])
+         return txt_struct_
data_gen/tts/txt_processors/en.py ADDED
@@ -0,0 +1,78 @@
+ import re
+ import unicodedata
+
+ from g2p_en import G2p
+ from g2p_en.expand import normalize_numbers
+ from nltk import pos_tag
+ from nltk.tokenize import TweetTokenizer
+
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
+ from utils.text.text_encoder import PUNCS, is_sil_phoneme
+
+
+ class EnG2p(G2p):
+     word_tokenize = TweetTokenizer().tokenize
+
+     def __call__(self, text):
+         # preprocessing
+         words = EnG2p.word_tokenize(text)
+         tokens = pos_tag(words)  # tuples of (word, tag)
+
+         # steps
+         prons = []
+         for word, pos in tokens:
+             if re.search("[a-z]", word) is None:
+                 pron = [word]
+             elif word in self.homograph2features:  # check homograph
+                 pron1, pron2, pos1 = self.homograph2features[word]
+                 if pos.startswith(pos1):
+                     pron = pron1
+                 else:
+                     pron = pron2
+             elif word in self.cmu:  # look up the CMU dict
+                 pron = self.cmu[word][0]
+             else:  # predict for OOV words
+                 pron = self.predict(word)
+
+             prons.extend(pron)
+             prons.extend([" "])
+
+         return prons[:-1]
+
+
+ @register_txt_processors('en')
+ class TxtProcessor(BaseTxtProcessor):
+     g2p = EnG2p()
+
+     @staticmethod
+     def preprocess_text(text):
+         text = normalize_numbers(text)
+         text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                        if unicodedata.category(char) != 'Mn')  # strip accents
+         text = text.lower()
+         text = re.sub("[\'\"()]+", "", text)
+         text = re.sub("[-]+", " ", text)
+         text = re.sub(f"[^ a-z{PUNCS}]", "", text)
+         text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # remove spaces around punctuation
+         text = re.sub(f"([{PUNCS}])+", r"\1", text)  # collapse repeated punctuation, e.g. "!!" -> "!"
+         text = text.replace("i.e.", "that is")
+         text = text.replace("etc.", "etc")
+         text = re.sub(f"([{PUNCS}])", r" \1 ", text)
+         text = re.sub(r"\s+", " ", text)
+         return text
+
+     @classmethod
+     def process(cls, txt, preprocess_args):
+         txt = cls.preprocess_text(txt).strip()
+         phs = cls.g2p(txt)
+         # group phonemes by word: txt_struct[i] = [word, [phonemes]]
+         txt_struct = [[w, []] for w in txt.split(" ")]
+         i_word = 0
+         for p in phs:
+             if p == ' ':
+                 i_word += 1
+             else:
+                 txt_struct[i_word][1].append(p)
+         txt_struct = cls.postprocess(txt_struct, preprocess_args)
+         return txt_struct, txt
data_gen/tts/wav_processors/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from . import base_processor
+ from . import common_processors
data_gen/tts/wav_processors/base_processor.py ADDED
@@ -0,0 +1,25 @@
+ REGISTERED_WAV_PROCESSORS = {}
+
+
+ def register_wav_processors(name):
+     def _f(cls):
+         REGISTERED_WAV_PROCESSORS[name] = cls
+         return cls
+
+     return _f
+
+
+ def get_wav_processor_cls(name):
+     return REGISTERED_WAV_PROCESSORS.get(name, None)
+
+
+ class BaseWavProcessor:
+     @property
+     def name(self):
+         raise NotImplementedError
+
+     def output_fn(self, input_fn):
+         # e.g. "a/b/item.wav" -> "a/b/item_<name>.wav"
+         return f'{input_fn[:-4]}_{self.name}.wav'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         raise NotImplementedError
data_gen/tts/wav_processors/common_processors.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import subprocess
+ import librosa
+ import numpy as np
+ from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
+ from utils.audio import trim_long_silences
+ from utils.audio.io import save_wav
+ from utils.audio.rnnoise import rnnoise
+ from utils.commons.hparams import hparams
+
+
+ @register_wav_processors(name='sox_to_wav')
+ class ConvertToWavProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'ToWav'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         if input_fn[-4:] == '.wav':
+             return input_fn, sr
+         else:
+             output_fn = self.output_fn(input_fn)
+             subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True)
+             return output_fn, sr
+
+
+ @register_wav_processors(name='sox_resample')
+ class ResampleProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'Resample'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         sr_file = librosa.core.get_samplerate(input_fn)
+         if sr != sr_file:
+             subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True)
+             # trim leading/trailing silence from the resampled audio
+             y, _ = librosa.core.load(output_fn, sr=sr)
+             y, _ = librosa.effects.trim(y)
+             save_wav(y, output_fn, sr)
+             return output_fn, sr
+         else:
+             return input_fn, sr
+
+
+ @register_wav_processors(name='trim_sil')
+ class TrimSILProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'TrimSIL'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         y, _ = librosa.core.load(input_fn, sr=sr)
+         y, _ = librosa.effects.trim(y)
+         save_wav(y, output_fn, sr)
+         return output_fn, sr
+
+
+ @register_wav_processors(name='trim_all_sil')
+ class TrimAllSILProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'TrimAllSIL'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         y, audio_mask, _ = trim_long_silences(
+             input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12))
+         save_wav(y, output_fn, sr)
+         if preprocess_args['save_sil_mask']:
+             os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
+             np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
+         return output_fn, sr
+
+
+ @register_wav_processors(name='denoise')
+ class DenoiseProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'Denoise'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         rnnoise(input_fn, output_fn, out_sample_rate=sr)
+         return output_fn, sr
docs/fastspeech2.md ADDED
@@ -0,0 +1,53 @@
+ # Run FastSpeech 2
+
+ ## Quick Start
+
+ ### Install Dependencies
+
+ Install dependencies following [readme.md](../readme.md).
+
+ ### Set Config Path and Experiment Name
+
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/fs2_orig.yaml
+ export MY_EXP_NAME=fs2_exp
+ ```
+
+ ### Preprocess and Binarize the Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ ### Prepare Vocoder
+
+ Prepare the vocoder following [prepare_vocoder.md](./prepare_vocoder.md).
+
+ ## Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ You can check the training and validation curves by opening TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/$MY_EXP_NAME
+ ```
+
+ ## Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ ## Citation
+
+ If you find this useful for your research, please cite the following:
+
+ ```
+ @inproceedings{ren2020fastspeech,
+   title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
+   author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
+   booktitle={International Conference on Learning Representations},
+   year={2020}
+ }
+ ```
docs/framework.md ADDED
@@ -0,0 +1,106 @@
+ # Framework of NATSpeech
+
+ NATSpeech is a simple framework for Non-Autoregressive Text-to-Speech.
+
+ ## Directory Structure
+
+ - `egs`: configuration files, which will be loaded by `utils/commons/hparams.py`
+ - `data_gen`: data binarization codes
+ - `modules`: modules and models
+ - `tasks`: the training and inference logics
+ - `utils`: commonly used utils
+ - `data`: data
+     - `raw`: raw data
+     - `processed`: data after preprocessing
+     - `binary`: binary data
+ - `checkpoints`: model checkpoints, TensorBoard logs and generated results for all experiments
+
+ ## How to Add New Tasks and Run?
+
+ We show the basic steps of adding a new task/model and running the code (with the LJSpeech dataset as an example).
+
+ ### Add the model
+
+ Add your model to `modules`.
+
+ ### Add the task
+
+ Task classes are used to manage the training and inference procedures.
+
+ A new task (e.g., `tasks.tts.fs.FastSpeechTask`) should inherit the base task class (`tasks.tts.speech_base.TTSBaseTask`).
+
+ You must implement these methods:
+
+ - `build_tts_model`, which builds the model for your task;
+ - `run_model`, indicating how to use the model in training and inference.
+
+ You can override `test_step` and `save_valid_result` to change the validation/testing logics or add more plots to TensorBoard.
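+
+ As a minimal sketch (the `MyModel` module, the keys in `sample`, and the loss computation below are illustrative assumptions, not the framework's fixed API):
+
+ ```python
+ from tasks.tts.speech_base import TTSBaseTask
+ from utils.commons.hparams import hparams
+ from modules.my_model import MyModel  # hypothetical model placed under `modules`
+
+
+ class MyTask(TTSBaseTask):
+     def build_tts_model(self):
+         # build the model from the global config
+         self.model = MyModel(hidden_size=hparams['hidden_size'])
+
+     def run_model(self, sample, infer=False):
+         # `sample` is a batch from the binarized dataset
+         output = self.model(sample['txt_tokens'], infer=infer)
+         losses = {}
+         if not infer:
+             # e.g., an L1 loss between predicted and ground-truth mels
+             losses['mel'] = (output['mel_out'] - sample['mels']).abs().mean()
+         return losses, output
+ ```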
+
+ ### Add a new config file
+
+ Add a new config file in `egs/datasets/audio/lj/YOUR_TASK.yaml`. For example:
+
+ ```yaml
+ base_config: ./base_text2mel.yaml
+ task_cls: tasks.tts.fs.FastSpeechTask
+
+ # model configs
+ hidden_size: 256
+ dropout: 0.1
+
+ # some more configs .....
+ ```
+
+ If you use a new dataset `YOUR_DATASET`, you should also add a `YOUR_DATASET_Processor`
+ in `egs/datasets/audio/YOUR_DATASET/preprocess.py`, inheriting `data_gen.tts.base_preprocess.BasePreprocessor`, which
+ loads some meta information of the dataset.
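+
+ As a sketch, assuming a hypothetical `metadata.csv` with `name|text` lines (the LJSpeech processor `egs/datasets/audio/lj/preprocess.py` follows the same pattern, with a three-field metadata file):
+
+ ```python
+ from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+ class YourDatasetPreprocess(BasePreprocessor):
+     def meta_data(self):
+         # yield one dict per utterance: unique name, wav path and raw transcript
+         for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+             item_name, txt = l.strip().split("|")
+             wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
+ ```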
+
+ ### Preprocess and binarize the dataset
+
+ ```bash
+ python data_gen/tts/runs/align_and_binarize.py --config egs/datasets/audio/lj/base_text2mel.yaml
+ ```
+
+ ### Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config YOUR_CONFIG --exp_name YOUR_EXP_NAME --reset
+ ```
+
+ You can open TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/EXP_NAME
+ ```
+
+ ### Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/datasets/audio/lj/YOUR_TASK.yaml --exp_name YOUR_EXP_NAME --reset --infer
+ ```
+
+ ## Design Philosophy
+
+ ### Random-Access Binarized Dataset
+
+ To address the IO problem when reading many small files, we design an `IndexedDataset` class (_utils/commons/indexed_datasets.py_).
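+
+ A sketch of the intended write/read flow (the method names below follow a common builder/reader pattern and are assumptions; check _utils/commons/indexed_datasets.py_ for the real API):
+
+ ```python
+ from utils.commons.indexed_datasets import IndexedDataset, IndexedDatasetBuilder
+
+ # write: pack many small items into one binary file plus an index
+ builder = IndexedDatasetBuilder('data/binary/ljspeech/train')
+ for item in [{'item_name': 'LJ001-0001', 'txt': 'hello'}]:  # toy items
+     builder.add_item(item)
+ builder.finalize()
+
+ # read: random access by integer index, without opening thousands of files
+ dataset = IndexedDataset('data/binary/ljspeech/train')
+ print(dataset[0]['txt'])
+ ```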
+
+ ### Global Config
+
+ We introduce a global config `hparams`, which is loaded from a `.yaml` config file and can be used anywhere. However,
+ we do not recommend using it in general-purpose modules.
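+
+ For example, mirroring how the run scripts in `data_gen/tts/runs` use it:
+
+ ```python
+ from utils.commons.hparams import set_hparams, hparams
+
+ # parse --config/--exp_name etc. and load the merged yaml into the global dict
+ set_hparams()
+ hidden_size = hparams['hidden_size']
+ binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
+ ```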
+
+ ### BaseTrainer Framework
+
+ Our [base trainer](utils/commons/trainer.py) and [base task](utils/commons/base_task.py) classes refer to
+ [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and provide a commonly used
+ training/inference code structure. Our framework supports multi-process GPU training without changing the subclass
+ code.
+
+ ### Checkpoint Saving
+
+ All checkpoints and TensorBoard logs are saved in `checkpoints/EXP_NAME`, where `EXP_NAME` is set in the running
+ command: `python tasks/run.py .... --exp_name EXP_NAME`. You can use `tensorboard --logdir checkpoints/EXP_NAME` to open
+ TensorBoard and check the training loss curves, etc.
docs/portaspeech.md ADDED
@@ -0,0 +1,61 @@
+ # Run PortaSpeech
+
+ ## Quick Start
+
+ ### Install Dependencies
+
+ Install dependencies following [readme.md](../readme.md).
+
+ ### Set Config Path and Experiment Name
+
+ #### PortaSpeech (normal)
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/ps_flow_nips2021.yaml
+ export MY_EXP_NAME=ps_normal_exp
+ ```
+
+ #### PortaSpeech (small)
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/ps_flow_small_nips2021.yaml
+ export MY_EXP_NAME=ps_small_exp
+ ```
+
+ ### Preprocess and Binarize the Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ ### Prepare Vocoder
+
+ Prepare the vocoder following [prepare_vocoder.md](./prepare_vocoder.md).
+
+ ## Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ You can check the training and validation curves by opening TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/$MY_EXP_NAME
+ ```
+
+ ## Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ ## Citation
+
+ If you find this useful for your research, please cite the following:
+
+ ```
+ @article{ren2021portaspeech,
+   title={PortaSpeech: Portable and High-Quality Generative Text-to-Speech},
+   author={Ren, Yi and Liu, Jinglin and Zhao, Zhou},
+   journal={Advances in Neural Information Processing Systems},
+   volume={34},
+   year={2021}
+ }
+ ```
docs/prepare_data.md ADDED
@@ -0,0 +1,25 @@
+ # Prepare Dataset
+
+ ## LJSpeech
+
+ ### Download Dataset
+ ```bash
+ mkdir -p data/raw/ljspeech
+ cd data/raw
+ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+ bzip2 -d LJSpeech-1.1.tar.bz2
+ tar -xvf LJSpeech-1.1.tar
+ cd ../../
+ ```
+
+ ### Forced Align and Preprocess Dataset
+ ```bash
+ # Preprocess step: normalize the text and unify the file structure.
+ python data_gen/tts/runs/preprocess.py --config $CONFIG_NAME
+ # Align step: MFA alignment.
+ python data_gen/tts/runs/train_mfa_align.py --config $CONFIG_NAME
+ # Binarization step: binarize data for fast IO. If you have already preprocessed and aligned the dataset, you only need to rerun this step when switching to a different task.
+ python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
+ ```
+
+ ## More datasets will be supported soon...
docs/prepare_vocoder.md ADDED
@@ -0,0 +1,49 @@
+ # Prepare Vocoder
+
+ We use [HiFi-GAN](https://github.com/jik876/hifi-gan) as the default vocoder.
+
+ ## LJSpeech
+
+ ### Use Pretrained Model
+
+ ```bash
+ wget https://github.com/xx/xx/releases/download/pretrain-model/hifi_lj.zip
+ unzip hifi_lj.zip
+ mv hifi_lj checkpoints/hifi_lj
+ ```
+
+ ### Train Your Vocoder
+
+ #### Set Config Path and Experiment Name
+
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/hifigan.yaml
+ export MY_EXP_NAME=my_hifigan_exp
+ ```
+
+ #### Prepare Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ If you have run the `prepare_data` step for an acoustic
+ model (e.g., FastSpeech 2 or PortaSpeech), you only need to binarize the dataset for the vocoder training:
+
+ ```bash
+ python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
+ ```
+
+ #### Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ #### Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ #### Use the trained vocoder
+
+ Modify `vocoder_ckpt` in the config files of the acoustic models (e.g., `egs/datasets/audio/lj/base_text2mel.yaml`) to point to your experiment, e.g., `vocoder_ckpt: checkpoints/my_hifigan_exp`.
egs/datasets/audio/lj/base_mel2wav.yaml ADDED
@@ -0,0 +1,4 @@
+ base_config: egs/egs_bases/tts/vocoder/base.yaml
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech_wav'
egs/datasets/audio/lj/base_text2mel.yaml ADDED
@@ -0,0 +1,16 @@
+ base_config: egs/egs_bases/tts/base.yaml
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech'
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ binarization_args:
+   train_range: [ 871, -1 ]
+   test_range: [ 0, 523 ]
+   valid_range: [ 523, 871 ]
+ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+             68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ f0_min: 80
+ f0_max: 600
+ vocoder_ckpt: checkpoints/hifi_lj
egs/datasets/audio/lj/fs.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/fs.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/fs2_orig.yaml ADDED
@@ -0,0 +1,4 @@
+ base_config:
+   - egs/egs_bases/tts/fs2_orig.yaml
+   - ./base_text2mel.yaml
+ binary_data_dir: 'data/binary/ljspeech_cwt'
egs/datasets/audio/lj/hifigan.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/vocoder/hifigan.yaml
+   - ./base_mel2wav.yaml
egs/datasets/audio/lj/preprocess.py ADDED
@@ -0,0 +1,9 @@
+ from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+ class LJPreprocess(BasePreprocessor):
+     def meta_data(self):
+         for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+             item_name, _, txt = l.strip().split("|")
+             wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
egs/datasets/audio/lj/ps_flow.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_nips2021.yaml ADDED
@@ -0,0 +1,11 @@
+ base_config:
+   - ./ps_flow.yaml
+ max_sentences: 64
+ dur_level: word
+ use_word_encoder: false
+ enc_prenet: true
+ enc_pre_ln: false
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ text_encoder_postnet: false
+ warmup_updates: 8000
egs/datasets/audio/lj/ps_flow_small.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow_small.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_small_nips2021.yaml ADDED
@@ -0,0 +1,11 @@
+ base_config:
+   - ./ps_flow_small.yaml
+ max_sentences: 128
+ dur_level: word
+ use_word_encoder: false
+ enc_prenet: true
+ enc_pre_ln: false
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ text_encoder_postnet: false
+ warmup_updates: 8000
egs/egs_bases/config_base.yaml ADDED
@@ -0,0 +1,41 @@
+ # task
+ binary_data_dir: ''
+ work_dir: '' # experiment directory.
+ infer: false # infer
+ amp: false
+ seed: 1234
+ debug: false
+ save_codes: ['tasks', 'modules', 'egs']
+
+ #############
+ # dataset
+ #############
+ ds_workers: 1
+ test_num: 100
+ endless_ds: true
+ sort_by_len: true
+
+ #########
+ # train and eval
+ #########
+ print_nan_grads: false
+ load_ckpt: ''
+ save_best: false
+ num_ckpt_keep: 3
+ clip_grad_norm: 0
+ accumulate_grad_batches: 1
+ tb_log_interval: 100
+ num_sanity_val_steps: 5 # steps of validation at the beginning
+ check_val_every_n_epoch: 10
+ val_check_interval: 2000
+ valid_monitor_key: 'val_loss'
+ valid_monitor_mode: 'min'
+ max_epochs: 1000
+ max_updates: 1000000
+ max_tokens: 40000
+ max_sentences: 100000
+ max_valid_tokens: -1
+ max_valid_sentences: -1
+ eval_max_batches: -1
+ resume_from_checkpoint: 0
+ rename_tmux: true
egs/egs_bases/tts/base.yaml ADDED
@@ -0,0 +1,56 @@
+ # task
+ base_config:
+   - ../config_base.yaml
+   - ./dataset_params.yaml
+
+ #############
+ # dataset in training
+ #############
+ endless_ds: true
+ min_frames: 0
+ max_frames: 1548
+ frames_multiple: 1
+ max_input_tokens: 1550
+ ds_workers: 1
+
+ #########
+ # model
+ #########
+ use_spk_id: false
+ use_spk_embed: false
+ mel_losses: "ssim:0.5|l1:0.5"
+
+ ###########
+ # optimization
+ ###########
+ lr: 0.0005
+ scheduler: warmup # rsqrt|warmup|none
+ warmup_updates: 4000
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ weight_decay: 0
+ clip_grad_norm: 1
+ clip_grad_value: 0
+
+ ###########
+ # train and eval
+ ###########
+ use_word_input: false
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ valid_infer_interval: 10000
+ train_set_name: 'train'
+ train_sets: ''
+ valid_set_name: 'valid'
+ test_set_name: 'test'
+ num_valid_plots: 10
+ test_ids: [ ]
+ test_input_yaml: ''
+ vocoder: HifiGAN
+ vocoder_ckpt: ''
+ profile_infer: false
+ out_wav_norm: false
+ save_gt: true
+ save_f0: false
+ gen_dir_name: ''
egs/egs_bases/tts/dataset_params.yaml ADDED
@@ -0,0 +1,52 @@
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ hop_size: 256 # ~11.6 ms at 22050Hz (hop_size / sample_rate)
+ win_size: 1024 # ~46.4 ms at 22050Hz (if None, win_size = fft_size)
+ fft_size: 1024 # extra window size is filled with 0 paddings to match this parameter
+ fmin: 80 # set this to 55 if your speaker is male! if female, 95 should help taking off noise. (to tune depending on dataset. pitch info: male~[65, 260], female~[100, 525])
+ fmax: 7600 # to be increased/reduced depending on data.
+ f0_min: 80
+ f0_max: 800
+ griffin_lim_iters: 30
+ pitch_extractor: parselmouth
+ num_spk: 1
+ mel_vmin: -6
+ mel_vmax: 1.5
+ loud_norm: false
+
+ raw_data_dir: ''
+ processed_data_dir: ''
+ binary_data_dir: ''
+ preprocess_cls: ''
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ preprocess_args:
+   nsample_per_mfa_group: 1000
+   # text process
+   txt_processor: en
+   use_mfa: true
+   with_phsep: true
+   reset_phone_dict: true
+   reset_word_dict: true
+   add_eos_bos: true
+   # mfa
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   # wav processors
+   wav_processors: [ ]
+   save_sil_mask: true
+   vad_max_silence_length: 12
+ binarization_args:
+   shuffle: false
+   with_wav: false
+   with_align: true
+   with_spk_embed: false
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   trim_eos_bos: false
+   min_sil_duration: 0.1
+   train_range: [ 200, -1 ]
+   test_range: [ 0, 100 ]
+   valid_range: [ 100, 200 ]
+ word_dict_size: 10000
+ pitch_key: pitch
egs/egs_bases/tts/fs.yaml ADDED
@@ -0,0 +1,75 @@
+ base_config: ./base.yaml
+ task_cls: tasks.tts.fs.FastSpeechTask
+
+ # model
+ hidden_size: 256
+ dropout: 0.0
+ encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer
+ decoder_type: conv # fft|rnn|conv|conformer|wn
+
+ # rnn enc/dec
+ encoder_K: 8
+ decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
+
+ # fft enc/dec
+ enc_layers: 4
+ enc_ffn_kernel_size: 9
+ enc_prenet: true
+ enc_pre_ln: true
+ dec_layers: 4
+ dec_ffn_kernel_size: 9
+ num_heads: 2
+ ffn_act: gelu
+ ffn_hidden_size: 1024
+ use_pos_embed: true
+
+ # conv enc/dec
+ enc_dec_norm: ln
+ conv_use_pos: false
+ layers_in_block: 2
+ enc_dilations: [ 1, 1, 1, 1 ]
+ enc_kernel_size: 5
+ enc_post_net_kernel: 3
+ dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
+ dec_kernel_size: 5
+ dec_post_net_kernel: 3
+
+ # duration
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ dur_predictor_kernel: 3
+ dur_predictor_layers: 2
+ predictor_dropout: 0.5
+
+ # pitch and energy
+ use_pitch_embed: false
+ pitch_type: frame # frame|ph|cwt
+ use_uv: true
+
+ # reference encoder and speaker embedding
+ lambda_commit: 0.25
+ ref_norm_layer: bn
+ dec_inp_add_noise: false
+
+ # mel
+ mel_losses: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+
+ # loss lambda
+ lambda_f0: 1.0
+ lambda_uv: 1.0
+ lambda_energy: 0.1
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 1.0
+ lambda_word_dur: 1.0
+ predictor_grad: 0.1
+
+ # train and eval
+ warmup_updates: 4000
+ max_tokens: 40000
+ max_sentences: 128
+ max_valid_sentences: 1
+ max_updates: 160000
+ use_gt_dur: false
+ use_gt_f0: false
+ ds_workers: 2