Spaces:

Rongjiehuang
/

GenerSpeech

Build error

App Files Files Community

Rongjiehuang committed on Nov 2, 2022

Commit

222619b

1 Parent(s): af80ff8

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
.gitignore +151 -0
LICENSE +21 -0
assets/0011_001570.TextGrid +156 -0
assets/0011_001570.lab +1 -0
assets/0011_001570.txt +1 -0
assets/0011_001570.wav +0 -0
checkpoints/Emotion_encoder.pt +3 -0
checkpoints/GenerSpeech/config.yaml +249 -0
checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt +3 -0
checkpoints/trainset_hifigan/config.yaml +178 -0
checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt +3 -0
data/binary/training_set/mfa_dict.txt +0 -0
data/binary/training_set/mfa_model.zip +3 -0
data/binary/training_set/phone_set.json +1 -0
data/binary/training_set/train_f0s_mean_std.npy +3 -0
data/binary/training_set/word_set.json +1 -0
data_gen/tts/base_binarizer.py +224 -0
data_gen/tts/base_binarizer_emotion.py +352 -0
data_gen/tts/base_preprocess.py +250 -0
data_gen/tts/bin/binarize.py +20 -0
data_gen/tts/bin/pre_align.py +20 -0
data_gen/tts/bin/train_mfa_align.py +15 -0
data_gen/tts/data_gen_utils.py +356 -0
data_gen/tts/emotion/audio.py +107 -0
data_gen/tts/emotion/inference.py +177 -0
data_gen/tts/emotion/model.py +78 -0
data_gen/tts/emotion/params_data.py +29 -0
data_gen/tts/emotion/params_model.py +11 -0
data_gen/tts/emotion/test_emotion.py +184 -0
data_gen/tts/txt_processors/__init__.py +1 -0
data_gen/tts/txt_processors/base_text_processor.py +47 -0
data_gen/tts/txt_processors/en.py +77 -0
data_gen/tts/wav_processors/__init__.py +2 -0
data_gen/tts/wav_processors/base_processor.py +25 -0
data_gen/tts/wav_processors/common_processors.py +86 -0
egs/datasets/audio/emotion/base_text2mel.yaml +17 -0
egs/datasets/audio/emotion/pre_align.py +25 -0
egs/datasets/audio/libritts/base_text2mel.yaml +14 -0
egs/datasets/audio/libritts/fs2.yaml +3 -0
egs/datasets/audio/libritts/pre_align.py +21 -0
egs/datasets/audio/libritts/pwg.yaml +8 -0
egs/datasets/audio/lj/base_mel2wav.yaml +5 -0
egs/datasets/audio/lj/pre_align.py +13 -0
egs/datasets/audio/lj/pwg.yaml +3 -0
egs/datasets/audio/vctk/base_mel2wav.yaml +3 -0
egs/datasets/audio/vctk/fs2.yaml +12 -0
egs/datasets/audio/vctk/pre_align.py +22 -0
egs/datasets/audio/vctk/pwg.yaml +6 -0
egs/egs_bases/config_base.yaml +46 -0

.gitattributes CHANGED Viewed

@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,151 @@

+### Project ignore
+/ParallelWaveGAN
+/wavegan_pretrained*
+/pretrained_models
+rsync
+.idea
+.DS_Store
+bak
+tmp
+*.tar.gz
+# mfa and kaldi
+kaldi_align/exp
+mfa
+montreal-forced-aligner
+mos
+nbs
+/configs_usr/*
+!/configs_usr/.gitkeep
+/fast_transformers
+/rnnoise
+/usr/*
+!/usr/.gitkeep
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+将删除 datasets/remi/test/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2021 Jinglin Liu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

assets/0011_001570.TextGrid ADDED Viewed

	@@ -0,0 +1,156 @@

+File type = "ooTextFile"
+Object class = "TextGrid"
+xmin = 0.0
+xmax = 2.266
+tiers? <exists>
+size = 2
+item []:
+	item [1]:
+		class = "IntervalTier"
+		name = "words"
+		xmin = 0.0
+		xmax = 2.266
+		intervals: size = 10
+			intervals [1]:
+				xmin = 0.000
+				xmax = 0.290
+				text = "sil"
+			intervals [2]:
+				xmin = 0.290
+				xmax = 0.320
+				text = ""
+			intervals [3]:
+				xmin = 0.320
+				xmax = 0.470
+				text = "b_ah1_t"
+			intervals [4]:
+				xmin = 0.470
+				xmax = 0.760
+				text = "ih1_f"
+			intervals [5]:
+				xmin = 0.760
+				xmax = 0.830
+				text = "y_uw1"
+			intervals [6]:
+				xmin = 0.830
+				xmax = 1.370
+				text = "hh_ae1_d_ah0_n_t"
+			intervals [7]:
+				xmin = 1.370
+				xmax = 1.600
+				text = "d_ah1_n"
+			intervals [8]:
+				xmin = 1.600
+				xmax = 1.900
+				text = "dh_eh1_m"
+			intervals [9]:
+				xmin = 1.900
+				xmax = 1.930
+				text = "sil"
+			intervals [10]:
+				xmin = 1.930
+				xmax = 2.266
+				text = ""
+	item [2]:
+		class = "IntervalTier"
+		name = "phones"
+		xmin = 0.0
+		xmax = 2.266
+		intervals: size = 24
+			intervals [1]:
+				xmin = 0.000
+				xmax = 0.290
+				text = "SIL"
+			intervals [2]:
+				xmin = 0.290
+				xmax = 0.320
+				text = "sp"
+			intervals [3]:
+				xmin = 0.320
+				xmax = 0.400
+				text = "B"
+			intervals [4]:
+				xmin = 0.400
+				xmax = 0.440
+				text = "AH1"
+			intervals [5]:
+				xmin = 0.440
+				xmax = 0.470
+				text = "T"
+			intervals [6]:
+				xmin = 0.470
+				xmax = 0.530
+				text = "IH1"
+			intervals [7]:
+				xmin = 0.530
+				xmax = 0.760
+				text = "F"
+			intervals [8]:
+				xmin = 0.760
+				xmax = 0.800
+				text = "Y"
+			intervals [9]:
+				xmin = 0.800
+				xmax = 0.830
+				text = "UW1"
+			intervals [10]:
+				xmin = 0.830
+				xmax = 0.980
+				text = "HH"
+			intervals [11]:
+				xmin = 0.980
+				xmax = 1.180
+				text = "AE1"
+			intervals [12]:
+				xmin = 1.180
+				xmax = 1.220
+				text = "D"
+			intervals [13]:
+				xmin = 1.220
+				xmax = 1.250
+				text = "AH0"
+			intervals [14]:
+				xmin = 1.250
+				xmax = 1.340
+				text = "N"
+			intervals [15]:
+				xmin = 1.340
+				xmax = 1.370
+				text = "T"
+			intervals [16]:
+				xmin = 1.370
+				xmax = 1.410
+				text = "D"
+			intervals [17]:
+				xmin = 1.410
+				xmax = 1.550
+				text = "AH1"
+			intervals [18]:
+				xmin = 1.550
+				xmax = 1.600
+				text = "N"
+			intervals [19]:
+				xmin = 1.600
+				xmax = 1.650
+				text = "DH"
+			intervals [20]:
+				xmin = 1.650
+				xmax = 1.800
+				text = "EH1"
+			intervals [21]:
+				xmin = 1.800
+				xmax = 1.900
+				text = "M"
+			intervals [22]:
+				xmin = 1.900
+				xmax = 1.930
+				text = "SIL"
+			intervals [23]:
+				xmin = 1.930
+				xmax = 2.250
+				text = "sp"
+			intervals [24]:
+				xmin = 2.250
+				xmax = 2.266
+				text = ""

assets/0011_001570.lab ADDED Viewed

	@@ -0,0 +1 @@


1	+ SIL B_AH1_T IH1_F Y_UW1 HH_AE1_D_AH0_N_T D_AH1_N DH_EH1_M SIL

assets/0011_001570.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ But if you hadn't done them.

assets/0011_001570.wav ADDED Viewed

Binary file (72.6 kB). View file

checkpoints/Emotion_encoder.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9de4930cbd8e5ba51efdef84c326e3728a5482dd7668f82960e4cb0f97cc8e5
+size 17095350

checkpoints/GenerSpeech/config.yaml ADDED Viewed

	@@ -0,0 +1,249 @@

+accumulate_grad_batches: 1
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 16000
+base_config:
+- egs/egs_bases/tts/fs2_adv.yaml
+- egs/datasets/audio/emotion/base_text2mel.yaml
+binarization_args:
+  reset_phone_dict: true
+  reset_word_dict: true
+  shuffle: true
+  trim_eos_bos: false
+  trim_sil: false
+  with_align: true
+  with_f0: true
+  with_f0cwt: false
+  with_linear: false
+  with_spk_embed: true
+  with_spk_id: true
+  with_txt: true
+  with_wav: true
+  with_word: true
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+binary_data_dir: data/binary/training_set
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+conv_use_pos: false
+crop: false
+cwt_add_f0_loss: false
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_std_scale: 0.8
+debug: false
+dec_dilations:
+- 1
+- 1
+- 1
+- 1
+dec_ffn_kernel_size: 9
+dec_inp_add_noise: false
+dec_kernel_size: 5
+dec_layers: 4
+dec_num_heads: 2
+decoder_rnn_dim: 0
+decoder_type: fft
+dict_dir: ''
+disc_hidden_size: 128
+disc_interval: 1
+disc_lr: 0.0001
+disc_norm: in
+disc_reduction: stack
+disc_start_steps: 0
+disc_win_num: 3
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 60000
+dropout: 0.05
+ds_workers: 2
+dur_enc_hidden_stride_kernel:
+- 0,2,3
+- 0,2,3
+- 0,1,3
+dur_loss: mse
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+emotion_encoder_path: /home1/huangrongjie/Project/Emotion_encoder/1121_emotion_encoder.pt
+enc_dec_norm: ln
+enc_dilations:
+- 1
+- 1
+- 1
+- 1
+enc_ffn_kernel_size: 9
+enc_kernel_size: 5
+enc_layers: 4
+encoder_K: 8
+encoder_type: fft
+endless_ds: true
+ffn_act: gelu
+ffn_hidden_size: 1024
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+forcing: 20000
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 5.0
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+lambda_commit: 0.25
+lambda_energy: 0.1
+lambda_f0: 1.0
+lambda_mel_adv: 0.1
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_uv: 1.0
+lambda_word_dur: 1.0
+layers_in_block: 2
+load_ckpt: ''
+loud_norm: false
+lr: 1.0
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_sentences: 100000
+max_tokens: 30000
+max_updates: 300000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_disc_hidden_size: 128
+mel_gan: true
+mel_hidden_size: 256
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 128
+min_level_db: -100
+nVQ: 128
+noise_scale: 0.8
+num_ckpt_keep: 2
+num_heads: 2
+num_sanity_val_steps: -1
+num_spk: 500
+num_test_samples: 72
+num_valid_plots: 10
+optimizer_adam_beta1: 0.5
+optimizer_adam_beta2: 0.999
+out_wav_norm: false
+pitch_ar: false
+pitch_embed_type: 0
+pitch_enc_hidden_stride_kernel:
+- 0,2,5
+- 0,2,5
+- 0,2,5
+pitch_extractor: parselmouth
+pitch_loss: l1
+pitch_norm: standard
+pitch_ssim_win: 11
+pitch_type: frame
+post_glow_hidden: 128
+post_glow_kernel_size: 3
+post_glow_n_block_layers: 3
+post_glow_n_blocks: 8
+post_share_cond_layers: false
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  sox_resample: false
+  sox_to_wav: false
+  trim_sil: false
+  txt_processor: en
+  use_tone: true
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+predictor_dropout: 0.5
+predictor_grad: 1.0
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+preprocess_args:
+  add_eos_bos: true
+  mfa_group_shuffle: false
+  mfa_offset: 0.02
+  nsample_per_mfa_group: 1000
+  reset_phone_dict: true
+  reset_word_dict: true
+  save_sil_mask: true
+  txt_processor: en
+  use_mfa: true
+  vad_max_silence_length: 12
+  wav_processors: []
+  with_phsep: true
+preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
+pretrain_fs_ckpt: ''
+print_nan_grads: false
+processed_data_dir: data/processed/emotion
+profile_infer: false
+raw_data_dir: data/raw/ESD
+ref_audio: ''
+ref_hidden_stride_kernel:
+- 0,3,5
+- 0,3,5
+- 0,2,5
+- 0,2,5
+- 0,2,5
+ref_level_db: 20
+ref_norm_layer: bn
+rename_tmux: true
+rerun_gen: false
+resume_from_checkpoint: 0
+save_best: false
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+share_wn_layers: 4
+sigmoid_scale: false
+sil_add_noise: false
+sort_by_len: true
+task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 200
+test_set_name: test
+text: ''
+train_set_name: train
+train_sets: ''
+use_cond_disc: false
+use_emotion: true
+use_energy_embed: false
+use_gt_dur: false
+use_gt_f0: false
+use_latent_cond: true
+use_pitch_embed: true
+use_pos_embed: true
+use_ref_enc: false
+use_spk_embed: true
+use_spk_id: false
+use_split_spk_id: false
+use_txt_cond: true
+use_uv: true
+use_var_enc: false
+use_word: true
+vae_dropout: 0.0
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+var_enc_vq_codes: 64
+vocoder: hifigan
+vocoder_ckpt: checkpoints/trainset_hifigan
+vocoder_denoise_c: 0.0
+vq_start: 20500
+warmup_updates: 2000
+weight_decay: 0
+win_size: 1024
+word_size: 30000
+work_dir: checkpoints/GenerSpeech_release4

checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b872bb686013cee2a98cc610b8b66b788c46ff4c33130682b63af4ac005405ea
+size 619582860

checkpoints/trainset_hifigan/config.yaml ADDED Viewed

	@@ -0,0 +1,178 @@

+accumulate_grad_batches: 1
+adam_b1: 0.8
+adam_b2: 0.99
+amp: false
+audio_num_mel_bins: 80
+audio_sample_rate: 16000
+aux_context_window: 0
+base_config:
+- egs/egs_bases/tts/vocoder/hifigan.yaml
+- egs/datasets/audio/emotion/base_text2mel.yaml
+binarization_args:
+  reset_phone_dict: true
+  reset_word_dict: true
+  shuffle: true
+  trim_eos_bos: false
+  trim_sil: false
+  with_align: false
+  with_f0: true
+  with_f0cwt: false
+  with_linear: false
+  with_spk_embed: false
+  with_spk_id: true
+  with_txt: false
+  with_wav: true
+  with_word: false
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+binary_data_dir: data/binary/training_set
+check_val_every_n_epoch: 10
+clip_grad_norm: 1
+clip_grad_value: 0
+debug: false
+dec_ffn_kernel_size: 9
+dec_layers: 4
+dict_dir: ''
+disc_start_steps: 40000
+discriminator_grad_norm: 1
+discriminator_optimizer_params:
+  lr: 0.0002
+discriminator_scheduler_params:
+  gamma: 0.999
+  step_size: 600
+dropout: 0.1
+ds_workers: 1
+enc_ffn_kernel_size: 9
+enc_layers: 4
+endless_ds: true
+ffn_act: gelu
+ffn_padding: SAME
+fft_size: 1024
+fmax: 7600
+fmin: 80
+frames_multiple: 1
+gen_dir_name: ''
+generator_grad_norm: 10
+generator_optimizer_params:
+  lr: 0.0002
+generator_scheduler_params:
+  gamma: 0.999
+  step_size: 600
+griffin_lim_iters: 60
+hidden_size: 256
+hop_size: 256
+infer: false
+lambda_adv: 1.0
+lambda_cdisc: 4.0
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+load_ckpt: ''
+loud_norm: false
+lr: 2.0
+max_epochs: 1000
+max_frames: 1548
+max_input_tokens: 1550
+max_samples: 8192
+max_sentences: 24
+max_tokens: 30000
+max_updates: 1000000
+max_valid_sentences: 1
+max_valid_tokens: 60000
+mel_loss: ssim:0.5|l1:0.5
+mel_vmax: 1.5
+mel_vmin: -6
+min_frames: 128
+min_level_db: -100
+num_ckpt_keep: 3
+num_heads: 2
+num_mels: 80
+num_sanity_val_steps: -1
+num_spk: 10
+num_test_samples: 30
+num_valid_plots: 10
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+out_wav_norm: false
+pitch_extractor: parselmouth
+pitch_type: frame
+pre_align_args:
+  allow_no_txt: false
+  denoise: false
+  sox_resample: false
+  sox_to_wav: false
+  trim_sil: false
+  txt_processor: en
+  use_tone: true
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+print_nan_grads: false
+processed_data_dir: data/processed/emotion,data/processed/LibriTTS
+profile_infer: false
+raw_data_dir: data/raw/ESD
+ref_level_db: 20
+rename_tmux: true
+resblock: '1'
+resblock_dilation_sizes:
+- - 1
+  - 3
+  - 5
+- - 1
+  - 3
+  - 5
+- - 1
+  - 3
+  - 5
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resume_from_checkpoint: 0
+save_best: true
+save_codes: []
+save_f0: false
+save_gt: true
+scheduler: rsqrt
+seed: 1234
+sort_by_len: true
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+tb_log_interval: 100
+test_ids: []
+test_input_dir: ''
+test_num: 200
+test_set_name: test
+train_set_name: train
+train_sets: ''
+upsample_initial_channel: 512
+upsample_kernel_sizes:
+- 16
+- 16
+- 4
+- 4
+upsample_rates:
+- 8
+- 8
+- 2
+- 2
+use_cdisc: false
+use_cond_disc: false
+use_emotion: true
+use_fm_loss: false
+use_ms_stft: false
+use_pitch_embed: false
+use_spec_disc: false
+use_spk_embed: false
+use_spk_id: true
+use_split_spk_id: false
+val_check_interval: 2000
+valid_infer_interval: 10000
+valid_monitor_key: val_loss
+valid_monitor_mode: min
+valid_set_name: valid
+vocoder: pwg
+vocoder_ckpt: ''
+vocoder_denoise_c: 0.0
+warmup_updates: 8000
+weight_decay: 0
+win_length: null
+win_size: 1024
+window: hann
+word_size: 30000
+work_dir: checkpoints/trainset_hifigan

checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a2577919899400a111ef42a2aba65797d282c259d083d2c276539dda9d17870
+size 1016199247

data/binary/training_set/mfa_dict.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

data/binary/training_set/mfa_model.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71dc26b9aba3529892eebc21088db2b8eee41c89d87085c24148cf96b029a62c
+size 23850075

data/binary/training_set/phone_set.json ADDED Viewed

	@@ -0,0 +1 @@

+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]

data/binary/training_set/train_f0s_mean_std.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8489ff2f4fd60c6a445b35f0a5a6629923880abebe11ff6ead6c2ebd4bfe28f5
+size 144

data/binary/training_set/word_set.json ADDED Viewed

	@@ -0,0 +1 @@

+ [".", "the", ",", "a", "and", "of", "her", "these", "with", "is", "its", "we", "to", "at", "things", "for", "rainbow", "into", "as", "end", "will", "she", "can", "please", "call", "stella", "take", "shape", "long", "round", "arch", "path", "high", "above", "two", "ends", "apparently", "beyond", "horizon", "six", "spoons", "fresh", "snow", "peas", "five", "thick", "slabs", "blue", "cheese", "maybe", "snack", "brother", "bob", "i", "there", "according", "legend", "boiling", "pot", "gold", "one", "when", "sunlight", "strikes", "raindrops", "in", "air", "they", "act", "prism", "form", "ask", "bring", "from", "store", "scoop", "three", "red", "bags", "go", "meet", "wednesday", "train", "station", "also", "need", "small", "plastic", "snake", "big", "toy", "frog", "kids", "division", "white", "light", "many", "beautiful", "colors", "you", "your", "say", "he", "have", "be", "just", "know", "because", "was", "man", "infinite", "resource", "sagacity", "shouldnt", "pricked", "him", "horn", "all", "this", "won", "by", "our", "labour", "neither", "yea", "nor", "nay", "but", "if", "hadnt", "done", "them", "emperor", "no", "admittance", "except", "on", "party", "business", "smiled", "calmly", "mother", "knows", "that", "best", "smile", "id", "soon", "swim", "way", "others", "do", "searched", "through", "box", "name", "more", "hilarious", "?", "words", "behind", "ears", "nonsense", "tom", "fell", "cloven", "head", "vowed", "hed", "change", "pigtails", "place", "shall", "good", "bye", "part", "fish", "mouth", "chew", "leaves", "quickly", "said", "rabbit", "pay", "half", "crown", "week", "extra", "daisy", "creams", "pink", "edges"]

data_gen/tts/base_binarizer.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+from utils.multiprocess_utils import chunked_multiprocess_run
+import random
+import traceback
+import json
+from resemblyzer import VoiceEncoder
+from tqdm import tqdm
+from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
+from utils.hparams import set_hparams, hparams
+import numpy as np
+from utils.indexed_datasets import IndexedDatasetBuilder
+from vocoders.base_vocoder import VOCODERS
+import pandas as pd
+class BinarizationError(Exception):
+    pass
+class BaseBinarizer:
+    def __init__(self, processed_data_dir=None):
+        if processed_data_dir is None:
+            processed_data_dir = hparams['processed_data_dir']
+        self.processed_data_dirs = processed_data_dir.split(",")
+        self.binarization_args = hparams['binarization_args']
+        self.pre_align_args = hparams['pre_align_args']
+        self.forced_align = self.pre_align_args['forced_align']
+        tg_dir = None
+        if self.forced_align == 'mfa':
+            tg_dir = 'mfa_outputs'
+        if self.forced_align == 'kaldi':
+            tg_dir = 'kaldi_outputs'
+        self.item2txt = {}
+        self.item2ph = {}
+        self.item2wavfn = {}
+        self.item2tgfn = {}
+        self.item2spk = {}
+        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
+            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
+            for r_idx, r in self.meta_df.iterrows():
+                item_name = raw_item_name = r['item_name']
+                if len(self.processed_data_dirs) > 1:
+                    item_name = f'ds{ds_id}_{item_name}'
+                self.item2txt[item_name] = r['txt']
+                self.item2ph[item_name] = r['ph']
+                self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
+                self.item2spk[item_name] = r.get('spk', 'SPK1')
+                if len(self.processed_data_dirs) > 1:
+                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
+                if tg_dir is not None:
+                    self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
+        self.item_names = sorted(list(self.item2txt.keys()))
+        if self.binarization_args['shuffle']:
+            random.seed(1234)
+            random.shuffle(self.item_names)
+    @property
+    def train_item_names(self):
+        return self.item_names[hparams['test_num']+hparams['valid_num']:]
+    @property
+    def valid_item_names(self):
+        return self.item_names[0: hparams['test_num']+hparams['valid_num']]  #
+    @property
+    def test_item_names(self):
+        return self.item_names[0: hparams['test_num']]  # Audios for MOS testing are in 'test_ids'
+    def build_spk_map(self):
+        spk_map = set()
+        for item_name in self.item_names:
+            spk_name = self.item2spk[item_name]
+            spk_map.add(spk_name)
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        return spk_map
+    def item_name2spk_id(self, item_name):
+        return self.spk_map[self.item2spk[item_name]]
+    def _phone_encoder(self):
+        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
+        ph_set = []
+        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            for processed_data_dir in self.processed_data_dirs:
+                ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'))
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+        print("| phone set: ", ph_set)
+        return build_phone_encoder(hparams['binary_data_dir'])
+    def meta_data(self, prefix):
+        if prefix == 'valid':
+            item_names = self.valid_item_names
+        elif prefix == 'test':
+            item_names = self.test_item_names
+        else:
+            item_names = self.train_item_names
+        for item_name in item_names:
+            ph = self.item2ph[item_name]
+            txt = self.item2txt[item_name]
+            tg_fn = self.item2tgfn.get(item_name)
+            wav_fn = self.item2wavfn[item_name]
+            spk_id = self.item_name2spk_id(item_name)
+            yield item_name, ph, txt, tg_fn, wav_fn, spk_id
+    def process(self):
+        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+        self.spk_map = self.build_spk_map()
+        print("| spk_map: ", self.spk_map)
+        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
+        json.dump(self.spk_map, open(spk_map_fn, 'w'))
+        self.phone_encoder = self._phone_encoder()
+        self.process_data('valid')
+        self.process_data('test')
+        self.process_data('train')
+    def process_data(self, prefix):
+        data_dir = hparams['binary_data_dir']
+        args = []
+        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+        lengths = []
+        f0s = []
+        total_sec = 0
+        if self.binarization_args['with_spk_embed']:
+            voice_encoder = VoiceEncoder().cuda()
+        meta_data = list(self.meta_data(prefix))
+        for m in meta_data:
+            args.append(list(m) + [self.phone_encoder, self.binarization_args])
+        num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
+        for f_id, (_, item) in enumerate(
+                zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
+            if item is None:
+                continue
+            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
+                if self.binarization_args['with_spk_embed'] else None
+            if not self.binarization_args['with_wav'] and 'wav' in item:
+                print("del wav")
+                del item['wav']
+            builder.add_item(item)
+            lengths.append(item['len'])
+            total_sec += item['sec']
+            if item.get('f0') is not None:
+                f0s.append(item['f0'])
+        builder.finalize()
+        np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
+        if len(f0s) > 0:
+            f0s = np.concatenate(f0s, 0)
+            f0s = f0s[f0s != 0]
+            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
+        print(f"| {prefix} total duration: {total_sec:.3f}s")
+    @classmethod
+    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
+        if hparams['vocoder'] in VOCODERS:
+            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
+        else:
+            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
+        res = {
+            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
+            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
+        }
+        try:
+            if binarization_args['with_f0']:
+                cls.get_pitch(wav, mel, res)
+                if binarization_args['with_f0cwt']:
+                    cls.get_f0cwt(res['f0'], res)
+            if binarization_args['with_txt']:
+                try:
+                    phone_encoded = res['phone'] = encoder.encode(ph)
+                except:
+                    traceback.print_exc()
+                    raise BinarizationError(f"Empty phoneme")
+                if binarization_args['with_align']:
+                    cls.get_align(tg_fn, ph, mel, phone_encoded, res)
+        except BinarizationError as e:
+            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        return res
+    @staticmethod
+    def get_align(tg_fn, ph, mel, phone_encoded, res):
+        if tg_fn is not None and os.path.exists(tg_fn):
+            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+        else:
+            raise BinarizationError(f"Align not found")
+        if mel2ph.max() - 1 >= len(phone_encoded):
+            raise BinarizationError(
+                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
+        res['mel2ph'] = mel2ph
+        res['dur'] = dur
+    @staticmethod
+    def get_pitch(wav, mel, res):
+        f0, pitch_coarse = get_pitch(wav, mel, hparams)
+        if sum(f0) == 0:
+            raise BinarizationError("Empty f0")
+        res['f0'] = f0
+        res['pitch'] = pitch_coarse
+    @staticmethod
+    def get_f0cwt(f0, res):
+        from utils.cwt import get_cont_lf0, get_lf0_cwt
+        uv, cont_lf0_lpf = get_cont_lf0(f0)
+        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+        if np.any(np.isnan(Wavelet_lf0)):
+            raise BinarizationError("NaN CWT")
+        res['cwt_spec'] = Wavelet_lf0
+        res['cwt_scales'] = scales
+        res['f0_mean'] = logf0s_mean_org
+        res['f0_std'] = logf0s_std_org
+if __name__ == "__main__":
+    set_hparams()
+    BaseBinarizer().process()

data_gen/tts/base_binarizer_emotion.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+import torch
+from collections import Counter
+from utils.text_encoder import TokenTextEncoder
+from data_gen.tts.emotion import inference as EmotionEncoder
+from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
+from data_gen.tts.emotion.inference import preprocess_wav
+from utils.multiprocess_utils import chunked_multiprocess_run
+import random
+import traceback
+import json
+from resemblyzer import VoiceEncoder
+from tqdm import tqdm
+from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder, is_sil_phoneme
+from utils.hparams import hparams, set_hparams
+import numpy as np
+from utils.indexed_datasets import IndexedDatasetBuilder
+from vocoders.base_vocoder import get_vocoder_cls
+import pandas as pd
+class BinarizationError(Exception):
+    pass
+class EmotionBinarizer:
+    def __init__(self, processed_data_dir=None):
+        if processed_data_dir is None:
+            processed_data_dir = hparams['processed_data_dir']
+        self.processed_data_dirs = processed_data_dir.split(",")
+        self.binarization_args = hparams['binarization_args']
+        self.pre_align_args = hparams['pre_align_args']
+        self.item2txt = {}
+        self.item2ph = {}
+        self.item2wavfn = {}
+        self.item2tgfn = {}
+        self.item2spk = {}
+        self.item2emo = {}
+    def load_meta_data(self):
+        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
+            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
+            for r_idx, r in tqdm(self.meta_df.iterrows(), desc='Loading meta data.'):
+                item_name = raw_item_name = r['item_name']
+                if len(self.processed_data_dirs) > 1:
+                    item_name = f'ds{ds_id}_{item_name}'
+                self.item2txt[item_name] = r['txt']
+                self.item2ph[item_name] = r['ph']
+                self.item2wavfn[item_name] = r['wav_fn']
+                self.item2spk[item_name] = r.get('spk', 'SPK1') \
+                    if self.binarization_args['with_spk_id'] else 'SPK1'
+                if len(self.processed_data_dirs) > 1:
+                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
+                self.item2tgfn[item_name] = f"{processed_data_dir}/mfa_outputs/{raw_item_name}.TextGrid"
+                self.item2emo[item_name] = r.get('others', '"Neutral"')
+        self.item_names = sorted(list(self.item2txt.keys()))
+        if self.binarization_args['shuffle']:
+            random.seed(1234)
+            random.shuffle(self.item_names)
+    @property
+    def train_item_names(self):
+        return self.item_names[hparams['test_num']:]
+    @property
+    def valid_item_names(self):
+        return self.item_names[:hparams['test_num']]
+    @property
+    def test_item_names(self):
+        return self.valid_item_names
+    def build_spk_map(self):
+        spk_map = set()
+        for item_name in self.item_names:
+            spk_name = self.item2spk[item_name]
+            spk_map.add(spk_name)
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
+        print("| #Spk: ", len(spk_map))
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        return spk_map
+    def build_emo_map(self):
+        emo_map = set()
+        for item_name in self.item_names:
+            emo_name = self.item2emo[item_name]
+            emo_map.add(emo_name)
+        emo_map = {x: i for i, x in enumerate(sorted(list(emo_map)))}
+        print("| #Emo: ", len(emo_map))
+        return emo_map
+    def item_name2spk_id(self, item_name):
+        return self.spk_map[self.item2spk[item_name]]
+    def item_name2emo_id(self, item_name):
+        return self.emo_map[self.item2emo[item_name]]
+    def _phone_encoder(self):
+        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
+        ph_set = []
+        if self.binarization_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            for ph_sent in self.item2ph.values():
+                ph_set += ph_sent.split(' ')
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'))
+            print("| Build phone set: ", ph_set)
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+            print("| Load phone set: ", ph_set)
+        return build_phone_encoder(hparams['binary_data_dir'])
+    def _word_encoder(self):
+        fn = f"{hparams['binary_data_dir']}/word_set.json"
+        word_set = []
+        if self.binarization_args['reset_word_dict']:
+            for word_sent in self.item2txt.values():
+                word_set += [x for x in word_sent.split(' ') if x != '']
+            word_set = Counter(word_set)
+            total_words = sum(word_set.values())
+            word_set = word_set.most_common(hparams['word_size'])
+            num_unk_words = total_words - sum([x[1] for x in word_set])
+            word_set = [x[0] for x in word_set]
+            json.dump(word_set, open(fn, 'w'))
+            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+        else:
+            word_set = json.load(open(fn, 'r'))
+            print("| Load word set. Size: ", len(word_set), word_set[:10])
+        return TokenTextEncoder(None, vocab_list=word_set, replace_oov='<UNK>')
+    def meta_data(self, prefix):
+        if prefix == 'valid':
+            item_names = self.valid_item_names
+        elif prefix == 'test':
+            item_names = self.test_item_names
+        else:
+            item_names = self.train_item_names
+        for item_name in item_names:
+            ph = self.item2ph[item_name]
+            txt = self.item2txt[item_name]
+            tg_fn = self.item2tgfn.get(item_name)
+            wav_fn = self.item2wavfn[item_name]
+            spk_id = self.item_name2spk_id(item_name)
+            emotion = self.item_name2emo_id(item_name)
+            yield item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion
+    def process(self):
+        self.load_meta_data()
+        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+        self.spk_map = self.build_spk_map()
+        print("| spk_map: ", self.spk_map)
+        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
+        json.dump(self.spk_map, open(spk_map_fn, 'w'))
+        self.emo_map = self.build_emo_map()
+        print("| emo_map: ", self.emo_map)
+        emo_map_fn = f"{hparams['binary_data_dir']}/emo_map.json"
+        json.dump(self.emo_map, open(emo_map_fn, 'w'))
+        self.phone_encoder = self._phone_encoder()
+        self.word_encoder = None
+        EmotionEncoder.load_model(hparams['emotion_encoder_path'])
+        if self.binarization_args['with_word']:
+            self.word_encoder = self._word_encoder()
+        self.process_data('valid')
+        self.process_data('test')
+        self.process_data('train')
+    def process_data(self, prefix):
+        data_dir = hparams['binary_data_dir']
+        args = []
+        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+        ph_lengths = []
+        mel_lengths = []
+        f0s = []
+        total_sec = 0
+        if self.binarization_args['with_spk_embed']:
+            voice_encoder = VoiceEncoder().cuda()
+        meta_data = list(self.meta_data(prefix))
+        for m in meta_data:
+            args.append(list(m) + [(self.phone_encoder, self.word_encoder), self.binarization_args])
+        num_workers = self.num_workers
+        for f_id, (_, item) in enumerate(
+                zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
+            if item is None:
+                continue
+            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
+                if self.binarization_args['with_spk_embed'] else None
+            processed_wav = preprocess_wav(item['wav_fn'])
+            item['emo_embed'] = Embed_utterance(processed_wav)
+            if not self.binarization_args['with_wav'] and 'wav' in item:
+                del item['wav']
+            builder.add_item(item)
+            mel_lengths.append(item['len'])
+            if 'ph_len' in item:
+                ph_lengths.append(item['ph_len'])
+            total_sec += item['sec']
+            if item.get('f0') is not None:
+                f0s.append(item['f0'])
+        builder.finalize()
+        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
+        if len(ph_lengths) > 0:
+            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
+        if len(f0s) > 0:
+            f0s = np.concatenate(f0s, 0)
+            f0s = f0s[f0s != 0]
+            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
+        print(f"| {prefix} total duration: {total_sec:.3f}s")
+    @classmethod
+    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion, encoder, binarization_args):
+        res = {'item_name': item_name, 'txt': txt, 'ph': ph, 'wav_fn': wav_fn, 'spk_id': spk_id, 'emotion': emotion}
+        if binarization_args['with_linear']:
+            wav, mel, linear_stft = get_vocoder_cls(hparams).wav2spec(wav_fn) # , return_linear=True
+            res['linear'] = linear_stft
+        else:
+            wav, mel = get_vocoder_cls(hparams).wav2spec(wav_fn)
+        wav = wav.astype(np.float16)
+        res.update({'mel': mel, 'wav': wav,
+                    'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
+        try:
+            if binarization_args['with_f0']:
+                cls.get_pitch(res)
+                if binarization_args['with_f0cwt']:
+                    cls.get_f0cwt(res)
+            if binarization_args['with_txt']:
+                ph_encoder, word_encoder = encoder
+                try:
+                    res['phone'] = ph_encoder.encode(ph)
+                    res['ph_len'] = len(res['phone'])
+                except:
+                    traceback.print_exc()
+                    raise BinarizationError(f"Empty phoneme")
+                if binarization_args['with_align']:
+                    cls.get_align(tg_fn, res)
+                    if binarization_args['trim_eos_bos']:
+                        bos_dur = res['dur'][0]
+                        eos_dur = res['dur'][-1]
+                        res['mel'] = mel[bos_dur:-eos_dur]
+                        res['f0'] = res['f0'][bos_dur:-eos_dur]
+                        res['pitch'] = res['pitch'][bos_dur:-eos_dur]
+                        res['mel2ph'] = res['mel2ph'][bos_dur:-eos_dur]
+                        res['wav'] = wav[bos_dur * hparams['hop_size']:-eos_dur * hparams['hop_size']]
+                        res['dur'] = res['dur'][1:-1]
+                        res['len'] = res['mel'].shape[0]
+                if binarization_args['with_word']:
+                    cls.get_word(res, word_encoder)
+        except BinarizationError as e:
+            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        except Exception as e:
+            traceback.print_exc()
+            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        return res
+    @staticmethod
+    def get_align(tg_fn, res):
+        ph = res['ph']
+        mel = res['mel']
+        phone_encoded = res['phone']
+        if tg_fn is not None and os.path.exists(tg_fn):
+            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+        else:
+            raise BinarizationError(f"Align not found")
+        if mel2ph.max() - 1 >= len(phone_encoded):
+            raise BinarizationError(
+                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
+        res['mel2ph'] = mel2ph
+        res['dur'] = dur
+    @staticmethod
+    def get_pitch(res):
+        wav, mel = res['wav'], res['mel']
+        f0, pitch_coarse = get_pitch(wav, mel, hparams)
+        if sum(f0) == 0:
+            raise BinarizationError("Empty f0")
+        res['f0'] = f0
+        res['pitch'] = pitch_coarse
+    @staticmethod
+    def get_f0cwt(res):
+        from utils.cwt import get_cont_lf0, get_lf0_cwt
+        f0 = res['f0']
+        uv, cont_lf0_lpf = get_cont_lf0(f0)
+        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+        if np.any(np.isnan(Wavelet_lf0)):
+            raise BinarizationError("NaN CWT")
+        res['cwt_spec'] = Wavelet_lf0
+        res['cwt_scales'] = scales
+        res['f0_mean'] = logf0s_mean_org
+        res['f0_std'] = logf0s_std_org
+    @staticmethod
+    def get_word(res, word_encoder):
+        """Derive word-level features from the phone sequence and alignment in ``res``.
+
+        Reads ``res['ph']``, ``res['mel2ph']`` and ``res['txt']`` and adds ``'ph_words'``,
+        ``'ph2word'``, ``'mel2word'``, ``'dur_word'``, ``'words'`` and ``'word_tokens'``.
+
+        :param res: item dict, updated in place
+        :param word_encoder: encoder whose ``encode`` is applied to the space-joined words
+        :raises AssertionError: if the number of text words differs from the number of
+            phone-level words (raised after ``res`` has already been updated)
+        """
+        ph_split = res['ph'].split(" ")
+        # ph side mapping to word
+        ph_words = []  # ['<BOS>', 'N_AW1_', ',', 'AE1_Z_|', 'AO1_L_|', 'B_UH1_K_S_|', 'N_AA1_T_|', ....]
+        ph2word = np.zeros([len(ph_split)], dtype=int)
+        last_ph_idx_for_word = []  # [2, 11, ...]
+        for i, ph in enumerate(ph_split):
+            if ph == '|':
+                # '|' closes the current word and belongs to it.
+                last_ph_idx_for_word.append(i)
+            elif not ph[0].isalnum():
+                # A token starting with a non-alphanumeric character is a word of its own;
+                # unless it is '<BOS>', it also closes the word that ends right before it.
+                if ph not in ['<BOS>']:
+                    last_ph_idx_for_word.append(i - 1)
+                last_ph_idx_for_word.append(i)
+        # Each word starts right after the end of the previous one.
+        start_ph_idx_for_word = [0] + [i + 1 for i in last_ph_idx_for_word[:-1]]
+        for i, (s_w, e_w) in enumerate(zip(start_ph_idx_for_word, last_ph_idx_for_word)):
+            ph_words.append(ph_split[s_w:e_w + 1])
+            ph2word[s_w:e_w + 1] = i
+        ph2word = ph2word.tolist()
+        ph_words = ["_".join(w) for w in ph_words]
+        # mel side mapping to word
+        mel2word = []
+        dur_word = [0 for _ in range(len(ph_words))]
+        for i, m2p in enumerate(res['mel2ph']):
+            # mel2ph entries are used 1-based here, hence the ``- 1``.
+            word_idx = ph2word[m2p - 1]
+            mel2word.append(ph2word[m2p - 1])
+            dur_word[word_idx] += 1
+        ph2word = [x + 1 for x in ph2word]  # 0 is reserved for padding
+        mel2word = [x + 1 for x in mel2word]  # 0 is reserved for padding
+        res['ph_words'] = ph_words  # [T_word]
+        res['ph2word'] = ph2word  # [T_ph]
+        res['mel2word'] = mel2word  # [T_mel]
+        res['dur_word'] = dur_word  # [T_word]
+        words = [x for x in res['txt'].split(" ") if x != '']
+        # Strip leading and trailing silence tokens before adding the BOS/EOS markers.
+        while len(words) > 0 and is_sil_phoneme(words[0]):
+            words = words[1:]
+        while len(words) > 0 and is_sil_phoneme(words[-1]):
+            words = words[:-1]
+        words = ['<BOS>'] + words + ['<EOS>']
+        word_tokens = word_encoder.encode(" ".join(words))
+        res['words'] = words
+        res['word_tokens'] = word_tokens
+        assert len(words) == len(ph_words), [words, ph_words]
+    @property
+    def num_workers(self):
+        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
+# Script entry point: load the hyper-parameters, then run the emotion binarizer.
+if __name__ == "__main__":
+    set_hparams()
+    EmotionBinarizer().process()

data_gen/tts/base_preprocess.py ADDED Viewed

	@@ -0,0 +1,250 @@

+import json
+import os
+import random
+import re
+import traceback
+from collections import Counter
+from functools import partial
+import librosa
+from tqdm import tqdm
+from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
+from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
+from utils.hparams import hparams
+from utils.multiprocess_utils import multiprocess_run_tqdm
+from utils.os_utils import link_file, move_file, remove_file
+from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
+class BasePreprocessor:
+    def __init__(self):
+        self.preprocess_args = hparams['preprocess_args']
+        txt_processor = self.preprocess_args['txt_processor']
+        self.txt_processor = get_txt_processor_cls(txt_processor)
+        self.raw_data_dir = hparams['raw_data_dir']
+        self.processed_dir = hparams['processed_data_dir']
+        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
+    def meta_data(self):
+        """
+        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
+        """
+        raise NotImplementedError
+    def process(self):
+        """
+        Run the whole preprocessing pipeline over the items yielded by ``meta_data()``.
+
+        Steps: (1) recreate the temporary and the final wav directories, (2) run
+        ``preprocess_first_pass`` on every item, (3) build the phone/word encoders and the
+        speaker map and add the encoded tokens via ``preprocess_second_pass``, (4) when
+        ``preprocess_args['use_mfa']`` is set, build the MFA inputs and ``mfa_dict.txt``,
+        (5) dump all items to ``{processed_dir}/{meta_csv_filename}.json`` and remove the
+        temporary directory.
+        """
+        processed_dir = self.processed_dir
+        # Start from clean temporary and output wav directories.
+        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
+        remove_file(wav_processed_tmp_dir)
+        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
+        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
+        remove_file(wav_processed_dir)
+        os.makedirs(wav_processed_dir, exist_ok=True)
+        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
+        item_names = [d['item_name'] for d in meta_data]
+        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
+        # preprocess data
+        phone_list = []
+        word_list = []
+        spk_names = set()
+        process_item = partial(self.preprocess_first_pass,
+                               txt_processor=self.txt_processor,
+                               wav_processed_dir=wav_processed_dir,
+                               wav_processed_tmp=wav_processed_tmp_dir,
+                               preprocess_args=self.preprocess_args)
+        items = []
+        args = [{
+            'item_name': item_raw['item_name'],
+            'txt_raw': item_raw['txt'],
+            'wav_fn': item_raw['wav_fn'],
+            'txt_loader': item_raw.get('txt_loader'),
+            'others': item_raw.get('others', None)
+        } for item_raw in meta_data]
+        # NOTE(review): the zip below pairs results with meta_data positionally, so it
+        # assumes multiprocess_run_tqdm yields (index, result) in input order -- confirm.
+        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
+            if item is not None:
+                # Merge the first-pass result into the raw item; failed items (None) are dropped.
+                item_.update(item)
+                item = item_
+                if 'txt_loader' in item:
+                    del item['txt_loader']
+                item['id'] = item_id
+                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
+                item['others'] = item.get('others', None)
+                phone_list += item['ph'].split(" ")
+                word_list += item['word'].split(" ")
+                spk_names.add(item['spk_name'])
+                items.append(item)
+        # add encoded tokens
+        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
+        spk_map = self.build_spk_map(spk_names)
+        args = [{
+            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
+            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
+        } for item in items]
+        # Here ``idx`` is used as the position inside ``items``.
+        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
+            items[idx].update(item_new_kv)
+        # build mfa data
+        if self.preprocess_args['use_mfa']:
+            mfa_dict = set()
+            mfa_input_dir = f'{processed_dir}/mfa_inputs'
+            remove_file(mfa_input_dir)
+            # group MFA inputs for better parallelism
+            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
+            if self.preprocess_args['mfa_group_shuffle']:
+                random.seed(hparams['seed'])
+                random.shuffle(mfa_groups)
+            args = [{
+                'item': item, 'mfa_input_dir': mfa_input_dir,
+                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
+                'preprocess_args': self.preprocess_args
+            } for item, mfa_group in zip(items, mfa_groups)]
+            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
+                    self.build_mfa_inputs, args, desc='Build MFA data'):
+                items[i]['wav_align_fn'] = new_wav_align_fn
+                # One dictionary line per word: "<phones joined by '_'> <phones joined by ' '>".
+                for w in ph_gb_word_nosil.split(" "):
+                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
+            mfa_dict = sorted(mfa_dict)
+            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
+                f.writelines([f'{l}\n' for l in mfa_dict])
+        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
+            # The regex removes the newline and indentation in front of a digit, '+' or ']'
+            # in the pretty-printed JSON, which compacts its numeric lists.
+            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
+        remove_file(wav_processed_tmp_dir)
+    @classmethod
+    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
+                              wav_fn, wav_processed_dir, wav_processed_tmp,
+                              preprocess_args, txt_loader=None, others=None):
+        try:
+            if txt_loader is not None:
+                txt_raw = txt_loader(txt_raw)
+            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
+            wav_fn, wav_align_fn = cls.process_wav(
+                item_name, wav_fn,
+                hparams['processed_data_dir'],
+                wav_processed_tmp, preprocess_args)
+            # wav for binarization
+            ext = os.path.splitext(wav_fn)[1]
+            os.makedirs(wav_processed_dir, exist_ok=True)
+            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
+            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
+            move_link_func(wav_fn, new_wav_fn)
+            return {
+                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
+                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
+                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
+                'others': others
+            }
+        except:
+            traceback.print_exc()
+            print(f"| Error is caught. item_name: {item_name}.")
+            return None
+    @staticmethod
+    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
+        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
+        ph = [p for w in txt_struct for p in w[1]]
+        ph_gb_word = ["_".join(w[1]) for w in txt_struct]
+        words = [w[0] for w in txt_struct]
+        # word_id=0 is reserved for padding
+        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
+        return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)
+    @staticmethod
+    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
+        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
+        processors = [k() for k in processors if k is not None]
+        if len(processors) >= 1:
+            sr_file = librosa.core.get_samplerate(wav_fn)
+            output_fn_for_align = None
+            ext = os.path.splitext(wav_fn)[1]
+            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
+            link_file(wav_fn, input_fn)
+            for p in processors:
+                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
+                if len(outputs) == 3:
+                    input_fn, sr, output_fn_for_align = outputs
+                else:
+                    input_fn, sr = outputs
+            return input_fn, output_fn_for_align
+        else:
+            return wav_fn, wav_fn
+    def _phone_encoder(self, ph_set):
+        ph_set_fn = f"{self.processed_dir}/phone_set.json"
+        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
+            print("| Build phone set: ", ph_set)
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+            print("| Load phone set: ", ph_set)
+        return build_token_encoder(ph_set_fn)
+    def _word_encoder(self, word_set):
+        word_set_fn = f"{self.processed_dir}/word_set.json"
+        if self.preprocess_args['reset_word_dict']:
+            word_set = Counter(word_set)
+            total_words = sum(word_set.values())
+            word_set = word_set.most_common(hparams['word_dict_size'])
+            num_unk_words = total_words - sum([x[1] for x in word_set])
+            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
+            word_set = sorted(set(word_set))
+            json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
+            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+        else:
+            word_set = json.load(open(word_set_fn, 'r'))
+            print("| Load word set. Size: ", len(word_set), word_set[:10])
+        return build_token_encoder(word_set_fn)
+    @classmethod
+    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
+        word_token = word_encoder.encode(word)
+        ph_token = ph_encoder.encode(ph)
+        spk_id = spk_map[spk_name]
+        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
+    def build_spk_map(self, spk_names):
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
+        json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
+        return spk_map
+    @classmethod
+    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
+        item_name = item['item_name']
+        wav_align_fn = item['wav_align_fn']
+        ph_gb_word = item['ph_gb_word']
+        ext = os.path.splitext(wav_align_fn)[1]
+        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
+        os.makedirs(mfa_input_group_dir, exist_ok=True)
+        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
+        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
+        move_link_func(wav_align_fn, new_wav_align_fn)
+        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
+                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
+        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
+            f_txt.write(ph_gb_word_nosil)
+        return ph_gb_word_nosil, new_wav_align_fn
+    def load_spk_map(self, base_dir):
+        spk_map_fn = f"{base_dir}/spk_map.json"
+        spk_map = json.load(open(spk_map_fn, 'r'))
+        return spk_map
+    def load_dict(self, base_dir):
+        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
+        word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
+        return ph_encoder, word_encoder
+    @property
+    def meta_csv_filename(self):
+        return 'metadata'
+    @property
+    def wav_processed_dirname(self):
+        return 'wav_processed'

data_gen/tts/bin/binarize.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+import importlib
+from utils.hparams import set_hparams, hparams
+def binarize():
+    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
+    pkg = ".".join(binarizer_cls.split(".")[:-1])
+    cls_name = binarizer_cls.split(".")[-1]
+    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
+    print("| Binarizer: ", binarizer_cls)
+    binarizer_cls().process()
+# Script entry point: load the hyper-parameters, then run the configured binarizer.
+if __name__ == '__main__':
+    set_hparams()
+    binarize()

data_gen/tts/bin/pre_align.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+import importlib
+from utils.hparams import set_hparams, hparams
+def pre_align():
+    assert hparams['pre_align_cls'] != ''
+    pkg = ".".join(hparams["pre_align_cls"].split(".")[:-1])
+    cls_name = hparams["pre_align_cls"].split(".")[-1]
+    process_cls = getattr(importlib.import_module(pkg), cls_name)
+    process_cls().process()
+# Script entry point: load the hyper-parameters, then run the configured pre-alignment class.
+if __name__ == '__main__':
+    set_hparams()
+    pre_align()

data_gen/tts/bin/train_mfa_align.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import subprocess
+from utils.hparams import hparams, set_hparams
+import os
+def train_mfa_align():
+    CORPUS = hparams['processed_data_dir'].split("/")[-1]
+    print(f"| Run MFA for {CORPUS}.")
+    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
+    subprocess.check_call(f'CORPUS={CORPUS} NUM_JOB={NUM_JOB} bash usr/run_mfa_train_align.sh', shell=True)
+# Script entry point: load the hyper-parameters without printing them, then run MFA.
+if __name__ == '__main__':
+    set_hparams(print_hparams=False)
+    train_mfa_align()

data_gen/tts/data_gen_utils.py ADDED Viewed

	@@ -0,0 +1,356 @@

+import warnings
+warnings.filterwarnings("ignore")
+# import parselmouth
+import os
+import torch
+from skimage.transform import resize
+from utils.text_encoder import TokenTextEncoder
+from utils.pitch_utils import f0_to_coarse
+import struct
+import webrtcvad
+from scipy.ndimage.morphology import binary_dilation
+import librosa
+import numpy as np
+from utils import audio
+import pyloudnorm as pyln
+import re
+import json
+from collections import OrderedDict
+# Punctuation characters.
+PUNCS = '!,.?;:'
+# Largest signed 16-bit integer; scales float waveforms to 16-bit PCM in trim_long_silences.
+int16_max = (2 ** 15) - 1
+def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in params.py.
+    :param wav: the raw waveform as a numpy array of floats
+    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    ## Voice Activation Detection
+    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+    # This sets the granularity of the VAD. Should not need to be changed.
+    sampling_rate = 16000
+    wav_raw, sr = librosa.core.load(path, sr=sr)
+    if norm:
+        meter = pyln.Meter(sr)  # create BS.1770 meter
+        loudness = meter.integrated_loudness(wav_raw)
+        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
+        if np.abs(wav_raw).max() > 1.0:
+            wav_raw = wav_raw / np.abs(wav_raw).max()
+    wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
+    vad_window_length = 30  # In milliseconds
+    # Number of frames to average together when performing the moving average smoothing.
+    # The larger this value, the larger the VAD variations must be to not get smoothed out.
+    vad_moving_average_width = 8
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool)
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
+    if return_raw_wav:
+        return wav_raw, audio_mask, sr
+    return wav_raw[audio_mask], audio_mask, sr
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-6,
                      sample_rate=22050,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False, vocoder='pwg'):
    """
    Compute the log10 mel spectrogram (and optionally a linear spectrogram) of one utterance.

    :param wav_path: path of an audio file (str), or an already loaded waveform array
    :param fft_size: FFT size passed to librosa.stft and to the mel filterbank
    :param hop_size: hop length of the STFT, in samples
    :param win_length: analysis window length of the STFT, in samples
    :param window: window name passed to librosa.stft
    :param num_mels: number of mel bins
    :param fmin: lowest mel frequency; -1 is mapped to 0
    :param fmax: highest mel frequency; -1 is mapped to sample_rate / 2
    :param eps: floor applied to the mel energies before taking log10
    :param sample_rate: rate used when loading the file and when building the mel basis
    :param loud_norm: if True, normalize the loudness measured by the pyln meter to -22.0
        and rescale the waveform if its peak then exceeds 1
    :param min_level_db: forwarded to audio.normalize when return_linear is True
    :param return_linear: if True, additionally return the normalized dB linear spectrogram
    :param trim_long_sil: if True and wav_path is a str, load the file through
        trim_long_silences instead of librosa
    :param vocoder: only 'pwg' is accepted; any other value trips an assertion
    :return: (wav, mel), or (wav, mel, spc) when return_linear is True. mel is
        (n_mel_bins, T); wav is zero padded and cut to T * hop_size samples.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path
    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        # Loudness normalization may push the peak above full scale; rescale to avoid clipping.
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()
    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)
    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments are accepted by both old and new librosa releases, whereas the
    # positional form stopped working once these parameters became keyword-only.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels,
                                    fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc
    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    else:
        assert False, f'"{vocoder}" is not in ["pwg"].'
    # Pad and cut the waveform so that it spans exactly T frames of hop_size samples.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]
    if not return_linear:
        return wav, mel
    else:
        spc = audio.amp_to_db(spc)
        spc = audio.normalize(spc, {'min_level_db': min_level_db})
        return wav, mel, spc
def get_pitch(wav_data, mel, hparams):
    """
    Extract an F0 contour with parselmouth and align its length to the mel spectrogram.

    :param wav_data: [T]
    :param mel: [T, 80]
    :param hparams: dict providing 'hop_size' (only 128 and 256 are supported) and
        'audio_sample_rate'
    :return: (f0, pitch_coarse), where f0 has len(mel) entries and pitch_coarse is
        f0_to_coarse(f0)
    """
    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
    f0_min = 80
    f0_max = 750
    if hparams['hop_size'] == 128:
        pad_size = 4
    elif hparams['hop_size'] == 256:
        pad_size = 2
    else:
        assert False
    import parselmouth
    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
    # mel and f0 are extracted by 2 different libraries, so we force them to the same length.
    lpad = pad_size * 2
    # Some library versions return more pitch frames than expected, which would make the
    # right padding negative and crash np.pad. Clamp it and let the truncation below
    # remove the surplus instead.
    rpad = max(0, len(mel) - len(f0) - lpad)
    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
    # After the clamped padding f0 is never shorter than mel, so delta_l <= 0.
    delta_l = len(mel) - len(f0)
    assert np.abs(delta_l) <= 8
    f0 = f0[:len(mel)]
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse
def remove_empty_lines(text):
    """
    Strip surrounding whitespace from every line and drop all lines that end up empty.

    :param text: non-empty list of strings
    :return: new list with the stripped, non-empty lines in their original order
    """
    assert (len(text) > 0)
    assert (isinstance(text, list))
    # list.remove("") would only delete the first empty entry; filter them all out.
    return [line for line in (raw.strip() for raw in text) if line]
class TextGrid(object):
    """
    Line-oriented parser for TextGrid text that collects the header fields and every
    IntervalTier into plain (ordered) dictionaries.
    """

    def __init__(self, text):
        """
        :param text: list of lines of the TextGrid file; empty lines are removed first
        """
        self.text = remove_empty_lines(text)
        self.line_count = 0
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """
        Match the current line against a regex and advance the line cursor.

        Parameters
        ----------
        pattern : regex whose first group holds the value to extract
        inc : number of lines to advance after a successful match
        Returns
        -------
        group : text captured by the first group
        Raises
        ------
        ValueError : if the current line does not match the pattern
        """
        current = self.text[self.line_count]
        found = re.match(pattern, current)
        if found is None:
            raise ValueError("File format error at line %d:%s" % (self.line_count, current))
        group = found.group(1)
        self.line_count += inc
        return group

    def _get_type(self):
        # Advance by two lines: the line following the file type is not parsed.
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        # The line following the global xmax is skipped as well.
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Parse every tier; only IntervalTier is supported currently."""
        for _ in range(self.size):
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
            intervals = []
            for _ in range(int(tier_size)):
                # The pairs are evaluated left to right, i.e. in file order.
                intervals.append(OrderedDict([
                    ("idx", self._extract_pattern(r"intervals \[(.*)\]", 1)),
                    ("xmin", self._extract_pattern(r"xmin = (.*)", 1)),
                    ("xmax", self._extract_pattern(r"xmax = (.*)", 1)),
                    ("text", self._extract_pattern(r"text = \"(.*)\"", 1)),
                ]))
            # All values stay strings exactly as captured (including "size").
            self.tier_list.append(OrderedDict([
                ("idx", tier_idx),
                ("class", tier_class),
                ("name", tier_name),
                ("xmin", tier_xmin),
                ("xmax", tier_xmax),
                ("size", tier_size),
                ("items", intervals),
            ]))

    def toJson(self):
        """Serialize the parsed header and tiers as an indented JSON string."""
        payload = OrderedDict([
            ("file_type", self.file_type),
            ("xmin", self.xmin),
            ("xmax", self.xmax),
            ("size", self.size),
            ("tiers", self.tier_list),
        ])
        return json.dumps(payload, ensure_ascii=False, indent=2)
def get_mel2ph(tg_fn, ph, mel, hparams):
    """
    Align a phoneme sequence to mel frames using the last tier of a TextGrid file.

    :param tg_fn: path of the TextGrid file
    :param ph: phoneme sequence as one space separated string
    :param mel: mel spectrogram whose first dimension is the number of frames
    :param hparams: dict providing 'audio_sample_rate' and 'hop_size'
    :return: (mel2ph, dur). mel2ph has mel.shape[0] entries holding the 1-based index of
        the phoneme covering each frame (0 where no phoneme was assigned); dur holds the
        number of frames assigned to each phoneme.
    """
    ph_list = ph.split(" ")
    with open(tg_fn, "r") as f:
        tg = f.readlines()
    tg = remove_empty_lines(tg)
    tg = TextGrid(tg)
    tg = json.loads(tg.toJson())
    # split[i] is the start time (in seconds) of phoneme i; -1 means "not assigned yet".
    # The builtin float replaces the np.float alias that newer NumPy releases removed.
    split = np.ones(len(ph_list) + 1, float) * -1
    tg_idx = 0
    ph_idx = 0
    tg_align = list(tg['tiers'][-1]['items'])
    tg_align_ = []
    # Normalize silence labels to '' and merge consecutive silences into one interval.
    for x in tg_align:
        x['xmin'] = float(x['xmin'])
        x['xmax'] = float(x['xmax'])
        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
            x['text'] = ''
            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
                tg_align_[-1]['xmax'] = x['xmax']
                continue
        tg_align_.append(x)
    tg_align = tg_align_
    tg_len = len([x for x in tg_align if x['text'] != ''])
    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
    # Walk both sequences in lockstep, skipping silences present on one side only.
    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
            # Trailing silence phoneme without a TextGrid interval: push it past the end.
            split[ph_idx] = 1e8
            ph_idx += 1
            continue
        x = tg_align[tg_idx]
        if x['text'] == '' and ph_idx == len(ph_list):
            # Trailing TextGrid silence without a phoneme: ignore it.
            tg_idx += 1
            continue
        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
        ph = ph_list[ph_idx]
        if x['text'] == '' and not is_sil_phoneme(ph):
            assert False, (ph_list, tg_align)
        if x['text'] != '' and is_sil_phoneme(ph):
            # Silence phoneme with no matching interval; its start is filled in below
            # once the next aligned phoneme is known.
            ph_idx += 1
        else:
            assert (x['text'] == '' and is_sil_phoneme(ph)) \
                   or x['text'].lower() == ph.lower() \
                   or x['text'].lower() == 'sil', (x['text'], ph)
            split[ph_idx] = x['xmin']
            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
                split[ph_idx - 1] = split[ph_idx]
            ph_idx += 1
            tg_idx += 1
    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
    # int64 replaces the removed np.int alias and is the index dtype scatter_add expects.
    mel2ph = np.zeros([mel.shape[0]], np.int64)
    split[0] = 0
    split[-1] = 1e8
    for i in range(len(split) - 1):
        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
    # Convert the boundaries from seconds to (rounded) frame indices.
    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
    for ph_idx in range(len(ph_list)):
        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
    mel2ph_torch = torch.from_numpy(mel2ph)
    T_t = len(ph_list)
    # Count the frames per phoneme index; slot 0 collects unassigned frames and is dropped.
    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
    dur = dur[1:].numpy()
    return mel2ph, dur
def build_phone_encoder(data_dir):
    """
    Build a TokenTextEncoder from the phoneme vocabulary stored in <data_dir>/phone_set.json.

    :param data_dir: directory containing phone_set.json
    :return: TokenTextEncoder created with the loaded vocabulary and replace_oov=','
    """
    phone_list_file = os.path.join(data_dir, 'phone_set.json')
    # Use a context manager so the file handle is closed deterministically.
    with open(phone_list_file) as f:
        phone_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
def build_word_encoder(data_dir):
    """
    Build a TokenTextEncoder from the word vocabulary stored in <data_dir>/word_set.json.

    :param data_dir: directory containing word_set.json
    :return: TokenTextEncoder created with the loaded vocabulary and replace_oov=','
    """
    word_list_file = os.path.join(data_dir, 'word_set.json')
    # Use a context manager so the file handle is closed deterministically.
    with open(word_list_file) as f:
        word_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=word_list, replace_oov=',')
def is_sil_phoneme(p):
    """Return True when the first character of the (non-empty) token is not a letter."""
    first_char = p[0]
    if first_char.isalpha():
        return False
    return True
def build_token_encoder(token_list_file):
    """
    Build a TokenTextEncoder from a JSON file holding the token vocabulary.

    :param token_list_file: path of the JSON vocabulary file
    :return: TokenTextEncoder created with the loaded vocabulary and replace_oov='<UNK>'
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(token_list_file) as f:
        token_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')

data_gen/tts/emotion/audio.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from scipy.ndimage.morphology import binary_dilation
+from data_gen.tts.emotion.params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+int16_max = (2 ** 15) - 1
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    :return: the resampled, volume-normalized waveform with long silences trimmed.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav
    # Resample the wav if needed. The rates are passed by keyword because newer librosa
    # releases no longer accept them positionally; older releases accept both forms.
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
    # Apply the preprocessing: normalize volume and shorten long silences
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav)
    return wav
def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    :param wav: the preprocessed waveform as a numpy array of floats
    :return: the mel spectrogram as a float32 array; librosa's output is transposed, so
    frames are on the first axis and mel channels on the second
    """
    # The signal and its rate are passed by keyword because newer librosa releases made
    # them keyword-only; older releases accept the keyword form as well.
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params_data.py.
    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000
    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
    # Audio shorter than one VAD window leaves nothing to analyze
    if len(wav) == 0:
        return wav
    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
    # Perform voice activation detection (two bytes per 16-bit sample)
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        # Zero-pad both ends so that the output has the same length as the input
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # The builtin bool replaces the np.bool alias that newer NumPy releases removed
    audio_mask = np.round(audio_mask).astype(bool)
    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    # Expand the per-window decision to one flag per sample
    audio_mask = np.repeat(audio_mask, samples_per_window)
    return wav[audio_mask]
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Scale a waveform so that its mean power matches a target level.

    :param wav: the waveform as a numpy array of floats
    :param target_dBFS: the desired level, compared against 10 * log10(mean(wav ** 2))
    :param increase_only: if True, the waveform is returned unchanged when it would have
    to be attenuated
    :param decrease_only: if True, the waveform is returned unchanged when it would have
    to be amplified
    :return: the rescaled waveform, or the input itself when no scaling is applied
    :raises ValueError: if both increase_only and decrease_only are set
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    # An empty waveform has no level to normalize
    if np.size(wav) == 0:
        return wav
    mean_power = np.mean(wav ** 2)
    # Digital silence (or non-finite input) would yield an infinite or NaN gain and turn
    # the whole signal into NaNs, so it is returned untouched
    if not mean_power > 0:
        return wav
    dBFS_change = target_dBFS - 10 * np.log10(mean_power)
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))

data_gen/tts/emotion/inference.py ADDED Viewed

	@@ -0,0 +1,177 @@

+from data_gen.tts.emotion.params_data import *
+from data_gen.tts.emotion.model import EmotionEncoder
+from data_gen.tts.emotion.audio import preprocess_wav   # We want to expose this function from here
+from matplotlib import cm
+from data_gen.tts.emotion import audio
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+_model = None # type: EmotionEncoder
+_device = None # type: torch.device
def load_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory and stores it, together with its device, in the module-level
    _model and _device variables. This function must be called before embed_frames_batch(),
    which raises if no model has been loaded.
    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
    model will be loaded and will run on this device. Outputs will however always be on the cpu.
    If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    #   was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        # An actual torch.device object: use it as is instead of silently ignoring it.
        _device = device
    _model = EmotionEncoder(_device, torch.device("cpu"))
    # Map the checkpoint to the CPU so that weights saved from a GPU can be read on machines
    # without CUDA; load_state_dict copies the values into the already placed parameters.
    checkpoint = torch.load(weights_fpath, map_location="cpu")
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder trained to step %d" % (checkpoint["step"]))
def is_loaded():
    """Tell whether load_model() has already stored a model in the module-level _model."""
    if _model is None:
        return False
    return True
def embed_frames_batch(frames_batch):
    """
    Runs the loaded model's inference() on a batch of mel spectrograms.
    :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
    (batch_size, n_frames, n_channels)
    :return: the output of the model's inference() for the batch, detached, moved to the cpu
    and converted to a numpy array (one row per batch entry)
    :raises Exception: if load_model() has not been called yet
    """
    if _model is None:
        raise Exception("Model was not loaded. Call load_model() before inference.")
    batch_tensor = torch.from_numpy(frames_batch).to(_device)
    model_output = _model.inference(batch_tensor)
    return model_output.detach().cpu().numpy()
def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Determines how to cut an utterance into fixed-size partial utterances, both on the
    waveform and on its mel spectrogram, so that every waveform slice lines up with its
    spectrogram slice. The mel spectrogram parameters are assumed to be those of
    params_data.py.
    The returned ranges may reach beyond the end of the waveform; pad the waveform with
    zeros up to wave_slices[-1].stop before indexing it.
    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: fraction of the last partial utterance that must be covered by
    real audio for it to be kept (as if the audio were padded); below that fraction it is
    dropped (as if the audio were trimmed). Ignored when only one partial exists, so that at
    least one slice is always returned.
    :param overlap: by how much consecutive partial utterances overlap; 0 makes them disjoint.
    :return: the waveform slices and mel spectrogram slices as two lists of slice objects
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1
    hop_samples = int((sampling_rate * mel_window_step / 1000))
    total_frames = int(np.ceil((n_samples + 1) / hop_samples))
    stride = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
    # One [start, stop) frame range per partial utterance
    n_starts = max(1, total_frames - partial_utterance_n_frames + stride + 1)
    frame_bounds = [np.array([first, first + partial_utterance_n_frames])
                    for first in range(0, n_starts, stride)]
    mel_slices = [slice(*bounds) for bounds in frame_bounds]
    wav_slices = [slice(*(bounds * hop_samples)) for bounds in frame_bounds]
    # Drop the last partial when too little of it is backed by real samples
    tail = wav_slices[-1]
    coverage = (n_samples - tail.start) / (tail.stop - tail.start)
    if len(mel_slices) > 1 and coverage < min_pad_coverage:
        del mel_slices[-1]
        del wav_slices[-1]
    return wav_slices, mel_slices
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.
    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, the utterance is cut into partial utterances and the
    utterance embedding is the L2-normalized mean of their embeddings. If False, the whole
    spectrogram is fed to the network at once.
    :param return_partials: if True, the partial embeddings and the matching wav slices are
    returned as well.
    :param kwargs: additional arguments for compute_partial_slices()
    :return: the embedding as a numpy array. If <return_partials> is True, a tuple
    (embedding, partial embeddings, wav slices) is returned instead; the last two entries are
    None when <using_partials> is False.
    """
    if using_partials:
        # Work out the partials and zero-pad the waveform up to the end of the last one
        wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
        missing = wave_slices[-1].stop - len(wav)
        if missing >= 0:
            wav = np.pad(wav, (0, missing), "constant")
        # Embed every partial in a single batch
        mel = audio.wav_to_mel_spectrogram(wav)
        partial_embeds = embed_frames_batch(np.array([mel[window] for window in mel_slices]))
        # Average the partial embeddings and L2-normalize the result
        mean_embed = np.mean(partial_embeds, axis=0)
        embed = mean_embed / np.linalg.norm(mean_embed, 2)
        if return_partials:
            return embed, partial_embeds, wave_slices
        return embed
    # Not using partials: process the entire utterance as a batch of one
    mel = audio.wav_to_mel_spectrogram(wav)
    embed = embed_frames_batch(mel[None, ...])[0]
    if return_partials:
        return embed, None, None
    return embed
def embed_speaker(wavs, **kwargs):
    """
    Placeholder for a speaker-level embedding; not implemented.

    :raises NotImplementedError: always
    """
    # NotImplemented is a comparison sentinel, not an exception: calling it raised TypeError.
    raise NotImplementedError()
def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    """
    Draw an embedding vector as a 2D heatmap with a colorbar.

    :param embed: the embedding as a numpy array
    :param ax: the matplotlib axes to draw on; defaults to the current axes
    :param title: title of the plot
    :param shape: 2D shape the embedding is reshaped to; defaults to
    (int(sqrt(len(embed))), -1)
    :param color_range: (min, max) limits of the color scale
    """
    if ax is None:
        ax = plt.gca()
    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)
    # pyplot.get_cmap() returns the default colormap and exists in old and new matplotlib,
    # unlike the cm.get_cmap() helper that recent releases removed.
    cmap = plt.get_cmap()
    mappable = ax.imshow(embed, cmap=cmap)
    # The color limits belong to the image (mappable); Colorbar.set_clim no longer exists.
    mappable.set_clim(*color_range)
    plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(title)

data_gen/tts/emotion/model.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from data_gen.tts.emotion.params_model import *
+from data_gen.tts.emotion.params_data import *
+from torch.nn.utils import clip_grad_norm_
+from scipy.optimize import brentq
+from torch import nn
+import numpy as np
+import torch
class EmotionEncoder(nn.Module):
    """
    LSTM encoder over mel spectrogram frames followed by a linear projection, with the
    scaling parameters and loss used for training kept on a separate loss device.
    """

    def __init__(self, device, loss_device):
        """
        :param device: device on which the LSTM, linear and ReLU layers are placed
        :param loss_device: device on which the similarity parameters and the loss are placed
        """
        super().__init__()
        self.loss_device = loss_device

        # Network definition
        self.lstm = nn.LSTM(
            input_size=mel_n_channels,
            hidden_size=model_hidden_size,
            num_layers=model_num_layers,
            batch_first=True,
        ).to(device)
        self.linear = nn.Linear(
            in_features=model_hidden_size,
            out_features=model_embedding_size,
        ).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Scale and offset of the cosine similarity (fixed initial values)
        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        """Scale the gradients of the similarity parameters, then clip all gradients."""
        # Gradient scale
        for scaled_param in (self.similarity_weight, self.similarity_bias):
            scaled_param.grad *= 0.01
        # Gradient clipping (L2 norm, max 3)
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Computes L2-normalized embeddings for a batch of utterance spectrograms.
        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Only the final hidden state is needed; per-step outputs and cell state are dropped.
        _, (hidden, _) = self.lstm(utterances, hidden_init)
        # Project the hidden state of the last layer and apply the ReLU
        projected = self.relu(self.linear(hidden[-1]))
        # L2-normalize each embedding
        norms = torch.norm(projected, dim=1, keepdim=True)
        return projected / norms

    def inference(self, utterances, hidden_init=None):
        """
        Returns the raw final hidden state of the last LSTM layer for a batch of utterance
        spectrograms; unlike forward(), no projection or normalization is applied.
        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the last layer's final hidden state as a tensor of shape
        (batch_size, hidden_size)
        """
        _, (hidden, _) = self.lstm(utterances, hidden_init)
        last_layer_state = hidden[-1]
        return last_layer_state

data_gen/tts/emotion/params_data.py ADDED Viewed

	@@ -0,0 +1,29 @@

## Mel-filterbank (durations in milliseconds)
mel_window_length = 25
mel_window_step = 10
mel_n_channels = 40

## Audio
sampling_rate = 16000
# Length of a partial utterance, in spectrogram frames (160 frames of 10 ms = 1600 ms)
partials_n_frames = 160
# Number of spectrogram frames at inference (80 frames of 10 ms = 800 ms)
inference_n_frames = 80

## Voice Activation Detection
# VAD window size in milliseconds; it must be 10, 20 or 30. It sets the granularity of the
# VAD and should not need to be changed.
vad_window_length = 30
# How many frames the moving average smoothing spans. Larger values smooth out larger
# variations of the VAD decision.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6

## Audio volume normalization
audio_norm_target_dBFS = -30

data_gen/tts/emotion/params_model.py ADDED Viewed

	@@ -0,0 +1,11 @@

## Model parameters (sizes of the LSTM hidden state and embedding, number of LSTM layers)
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3

## Training parameters (initial learning rate and batch composition)
learning_rate_init = 1e-4
speakers_per_batch = 6
utterances_per_speaker = 20

data_gen/tts/emotion/test_emotion.py ADDED Viewed

	@@ -0,0 +1,184 @@

+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Run inference for pre-processed data with a trained model.
+"""
+import logging
+import math
+import numpy, math, pdb, sys, random
+import time, os, itertools, shutil, importlib
+import argparse
+import os
+import sys
+import glob
+from sklearn import metrics
+import soundfile as sf
+#import sentencepiece as spm
+import torch
+import inference as encoder
+import torch.nn as nn
+import torch.nn.functional as F
+from pathlib import Path
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+from resemblyzer import VoiceEncoder, preprocess_wav
def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
    """
    Pick decision thresholds from the ROC curve and compute the equal error rate.

    :param scores: trial scores
    :param labels: trial labels, with 1 as the positive label
    :param target_fa: iterable of target false-acceptance rates, in percent
    :param target_fr: optional iterable of target false-rejection rates, in percent
    :return: (tunedThreshold, eer, fpr, fnr). tunedThreshold lists one
        [threshold, fpr, fnr] triple per target (false-rejection targets first); eer, fpr
        and fnr are expressed in percent.
    """
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
    fnr = (1 - tpr) * 100
    fpr = fpr * 100

    def operating_point(position):
        # Threshold and both error rates at one point of the ROC curve
        return [thresholds[position], fpr[position], fnr[position]]

    tuned = []
    if target_fr:
        tuned.extend(
            operating_point(numpy.nanargmin(numpy.absolute((tfr - fnr))))
            for tfr in target_fr
        )
    tuned.extend(
        operating_point(numpy.nanargmin(numpy.absolute((tfa - fpr))))
        for tfa in target_fa
    )
    # The EER is read where both error rates are closest to each other
    crossing = numpy.nanargmin(numpy.absolute((fnr - fpr)))
    eer = max(fpr[crossing], fnr[crossing])
    return (tuned, eer, fpr, fnr)
def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
    """
    Read an audio file and cut it into fixed-length segments.

    :param filename: path of the audio file, read with soundfile
    :param max_frames: segment length in frames; each segment spans
        max_frames * 160 + 240 samples
    :param evalmode: if True, take num_eval evenly spaced segments (or the whole audio when
        max_frames is 0); otherwise take a single segment at a random offset
    :param num_eval: number of segments taken in evaluation mode
    :return: torch.FloatTensor stacking the segments along the first dimension
    """
    # Maximum audio length
    max_audio = max_frames * 160 + 240
    # Read wav file; the sample rate is not used
    audio, _ = sf.read(filename)
    audiosize = audio.shape[0]
    # Zero-pad short audio symmetrically so that it is at least max_audio samples long
    if audiosize <= max_audio:
        shortage = math.floor((max_audio - audiosize + 1) / 2)
        audio = numpy.pad(audio, (shortage, shortage), 'constant', constant_values=0)
        audiosize = audio.shape[0]
    if evalmode and max_frames == 0:
        feats = [audio]
    else:
        if evalmode:
            startframe = numpy.linspace(0, audiosize - max_audio, num=num_eval)
        else:
            startframe = numpy.array([numpy.int64(random.random() * (audiosize - max_audio))])
        feats = [audio[int(asf):int(asf) + max_audio] for asf in startframe]
    feat = numpy.stack(feats, axis=0)
    return torch.FloatTensor(feat)
def evaluateFromList(listfilename, print_interval=100, test_path='', multi=False):
    """
    Embed every file referenced by a trial list and score each trial.

    Relies on the module-level voice_encoder created in the __main__ section and on CUDA
    being available (features are moved to the GPU for scoring).

    :param listfilename: text file with one trial per line: "[label] file_a file_b". When
        the label is missing, a random 0/1 label is used.
    :param print_interval: progress is printed every print_interval items
    :param test_path: directory prepended to the file names of the list
    :param multi: unused
    :return: (all_scores, all_labels, all_trials)
    """
    lines = []
    files = []
    feats = {}
    tstart = time.time()

    ## Read all lines
    with open(listfilename) as listfile:
        for line in listfile:
            data = line.split()
            ## Append random label if missing
            if len(data) == 2:
                data = [random.randint(0, 1)] + data
            files += [data[1], data[2]]
            lines.append(line)
    setfiles = sorted(set(files))

    ## Compute and cache the embedding of every distinct file
    for idx, fname in enumerate(setfiles):
        processed_wav = preprocess_wav(os.path.join(test_path, fname))
        embed = voice_encoder.embed_utterance(processed_wav)
        torch.cuda.empty_cache()
        ref_feat = torch.from_numpy(embed).unsqueeze(0)
        feats[fname] = ref_feat
        if idx % print_interval == 0:
            telapsed = time.time() - tstart
            sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d" % (
                idx, len(setfiles), idx / telapsed, ref_feat.size()[1]))
    print('')

    all_scores = []
    all_labels = []
    all_trials = []
    tstart = time.time()

    ## Read files and compute all scores
    for idx, line in enumerate(lines):
        data = line.split()
        ## Append random label if missing
        if len(data) == 2:
            data = [random.randint(0, 1)] + data
        # Normalize both features on the GPU
        ref_feat = F.normalize(feats[data[1]].cuda(), p=2, dim=1)
        com_feat = F.normalize(feats[data[2]].cuda(), p=2, dim=1)
        dist = F.pairwise_distance(ref_feat.unsqueeze(-1), com_feat.unsqueeze(-1))
        dist = dist.detach().cpu().numpy()
        # Higher score means more similar, hence the negated mean distance
        all_scores.append(-1 * numpy.mean(dist))
        all_labels.append(int(data[0]))
        all_trials.append(data[1] + " " + data[2])
        if idx % print_interval == 0:
            telapsed = time.time() - tstart
            sys.stdout.write("\rComputing %d of %d: %.2f Hz" % (idx, len(lines), idx / telapsed))
            sys.stdout.flush()
    print('\n')
    return (all_scores, all_labels, all_trials)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser("baseline")
+    parser.add_argument("--data_root", type=str, help="", required=True)
+    parser.add_argument("--list", type=str, help="", required=True)
+    parser.add_argument("--model_dir", type=str, help="model parameters for AudioEncoder", required=True)
+    args = parser.parse_args()
+    # Load the models one by one.
+    print("Preparing the encoder...")
+    # encoder.load_model(Path(args.model_dir))
+    print("Insert the wav file name...")
+    voice_encoder = VoiceEncoder().cuda()
+    sc, lab, trials = evaluateFromList(args.list, print_interval=100, test_path=args.data_root)
+    result = tuneThresholdfromScore(sc, lab, [1, 0.1]);
+    print('EER %2.4f'%result[1])

data_gen/tts/txt_processors/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from . import en

data_gen/tts/txt_processors/base_text_processor.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from data_gen.tts.data_gen_utils import is_sil_phoneme
+REGISTERED_TEXT_PROCESSORS = {}
+def register_txt_processors(name):
+    def _f(cls):
+        REGISTERED_TEXT_PROCESSORS[name] = cls
+        return cls
+    return _f
+def get_txt_processor_cls(name):
+    return REGISTERED_TEXT_PROCESSORS.get(name, None)
+class BaseTxtProcessor:
+    @staticmethod
+    def sp_phonemes():
+        return ['|']
+    @classmethod
+    def process(cls, txt, preprocess_args):
+        raise NotImplementedError
+    @classmethod
+    def postprocess(cls, txt_struct, preprocess_args):
+        # remove sil phoneme in head and tail
+        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
+            txt_struct = txt_struct[1:]
+        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
+            txt_struct = txt_struct[:-1]
+        if preprocess_args['with_phsep']:
+            txt_struct = cls.add_bdr(txt_struct)
+        if preprocess_args['add_eos_bos']:
+            txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
+        return txt_struct
+    @classmethod
+    def add_bdr(cls, txt_struct):
+        txt_struct_ = []
+        for i, ts in enumerate(txt_struct):
+            txt_struct_.append(ts)
+            if i != len(txt_struct) - 1 and \
+                    not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
+                txt_struct_.append(['|', ['|']])
+        return txt_struct_

data_gen/tts/txt_processors/en.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import re
+import unicodedata
+from g2p_en import G2p
+from g2p_en.expand import normalize_numbers
+from nltk import pos_tag
+from nltk.tokenize import TweetTokenizer
+from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
+from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
+class EnG2p(G2p):
+    word_tokenize = TweetTokenizer().tokenize
+    def __call__(self, text):
+        # preprocessing
+        words = EnG2p.word_tokenize(text)
+        tokens = pos_tag(words)  # tuples of (word, tag)
+        # steps
+        prons = []
+        for word, pos in tokens:
+            if re.search("[a-z]", word) is None:
+                pron = [word]
+            elif word in self.homograph2features:  # Check homograph
+                pron1, pron2, pos1 = self.homograph2features[word]
+                if pos.startswith(pos1):
+                    pron = pron1
+                else:
+                    pron = pron2
+            elif word in self.cmu:  # lookup CMU dict
+                pron = self.cmu[word][0]
+            else:  # predict for oov
+                pron = self.predict(word)
+            prons.extend(pron)
+            prons.extend([" "])
+        return prons[:-1]
+@register_txt_processors('en')
+class TxtProcessor(BaseTxtProcessor):
+    g2p = EnG2p()
+    @staticmethod
+    def preprocess_text(text):
+        text = normalize_numbers(text)
+        text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                       if unicodedata.category(char) != 'Mn')  # Strip accents
+        text = text.lower()
+        text = re.sub("[\'\"()]+", "", text)
+        text = re.sub("[-]+", " ", text)
+        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
+        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # !! -> !
+        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
+        text = text.replace("i.e.", "that is")
+        text = text.replace("i.e.", "that is")
+        text = text.replace("etc.", "etc")
+        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
+        text = re.sub(rf"\s+", r" ", text)
+        return text
+    @classmethod
+    def process(cls, txt, preprocess_args):
+        txt = cls.preprocess_text(txt).strip()
+        phs = cls.g2p(txt)
+        txt_struct = [[w, []] for w in txt.split(" ")]
+        i_word = 0
+        for p in phs:
+            if p == ' ':
+                i_word += 1
+            else:
+                txt_struct[i_word][1].append(p)
+        txt_struct = cls.postprocess(txt_struct, preprocess_args)
+        return txt_struct, txt

data_gen/tts/wav_processors/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import base_processor
2	+ from . import common_processors

data_gen/tts/wav_processors/base_processor.py ADDED Viewed

	@@ -0,0 +1,25 @@

+REGISTERED_WAV_PROCESSORS = {}
+def register_wav_processors(name):
+    def _f(cls):
+        REGISTERED_WAV_PROCESSORS[name] = cls
+        return cls
+    return _f
+def get_wav_processor_cls(name):
+    return REGISTERED_WAV_PROCESSORS.get(name, None)
+class BaseWavProcessor:
+    @property
+    def name(self):
+        raise NotImplementedError
+    def output_fn(self, input_fn):
+        return f'{input_fn[:-4]}_{self.name}.wav'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        raise NotImplementedError

data_gen/tts/wav_processors/common_processors.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+import subprocess
+import librosa
+import numpy as np
+from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
+from data_gen.tts.data_gen_utils import trim_long_silences
+from utils.audio import save_wav
+from utils.rnnoise import rnnoise
+from utils.hparams import hparams
+@register_wav_processors(name='sox_to_wav')
+class ConvertToWavProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'ToWav'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        if input_fn[-4:] == '.wav':
+            return input_fn, sr
+        else:
+            output_fn = self.output_fn(input_fn)
+            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True)
+            return output_fn, sr
+@register_wav_processors(name='sox_resample')
+class ResampleProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'Resample'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        sr_file = librosa.core.get_samplerate(input_fn)
+        if sr != sr_file:
+            subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True)
+            y, _ = librosa.core.load(input_fn, sr=sr)
+            y, _ = librosa.effects.trim(y)
+            save_wav(y, output_fn, sr)
+            return output_fn, sr
+        else:
+            return input_fn, sr
+@register_wav_processors(name='trim_sil')
+class TrimSILProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'TrimSIL'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        y, _ = librosa.core.load(input_fn, sr=sr)
+        y, _ = librosa.effects.trim(y)
+        save_wav(y, output_fn, sr)
+        return output_fn
+@register_wav_processors(name='trim_all_sil')
+class TrimAllSILProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'TrimSIL'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        y, audio_mask, _ = trim_long_silences(
+            input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12))
+        save_wav(y, output_fn, sr)
+        if preprocess_args['save_sil_mask']:
+            os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
+            np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
+        return output_fn, sr
+@register_wav_processors(name='denoise')
+class DenoiseProcessor(BaseWavProcessor):
+    @property
+    def name(self):
+        return 'Denoise'
+    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+        output_fn = self.output_fn(input_fn)
+        rnnoise(input_fn, output_fn, out_sample_rate=sr)
+        return output_fn, sr

egs/datasets/audio/emotion/base_text2mel.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+# Dataset settings for the emotion recipe (raw data under data/raw/ESD).
+# Directories for the raw, processed and binarized data.
+raw_data_dir: 'data/raw/ESD'
+processed_data_dir: 'data/processed/emotion'
+binary_data_dir: 'data/binary/emotion'
+# Dotted path of the pre-alignment class for this dataset.
+pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
+audio_sample_rate: 16000
+binarization_args:
+  shuffle: true
+# Dotted path of the binarizer class.
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+use_spk_id: true
+test_num: 200
+num_spk: 10
+pitch_type: frame
+min_frames: 128
+num_test_samples: 30
+# Presumably '|'-separated "loss_name:weight" pairs; confirm against the loss parser.
+mel_loss: "ssim:0.5|l1:0.5"
+vocoder_ckpt: ''
+use_emotion: true

egs/datasets/audio/emotion/pre_align.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob
+import re
+class EmoPreAlign(BasePreprocessor):
+    def meta_data(self):
+        spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
+        pattern = re.compile('[\t\n ]+')
+        for spk in spks:
+            for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'):  # 打开文件
+                line = re.sub(pattern, ' ', line)
+                if line == ' ': continue
+                split_ = line.split(' ')
+                txt = ' '.join(split_[1: -2])
+                item_name = split_[0]
+                emotion = split_[-2]
+                wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
+                yield item_name, wav_fn, txt, spk, emotion
+if __name__ == "__main__":
+    # Script entry point: build the pre-aligner and call its process() method.
+    EmoPreAlign().process()

egs/datasets/audio/libritts/base_text2mel.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+# Dataset settings for the LibriTTS text-to-mel recipe.
+# Directories for the raw, processed and binarized data.
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts'
+# Dotted path of the pre-alignment class for this dataset.
+pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
+binarization_args:
+  shuffle: true
+use_spk_id: true
+test_num: 200
+num_spk: 2320
+pitch_type: frame
+min_frames: 128
+num_test_samples: 30
+# Presumably '|'-separated "loss_name:weight" pairs; confirm against the loss parser.
+mel_loss: "ssim:0.5|l1:0.5"
+vocoder_ckpt: ''

egs/datasets/audio/libritts/fs2.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Parent configs for this recipe; presumably merged in the listed order by the
+# hparams loader (confirm).
+base_config:
+  - egs/egs_bases/tts/fs2.yaml
+  - ./base_text2mel.yaml

egs/datasets/audio/libritts/pre_align.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob
+class LibrittsPreAlign(BasePreprocessor):
+    def meta_data(self):
+        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+            with open(txt_fn, 'r') as f:
+                txt = f.readlines()
+                f.close()
+            spk = item_name.split("_")[0]
+            yield item_name, wav_fn, txt, spk
+if __name__ == "__main__":
+    # Script entry point: build the pre-aligner and call its process() method.
+    LibrittsPreAlign().process()

egs/datasets/audio/libritts/pwg.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+# LibriTTS vocoder recipe: settings below are added on top of the pwg.yaml
+# named in base_config.
+base_config: egs/egs_bases/tts/vocoder/pwg.yaml
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts_wav'
+generator_params:
+  kernel_size: 5
+num_spk: 400
+max_samples: 20480

egs/datasets/audio/lj/base_mel2wav.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+# Data directories and binarization options for the LJSpeech mel-to-wav recipe.
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'
+binarization_args:
+  with_spk_embed: false

egs/datasets/audio/lj/pre_align.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from data_gen.tts.base_preprocess import BasePreprocessor
+class LJPreAlign(BasePreprocessor):
+    def meta_data(self):
+        for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+            item_name, _, txt = l.strip().split("|")
+            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+            yield item_name, wav_fn, txt, 'SPK1'
+if __name__ == "__main__":
+    # Script entry point: build the pre-aligner and call its process() method.
+    LJPreAlign().process()

egs/datasets/audio/lj/pwg.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Parent configs for this recipe; presumably merged in the listed order by the
+# hparams loader (confirm).
+base_config:
+  - egs/egs_bases/tts/vocoder/pwg.yaml
+  - ./base_mel2wav.yaml

egs/datasets/audio/vctk/base_mel2wav.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Data directories for the VCTK mel-to-wav recipe.
+raw_data_dir: 'data/raw/VCTK-Corpus'
+processed_data_dir: 'data/processed/vctk'
+binary_data_dir: 'data/binary/vctk_wav'

egs/datasets/audio/vctk/fs2.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+# VCTK recipe: settings below are added on top of the fs2.yaml listed in
+# base_config.
+base_config:
+  - egs/egs_bases/tts/fs2.yaml
+raw_data_dir: 'data/raw/VCTK-Corpus'
+processed_data_dir: 'data/processed/vctk'
+binary_data_dir: 'data/binary/vctk'
+# Dotted path of the pre-alignment class for this dataset.
+pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign
+use_spk_id: true
+test_num: 200
+num_spk: 400
+binarization_args:
+  shuffle: true
+  trim_eos_bos: true

egs/datasets/audio/vctk/pre_align.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import os
+from data_gen.tts.base_pre_align import BasePreAlign
+import glob
+class VCTKPreAlign(BasePreAlign):
+    def meta_data(self):
+        wav_fns = glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav')
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            spk = item_name.split("_")[0]
+            txt_fn = wav_fn.split("/")
+            txt_fn[-1] = f'{item_name}.txt'
+            txt_fn[-3] = f'txt'
+            txt_fn = "/".join(txt_fn)
+            if os.path.exists(txt_fn) and os.path.exists(wav_fn):
+                yield item_name, wav_fn, (self.load_txt, txt_fn), spk
+if __name__ == "__main__":
+    # Script entry point: build the pre-aligner and call its process() method.
+    VCTKPreAlign().process()

egs/datasets/audio/vctk/pwg.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+# VCTK vocoder recipe: the parent configs in base_config plus the two settings
+# below.
+base_config:
+  - egs/egs_bases/tts/vocoder/pwg.yaml
+  - ./base_mel2wav.yaml
+num_spk: 400
+max_samples: 20480

egs/egs_bases/config_base.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+# task
+binary_data_dir: ''
+work_dir: '' # experiment directory.
+infer: false # inference
+amp: false
+seed: 1234
+debug: false
+save_codes: []
+#  - configs
+#  - modules
+#  - tasks
+#  - utils
+#  - usr
+#############
+# dataset
+#############
+ds_workers: 1
+test_num: 100
+endless_ds: false
+sort_by_len: true
+#########
+# train and eval
+#########
+print_nan_grads: false
+load_ckpt: ''
+save_best: true
+num_ckpt_keep: 3
+clip_grad_norm: 0
+accumulate_grad_batches: 1
+tb_log_interval: 100
+num_sanity_val_steps: 5  # steps of validation at the beginning
+check_val_every_n_epoch: 10
+val_check_interval: 2000
+valid_monitor_key: 'val_loss'
+valid_monitor_mode: 'min'
+max_epochs: 1000
+max_updates: 1000000
+max_tokens: 31250
+max_sentences: 100000
+max_valid_tokens: -1
+max_valid_sentences: -1
+test_input_dir: ''
+resume_from_checkpoint: 0
+rename_tmux: true