ftshijt committed on Dec 21, 2023

Commit

ff4cb43

•

1 Parent(s): f82eb96

Update model

Browse files

Files changed (23) hide show

README.md +328 -1
exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz +3 -0
exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz +3 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/500epoch.pth +3 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/config.yaml +247 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/backward_time.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/clip.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/decoder_alpha.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/duration_loss.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/encoder_alpha.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/forward_time.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/grad_norm.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/iter_time.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss_scale.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/mel_loss.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim_step_time.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/pitch_loss.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/train_time.png +0 -0
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/vuv_loss.png +0 -0
meta.yaml +8 -0

README.md CHANGED Viewed

@@ -1,3 +1,330 @@
 ---
-license: apache-2.0
 ---

 ---
+tags:
+- espnet
+- audio
+- singing-voice-synthesis
+language: ja
+datasets:
+- kiritan
+license: cc-by-4.0
 ---
+## ESPnet2 SVS model
+### `espnet/kiritan_svs_xiaoice`
+This model was trained by ftshijt using the kiritan recipe in [espnet](https://github.com/espnet/espnet/).
+### Demo: How to use in ESPnet2
+Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
+if you haven't done that already.
+```bash
+cd espnet
+git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
+pip install -e .
+cd egs2/kiritan/svs1
+./run.sh --skip_data_prep false --skip_train true --download_model espnet/kiritan_svs_xiaoice
+```
+## SVS config
+<details><summary>expand</summary>
+
+```
+config: conf/tuning/train_xiaoice.yaml
+print_config: false
+log_level: INFO
+drop_last_iter: false
+dry_run: false
+iterator_type: sequence
+valid_iterator_type: null
+output_dir: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp
+ngpu: 1
+seed: 0
+num_workers: 10
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 500
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min
+keep_nbest_models: 5
+nbest_averaging_interval: 0
+grad_clip: 1.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+use_lora: false
+save_lora_only: true
+lora_conf: {}
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: 500
+batch_size: 16
+valid_batch_size: null
+batch_bins: 1000000
+valid_batch_bins: null
+train_shape_file:
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
+valid_shape_file:
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
+batch_type: sorted
+valid_batch_type: null
+fold_length:
+- 150
+- 240000
+sort_in_batch: descending
+shuffle_within_batch: false
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+chunk_excluded_key_prefixes: []
+chunk_default_fs: null
+train_data_path_and_name_and_type:
+-   - dump/raw/tr_no_dev/text
+    - text
+    - text
+-   - dump/raw/tr_no_dev/wav.scp
+    - singing
+    - sound
+-   - dump/raw/tr_no_dev/label
+    - label
+    - duration
+-   - dump/raw/tr_no_dev/score.scp
+    - score
+    - score
+valid_data_path_and_name_and_type:
+-   - dump/raw/dev/text
+    - text
+    - text
+-   - dump/raw/dev/wav.scp
+    - singing
+    - sound
+-   - dump/raw/dev/label
+    - label
+    - duration
+-   - dump/raw/dev/score.scp
+    - score
+    - score
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+allow_multi_rates: false
+valid_max_cache_size: null
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+optim: adam
+optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
+token_list:
+- <blank>
+- <unk>
+- pau
+- a
+- i
+- o
+- e
+- u
+- k
+- n
+- r
+- t
+- m
+- d
+- s
+- N
+- sh
+- g
+- y
+- b
+- w
+- cl
+- ts
+- z
+- ch
+- j
+- h
+- f
+- p
+- ky
+- ry
+- hy
+- py
+- ny
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: phn
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: pyopenjtalk
+fs: 24000
+score_feats_extract: syllable_score_feats
+score_feats_extract_conf:
+    fs: 24000
+    n_fft: 2048
+    win_length: 1200
+    hop_length: 300
+feats_extract: fbank
+feats_extract_conf:
+    n_fft: 2048
+    hop_length: 300
+    win_length: 1200
+    fs: 24000
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: global_mvn
+normalize_conf:
+    stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
+svs: xiaoice
+svs_conf:
+    midi_dim: 129
+    duration_dim: 500
+    adim: 384
+    aheads: 4
+    elayers: 6
+    eunits: 1536
+    dlayers: 6
+    dunits: 1536
+    postnet_layers: 5
+    postnet_chans: 512
+    postnet_filts: 5
+    postnet_dropout_rate: 0.5
+    use_batch_norm: true
+    reduction_factor: 1
+    init_type: pytorch
+    use_masking: true
+    loss_function: XiaoiceSing2
+    loss_type: L1
+    lambda_mel: 1
+    lambda_dur: 0.1
+    lambda_pitch: 0.01
+    lambda_vuv: 0.01
+pitch_extract: dio
+pitch_extract_conf:
+    use_token_averaged_f0: false
+    fs: 24000
+    n_fft: 2048
+    hop_length: 300
+    f0max: 800
+    f0min: 80
+    reduction_factor: 1
+pitch_normalize: global_mvn
+pitch_normalize_conf:
+    stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
+ying_extract: null
+ying_extract_conf: {}
+energy_extract: null
+energy_extract_conf: {}
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202310'
+distributed: false
+```
+</details>
+### Citing ESPnet
+```bibtex
+@inproceedings{watanabe2018espnet,
+  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+  title={{ESPnet}: End-to-End Speech Processing Toolkit},
+  year={2018},
+  booktitle={Proceedings of Interspeech},
+  pages={2207--2211},
+  doi={10.21437/Interspeech.2018-1456},
+  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
+}
+@inproceedings{shi22d_interspeech,
+  author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
+  title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
+  year=2022,
+  booktitle={Proc. Interspeech 2022},
+  pages={4277--4281},
+  doi={10.21437/Interspeech.2022-10039}
+}
+```
+or arXiv:
+```bibtex
+@misc{watanabe2018espnet,
+  title={ESPnet: End-to-End Speech Processing Toolkit},
+  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+  year={2018},
+  eprint={1804.00015},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+```

exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3f8760a03a38e9f6aedceafc940853e135de03be88dc0f400f80111012ae2f4
+size 1402

exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:472f44554816456248d361dd80b0a2a3d17c6dc420486a72fd7a0eedb2144f99
+size 770

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/500epoch.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87d4d724bc5815af66b36528e34e25a91d27cb18458015945fe19a60aef66138
+size 107701515

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/config.yaml ADDED Viewed

	@@ -0,0 +1,247 @@

+config: conf/tuning/train_xiaoice.yaml
+print_config: false
+log_level: INFO
+drop_last_iter: false
+dry_run: false
+iterator_type: sequence
+valid_iterator_type: null
+output_dir: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp
+ngpu: 1
+seed: 0
+num_workers: 10
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 500
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min
+keep_nbest_models: 5
+nbest_averaging_interval: 0
+grad_clip: 1.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+use_lora: false
+save_lora_only: true
+lora_conf: {}
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: 500
+batch_size: 16
+valid_batch_size: null
+batch_bins: 1000000
+valid_batch_bins: null
+train_shape_file:
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
+valid_shape_file:
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
+- exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
+batch_type: sorted
+valid_batch_type: null
+fold_length:
+- 150
+- 240000
+sort_in_batch: descending
+shuffle_within_batch: false
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+chunk_excluded_key_prefixes: []
+chunk_default_fs: null
+train_data_path_and_name_and_type:
+-   - dump/raw/tr_no_dev/text
+    - text
+    - text
+-   - dump/raw/tr_no_dev/wav.scp
+    - singing
+    - sound
+-   - dump/raw/tr_no_dev/label
+    - label
+    - duration
+-   - dump/raw/tr_no_dev/score.scp
+    - score
+    - score
+valid_data_path_and_name_and_type:
+-   - dump/raw/dev/text
+    - text
+    - text
+-   - dump/raw/dev/wav.scp
+    - singing
+    - sound
+-   - dump/raw/dev/label
+    - label
+    - duration
+-   - dump/raw/dev/score.scp
+    - score
+    - score
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+allow_multi_rates: false
+valid_max_cache_size: null
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+optim: adam
+optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
+token_list:
+- <blank>
+- <unk>
+- pau
+- a
+- i
+- o
+- e
+- u
+- k
+- n
+- r
+- t
+- m
+- d
+- s
+- N
+- sh
+- g
+- y
+- b
+- w
+- cl
+- ts
+- z
+- ch
+- j
+- h
+- f
+- p
+- ky
+- ry
+- hy
+- py
+- ny
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: phn
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: pyopenjtalk
+fs: 24000
+score_feats_extract: syllable_score_feats
+score_feats_extract_conf:
+    fs: 24000
+    n_fft: 2048
+    win_length: 1200
+    hop_length: 300
+feats_extract: fbank
+feats_extract_conf:
+    n_fft: 2048
+    hop_length: 300
+    win_length: 1200
+    fs: 24000
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: global_mvn
+normalize_conf:
+    stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
+svs: xiaoice
+svs_conf:
+    midi_dim: 129
+    duration_dim: 500
+    adim: 384
+    aheads: 4
+    elayers: 6
+    eunits: 1536
+    dlayers: 6
+    dunits: 1536
+    postnet_layers: 5
+    postnet_chans: 512
+    postnet_filts: 5
+    postnet_dropout_rate: 0.5
+    use_batch_norm: true
+    reduction_factor: 1
+    init_type: pytorch
+    use_masking: true
+    loss_function: XiaoiceSing2
+    loss_type: L1
+    lambda_mel: 1
+    lambda_dur: 0.1
+    lambda_pitch: 0.01
+    lambda_vuv: 0.01
+pitch_extract: dio
+pitch_extract_conf:
+    use_token_averaged_f0: false
+    fs: 24000
+    n_fft: 2048
+    hop_length: 300
+    f0max: 800
+    f0min: 80
+    reduction_factor: 1
+pitch_normalize: global_mvn
+pitch_normalize_conf:
+    stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
+ying_extract: null
+ying_extract_conf: {}
+energy_extract: null
+energy_extract_conf: {}
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202310'
+distributed: false

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/backward_time.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/clip.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/decoder_alpha.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/duration_loss.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/encoder_alpha.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/forward_time.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/grad_norm.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/iter_time.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss_scale.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/mel_loss.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim_step_time.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/pitch_loss.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/train_time.png ADDED Viewed

exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/vuv_loss.png ADDED Viewed

meta.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+espnet: '202310'
+files:
+  model_file: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/500epoch.pth
+python: "3.9.16 (main, Mar  8 2023, 14:00:05) \n[GCC 11.2.0]"
+timestamp: 1703139274.806801
+torch: 1.13.1+cu117
+yaml_files:
+  train_config: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/config.yaml