Fhrozen committed on
Commit
6ac9713
1 Parent(s): 5adebd5
README.md CHANGED
@@ -1,3 +1,39 @@
1
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - audio-to-audio
6
+ - vocoder
7
+ language:
8
+ - multilingual
9
+ datasets:
10
+ - libritts
11
+ - csj
12
+ - css10
13
+ - aishell3
14
+ - jvs
15
+ - jsss
16
+ - jsut
17
  license: cc-by-4.0
18
  ---
19
+
20
+ ## Vocoder model - FastDiff
21
+
22
+ **No support is provided for this model.**
23
+
24
+ ### Details
25
+
26
+ ```
27
+ num_iters_per_epoch: 250
28
+ max_epoch: 1000
29
+ batch_size: 64
30
+ vocoder_conf:
31
+ audio_channels: 1
32
+ inner_channels: 32
33
+ cond_channels: 80
34
+ upsample_ratios:
35
+ - 5
36
+ - 5
37
+ - 4
38
+ - 3
39
+ ```
exp/tts_vocoder_fastdiff/config.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/fastdiff.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_vocoder_fastdiff
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 6
10
+ num_att_plot: 3
11
+ num_valid_artifacts: 5
12
+ dist_backend: nccl
13
+ dist_init_method: env://
14
+ dist_world_size: null
15
+ dist_rank: null
16
+ local_rank: 0
17
+ dist_master_addr: null
18
+ dist_master_port: null
19
+ dist_launcher: null
20
+ multiprocessing_distributed: false
21
+ unused_parameters: false
22
+ sharded_ddp: false
23
+ growth_interval: 0
24
+ min_grad_scale: -1
25
+ cudnn_enabled: true
26
+ cudnn_benchmark: false
27
+ cudnn_deterministic: true
28
+ collect_stats: false
29
+ write_collected_feats: false
30
+ max_epoch: 1000
31
+ patience: null
32
+ val_scheduler_criterion:
33
+ - valid
34
+ - loss
35
+ early_stopping_criterion:
36
+ - valid
37
+ - loss
38
+ - min
39
+ best_model_criterion:
40
+ - - valid
41
+ - loss
42
+ - min
43
+ - - train
44
+ - loss
45
+ - min
46
+ keep_nbest_models: 5
47
+ nbest_averaging_interval: 0
48
+ grad_clip: 1.0
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: null
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: 250
65
+ batch_size: 64
66
+ valid_batch_size: null
67
+ valid_num_batches: 100
68
+ batch_bins: 1000000
69
+ valid_batch_bins: 3000000
70
+ train_shape_file:
71
+ - exp/voc_stats_raw/train/text_shape.char
72
+ - exp/voc_stats_raw/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/voc_stats_raw/valid/text_shape.char
75
+ - exp/voc_stats_raw/valid/speech_shape
76
+ batch_type: unsorted
77
+ valid_batch_type: numel
78
+ fold_length:
79
+ - 150
80
+ - 240000
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train_no_dev/text
89
+ - text
90
+ - text
91
+ - - dump/raw/train_no_dev/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adamw
106
+ optim_conf:
107
+ lr: 0.0002
108
+ betas:
109
+ - 0.9
110
+ - 0.98
111
+ eps: 1.0e-09
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ collate_fn_conf: {}
115
+ token_list: dump/token_list/char/tokens.txt
116
+ odim: null
117
+ model_conf: {}
118
+ use_preprocessor: true
119
+ token_type: char
120
+ bpemodel: null
121
+ non_linguistic_symbols: null
122
+ cleaner: null
123
+ g2p: null
124
+ feats_extract: fbank
125
+ feats_extract_conf:
126
+ n_fft: 2048
127
+ hop_length: 300
128
+ win_length: 1200
129
+ fs: 24000
130
+ fmin: 80
131
+ fmax: 7600
132
+ n_mels: 80
133
+ normalize: global_mvn
134
+ normalize_conf:
135
+ stats_file: exp/voc_stats_raw/train/feats_stats.npz
136
+ vocoder: fastdiff
137
+ vocoder_conf:
138
+ audio_channels: 1
139
+ inner_channels: 32
140
+ cond_channels: 80
141
+ upsample_ratios:
142
+ - 5
143
+ - 5
144
+ - 4
145
+ - 3
146
+ lvc_layers_each_block: 4
147
+ lvc_kernel_size: 3
148
+ kpnet_hidden_channels: 64
149
+ kpnet_conv_size: 3
150
+ dropout: 0.05
151
+ diffusion_step_embed_dim_in: 128
152
+ diffusion_step_embed_dim_mid: 512
153
+ diffusion_step_embed_dim_out: 512
154
+ use_weight_norm: true
155
+ mel_loss_params:
156
+ fs: 24000
157
+ fft_size: 2048
158
+ hop_size: 300
159
+ win_length: 1200
160
+ window: hann
161
+ num_mels: 80
162
+ fmin: 0
163
+ fmax: 12000
164
+ log_base: null
165
+ pitch_extract: null
166
+ pitch_extract_conf: {}
167
+ energy_extract: null
168
+ energy_extract_conf: {}
169
+ pitch_normalize: null
170
+ pitch_normalize_conf: {}
171
+ energy_normalize: null
172
+ energy_normalize_conf: {}
173
+ required:
174
+ - output_dir
175
+ version: '202207'
176
+ distributed: false
exp/tts_vocoder_fastdiff/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:730b9c876a8e4ef6a41a39437a0dd604c91974816921274986be788badfcf0b9
3
+ size 81577071
exp/voc_stats_raw/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e88a8e0ee800d551db34b05e14c39f627ee77200a44977678ad57d30f22df68a
3
+ size 1402
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202207'
2
+ files:
3
+ model_file: exp/tts_vocoder_fastdiff/train.loss.ave_5best.pth
4
+ python: "3.9.13 (main, Aug 25 2022, 23:26:10) \n[GCC 11.2.0]"
5
+ timestamp: 1673910959.463416
6
+ torch: 1.12.1
7
+ yaml_files:
8
+ train_config: exp/tts_vocoder_fastdiff/config.yaml