ESPnet
multilingual
audio
codec
ftshijt committed on
Commit
0e4fe9b
·
1 Parent(s): d029d58

Update model

Browse files
Files changed (29) hide show
  1. README.md +7 -4
  2. exp_bench_16k/codec_dac_16k_music/360epoch.pth +3 -0
  3. exp_bench_16k/codec_dac_16k_music/config.yaml +268 -0
  4. exp_bench_16k/codec_dac_16k_music/images/adv_loss.png +0 -0
  5. exp_bench_16k/codec_dac_16k_music/images/codec_commit_loss.png +0 -0
  6. exp_bench_16k/codec_dac_16k_music/images/codec_loss.png +0 -0
  7. exp_bench_16k/codec_dac_16k_music/images/codec_quantization_loss.png +0 -0
  8. exp_bench_16k/codec_dac_16k_music/images/discriminator_backward_time.png +0 -0
  9. exp_bench_16k/codec_dac_16k_music/images/discriminator_forward_time.png +0 -0
  10. exp_bench_16k/codec_dac_16k_music/images/discriminator_loss.png +0 -0
  11. exp_bench_16k/codec_dac_16k_music/images/discriminator_optim_step_time.png +0 -0
  12. exp_bench_16k/codec_dac_16k_music/images/discriminator_train_time.png +0 -0
  13. exp_bench_16k/codec_dac_16k_music/images/fake_loss.png +0 -0
  14. exp_bench_16k/codec_dac_16k_music/images/feat_match_loss.png +0 -0
  15. exp_bench_16k/codec_dac_16k_music/images/generator_backward_time.png +0 -0
  16. exp_bench_16k/codec_dac_16k_music/images/generator_forward_time.png +0 -0
  17. exp_bench_16k/codec_dac_16k_music/images/generator_optim_step_time.png +0 -0
  18. exp_bench_16k/codec_dac_16k_music/images/generator_train_time.png +0 -0
  19. exp_bench_16k/codec_dac_16k_music/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp_bench_16k/codec_dac_16k_music/images/iter_time.png +0 -0
  21. exp_bench_16k/codec_dac_16k_music/images/loss.png +0 -0
  22. exp_bench_16k/codec_dac_16k_music/images/mel_loss.png +0 -0
  23. exp_bench_16k/codec_dac_16k_music/images/mel_loss_real.png +0 -0
  24. exp_bench_16k/codec_dac_16k_music/images/optim0_lr0.png +0 -0
  25. exp_bench_16k/codec_dac_16k_music/images/optim1_lr0.png +0 -0
  26. exp_bench_16k/codec_dac_16k_music/images/real_loss.png +0 -0
  27. exp_bench_16k/codec_dac_16k_music/images/reconstruct_loss.png +0 -0
  28. exp_bench_16k/codec_dac_16k_music/images/train_time.png +0 -0
  29. meta.yaml +3 -3
README.md CHANGED
@@ -35,14 +35,14 @@ cd egs2/amuse/codec1
35
  <details><summary>expand</summary>
36
 
37
  ```
38
- config: conf/train_dac_large_v1.4_single.yaml
39
  print_config: false
40
  log_level: INFO
41
  drop_last_iter: false
42
  dry_run: false
43
  iterator_type: chunk
44
  valid_iterator_type: null
45
- output_dir: exp_bench_16k/codec_dac_16k_music_single
46
  ngpu: 1
47
  seed: 777
48
  num_workers: 1
@@ -53,7 +53,7 @@ dist_world_size: 2
53
  dist_rank: 0
54
  local_rank: 0
55
  dist_master_addr: localhost
56
- dist_master_port: 52201
57
  dist_launcher: null
58
  multiprocessing_distributed: true
59
  unused_parameters: true
@@ -209,7 +209,7 @@ codec_conf:
209
  decoder_trim_right_ratio: 1.0
210
  decoder_final_activation: null
211
  decoder_final_activation_params: null
212
- quantizer_n_q: 1
213
  quantizer_bins: 1024
214
  quantizer_decay: 0.99
215
  quantizer_kmeans_init: true
@@ -217,6 +217,9 @@ codec_conf:
217
  quantizer_threshold_ema_dead_code: 2
218
  quantizer_target_bandwidth:
219
  - 0.5
 
 
 
220
  quantizer_dropout: true
221
  sample_rate: 16000
222
  discriminator_params:
 
35
  <details><summary>expand</summary>
36
 
37
  ```
38
+ config: conf/train_dac_large_v1.4.yaml
39
  print_config: false
40
  log_level: INFO
41
  drop_last_iter: false
42
  dry_run: false
43
  iterator_type: chunk
44
  valid_iterator_type: null
45
+ output_dir: exp_bench_16k/codec_dac_16k_music
46
  ngpu: 1
47
  seed: 777
48
  num_workers: 1
 
53
  dist_rank: 0
54
  local_rank: 0
55
  dist_master_addr: localhost
56
+ dist_master_port: 60549
57
  dist_launcher: null
58
  multiprocessing_distributed: true
59
  unused_parameters: true
 
209
  decoder_trim_right_ratio: 1.0
210
  decoder_final_activation: null
211
  decoder_final_activation_params: null
212
+ quantizer_n_q: 8
213
  quantizer_bins: 1024
214
  quantizer_decay: 0.99
215
  quantizer_kmeans_init: true
 
217
  quantizer_threshold_ema_dead_code: 2
218
  quantizer_target_bandwidth:
219
  - 0.5
220
+ - 1
221
+ - 2
222
+ - 4
223
  quantizer_dropout: true
224
  sample_rate: 16000
225
  discriminator_params:
exp_bench_16k/codec_dac_16k_music/360epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf09eba829917e367a6fc68668ec12bdda1e4a752486bc9ca12d941959d382a8
3
+ size 283100815
exp_bench_16k/codec_dac_16k_music/config.yaml ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_dac_large_v1.4.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp_bench_16k/codec_dac_16k_music
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 1
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 2
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 60549
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ cudnn_enabled: true
27
+ cudnn_benchmark: false
28
+ cudnn_deterministic: false
29
+ use_tf32: false
30
+ collect_stats: false
31
+ write_collected_feats: false
32
+ max_epoch: 360
33
+ patience: null
34
+ val_scheduler_criterion:
35
+ - valid
36
+ - loss
37
+ early_stopping_criterion:
38
+ - valid
39
+ - loss
40
+ - min
41
+ best_model_criterion:
42
+ - - valid
43
+ - mel_loss
44
+ - min
45
+ - - train
46
+ - mel_loss
47
+ - min
48
+ - - train
49
+ - total_count
50
+ - max
51
+ keep_nbest_models: 5
52
+ nbest_averaging_interval: 0
53
+ grad_clip: -1
54
+ grad_clip_type: 2.0
55
+ grad_noise: false
56
+ accum_grad: 1
57
+ no_forward_run: false
58
+ resume: true
59
+ train_dtype: float32
60
+ use_amp: false
61
+ log_interval: 50
62
+ use_matplotlib: true
63
+ use_tensorboard: true
64
+ create_graph_in_tensorboard: false
65
+ use_wandb: false
66
+ wandb_project: null
67
+ wandb_id: null
68
+ wandb_entity: null
69
+ wandb_name: null
70
+ wandb_model_log_interval: -1
71
+ detect_anomaly: false
72
+ use_adapter: false
73
+ adapter: lora
74
+ save_strategy: all
75
+ adapter_conf: {}
76
+ pretrain_path: null
77
+ init_param: []
78
+ ignore_init_mismatch: false
79
+ freeze_param: []
80
+ num_iters_per_epoch: 5000
81
+ batch_size: 64
82
+ valid_batch_size: null
83
+ batch_bins: 1000000
84
+ valid_batch_bins: null
85
+ category_sample_size: 10
86
+ train_shape_file:
87
+ - exp_bench_16k/music_stats/train/audio_shape
88
+ valid_shape_file:
89
+ - exp_bench_16k/music_stats/valid/audio_shape
90
+ batch_type: unsorted
91
+ valid_batch_type: null
92
+ fold_length:
93
+ - 256000
94
+ sort_in_batch: descending
95
+ shuffle_within_batch: false
96
+ sort_batch: descending
97
+ multiple_iterator: false
98
+ chunk_length: 32000
99
+ chunk_shift_ratio: 0.5
100
+ num_cache_chunks: 256
101
+ chunk_excluded_key_prefixes: []
102
+ chunk_default_fs: null
103
+ chunk_max_abs_length: null
104
+ chunk_discard_short_samples: true
105
+ train_data_path_and_name_and_type:
106
+ - - dump_bench_16k/raw/music_train_1k/wav.scp
107
+ - audio
108
+ - kaldi_ark
109
+ valid_data_path_and_name_and_type:
110
+ - - dump_bench_16k/raw/dev-small/wav.scp
111
+ - audio
112
+ - kaldi_ark
113
+ multi_task_dataset: false
114
+ allow_variable_data_keys: false
115
+ max_cache_size: 0.0
116
+ max_cache_fd: 32
117
+ allow_multi_rates: false
118
+ valid_max_cache_size: null
119
+ exclude_weight_decay: false
120
+ exclude_weight_decay_conf: {}
121
+ optim: adamw
122
+ optim_conf:
123
+ lr: 0.0002
124
+ betas:
125
+ - 0.5
126
+ - 0.9
127
+ eps: 1.0e-09
128
+ weight_decay: 0.0
129
+ scheduler: exponentiallr
130
+ scheduler_conf:
131
+ gamma: 0.999875
132
+ optim2: adamw
133
+ optim2_conf:
134
+ lr: 0.0002
135
+ betas:
136
+ - 0.5
137
+ - 0.9
138
+ eps: 1.0e-09
139
+ weight_decay: 0.0
140
+ scheduler2: exponentiallr
141
+ scheduler2_conf:
142
+ gamma: 0.999875
143
+ generator_first: true
144
+ skip_discriminator_prob: 0.0
145
+ model_conf: {}
146
+ use_preprocessor: true
147
+ codec: dac
148
+ codec_conf:
149
+ sampling_rate: 16000
150
+ generator_params:
151
+ hidden_dim: 512
152
+ codebook_dim: 512
153
+ encdec_channels: 1
154
+ encdec_n_filters: 32
155
+ encdec_n_residual_layers: 3
156
+ encdec_ratios:
157
+ - 8
158
+ - 5
159
+ - 4
160
+ - 2
161
+ encdec_activation: Snake
162
+ encdec_norm: weight_norm
163
+ encdec_kernel_size: 7
164
+ encdec_residual_kernel_size: 7
165
+ encdec_last_kernel_size: 7
166
+ encdec_dilation_base: 2
167
+ encdec_causal: false
168
+ encdec_pad_mode: reflect
169
+ encdec_true_skip: false
170
+ encdec_compress: 2
171
+ encdec_lstm: 2
172
+ decoder_trim_right_ratio: 1.0
173
+ decoder_final_activation: null
174
+ decoder_final_activation_params: null
175
+ quantizer_n_q: 8
176
+ quantizer_bins: 1024
177
+ quantizer_decay: 0.99
178
+ quantizer_kmeans_init: true
179
+ quantizer_kmeans_iters: 50
180
+ quantizer_threshold_ema_dead_code: 2
181
+ quantizer_target_bandwidth:
182
+ - 0.5
183
+ - 1
184
+ - 2
185
+ - 4
186
+ quantizer_dropout: true
187
+ sample_rate: 16000
188
+ discriminator_params:
189
+ msmpmb_discriminator_params:
190
+ rates: []
191
+ sample_rate: 24000
192
+ fft_sizes:
193
+ - 2048
194
+ - 1024
195
+ - 512
196
+ periods:
197
+ - 2
198
+ - 3
199
+ - 5
200
+ - 7
201
+ - 11
202
+ period_discriminator_params:
203
+ in_channels: 1
204
+ out_channels: 1
205
+ kernel_sizes:
206
+ - 5
207
+ - 3
208
+ channels: 32
209
+ downsample_scales:
210
+ - 3
211
+ - 3
212
+ - 3
213
+ - 3
214
+ - 1
215
+ max_downsample_channels: 1024
216
+ bias: true
217
+ nonlinear_activation: LeakyReLU
218
+ nonlinear_activation_params:
219
+ negative_slope: 0.1
220
+ use_weight_norm: true
221
+ use_spectral_norm: false
222
+ band_discriminator_params:
223
+ hop_factor: 0.25
224
+ sample_rate: 24000
225
+ bands:
226
+ - - 0.0
227
+ - 0.1
228
+ - - 0.1
229
+ - 0.25
230
+ - - 0.25
231
+ - 0.5
232
+ - - 0.5
233
+ - 0.75
234
+ - - 0.75
235
+ - 1.0
236
+ channel: 32
237
+ generator_adv_loss_params:
238
+ average_by_discriminators: false
239
+ loss_type: mse
240
+ discriminator_adv_loss_params:
241
+ average_by_discriminators: false
242
+ loss_type: mse
243
+ use_feat_match_loss: true
244
+ feat_match_loss_params:
245
+ average_by_discriminators: false
246
+ average_by_layers: false
247
+ include_final_outputs: true
248
+ use_mel_loss: true
249
+ mel_loss_params:
250
+ range_start: 6
251
+ range_end: 11
252
+ window: hann
253
+ n_mels: 80
254
+ fmin: 0
255
+ fmax: null
256
+ log_base: null
257
+ fs: 16000
258
+ lambda_quantization: 0.25
259
+ lambda_commit: 1.0
260
+ lambda_reconstruct: 1.0
261
+ lambda_adv: 1.0
262
+ lambda_mel: 45.0
263
+ lambda_feat_match: 2.0
264
+ cache_generator_outputs: true
265
+ required:
266
+ - output_dir
267
+ version: '202402'
268
+ distributed: true
exp_bench_16k/codec_dac_16k_music/images/adv_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/codec_commit_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/codec_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/codec_quantization_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/discriminator_backward_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/discriminator_forward_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/discriminator_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/discriminator_optim_step_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/discriminator_train_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/fake_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/feat_match_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/generator_backward_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/generator_forward_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/generator_optim_step_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/generator_train_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/gpu_max_cached_mem_GB.png ADDED
exp_bench_16k/codec_dac_16k_music/images/iter_time.png ADDED
exp_bench_16k/codec_dac_16k_music/images/loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/mel_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/mel_loss_real.png ADDED
exp_bench_16k/codec_dac_16k_music/images/optim0_lr0.png ADDED
exp_bench_16k/codec_dac_16k_music/images/optim1_lr0.png ADDED
exp_bench_16k/codec_dac_16k_music/images/real_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/reconstruct_loss.png ADDED
exp_bench_16k/codec_dac_16k_music/images/train_time.png ADDED
meta.yaml CHANGED
@@ -1,8 +1,8 @@
1
  espnet: '202402'
2
  files:
3
- model_file: exp_bench_16k/codec_dac_16k_music_single/315epoch.pth
4
  python: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:26:55) [GCC 12.3.0]
5
- timestamp: 1735812806.908501
6
  torch: 2.6.0.dev20241209+cu124
7
  yaml_files:
8
- train_config: exp_bench_16k/codec_dac_16k_music_single/config.yaml
 
1
  espnet: '202402'
2
  files:
3
+ model_file: exp_bench_16k/codec_dac_16k_music/360epoch.pth
4
  python: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:26:55) [GCC 12.3.0]
5
+ timestamp: 1736237745.352415
6
  torch: 2.6.0.dev20241209+cu124
7
  yaml_files:
8
+ train_config: exp_bench_16k/codec_dac_16k_music/config.yaml