Wangyou Zhang committed on
Commit
eb7d994
1 Parent(s): a61d5d2

initial commit

Browse files
Files changed (31) hide show
  1. README.md +348 -0
  2. exp/enh_train_enh_uses_refch0_2mem_raw/20epoch.pth +3 -0
  3. exp/enh_train_enh_uses_refch0_2mem_raw/RESULTS.md +25 -0
  4. exp/enh_train_enh_uses_refch0_2mem_raw/config.yaml +236 -0
  5. exp/enh_train_enh_uses_refch0_2mem_raw/images/backward_time.png +0 -0
  6. exp/enh_train_enh_uses_refch0_2mem_raw/images/clip.png +0 -0
  7. exp/enh_train_enh_uses_refch0_2mem_raw/images/forward_time.png +0 -0
  8. exp/enh_train_enh_uses_refch0_2mem_raw/images/gpu_max_cached_mem_GB.png +0 -0
  9. exp/enh_train_enh_uses_refch0_2mem_raw/images/grad_norm.png +0 -0
  10. exp/enh_train_enh_uses_refch0_2mem_raw/images/iter_time.png +0 -0
  11. exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_1ch_16k.png +0 -0
  12. exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_1ch_48k.png +0 -0
  13. exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_2ch_16k.png +0 -0
  14. exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_2ch_16k_r.png +0 -0
  15. exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_5ch_16k.png +0 -0
  16. exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_8ch_16k.png +0 -0
  17. exp/enh_train_enh_uses_refch0_2mem_raw/images/loss.png +0 -0
  18. exp/enh_train_enh_uses_refch0_2mem_raw/images/loss_scale.png +0 -0
  19. exp/enh_train_enh_uses_refch0_2mem_raw/images/optim0_lr0.png +0 -0
  20. exp/enh_train_enh_uses_refch0_2mem_raw/images/optim_step_time.png +0 -0
  21. exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_1ch_16k.png +0 -0
  22. exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_1ch_48k.png +0 -0
  23. exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_2ch_16k.png +0 -0
  24. exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_2ch_16k_r.png +0 -0
  25. exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_5ch_16k.png +0 -0
  26. exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_8ch_16k.png +0 -0
  27. exp/enh_train_enh_uses_refch0_2mem_raw/images/train_time.png +0 -0
  28. exp/enh_train_enh_uses_refch0_2mem_raw/latest.pth +1 -0
  29. exp/enh_train_enh_uses_refch0_2mem_raw/valid.loss.ave.pth +1 -0
  30. exp/enh_train_enh_uses_refch0_2mem_raw/valid.loss.best.pth +1 -0
  31. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,351 @@
1
  ---
 
 
 
 
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - audio-to-audio
6
+ language: en
7
+ datasets:
8
+ - vctk_noisy
9
+ - dns_ins20
10
+ - chime4
11
+ - reverb
12
+ - whamr
13
  license: cc-by-4.0
14
  ---
15
+
16
+ ## ESPnet2 ENH model
17
+
18
+ ### `espnet/Wangyou_Zhang_universal_train_enh_uses_refch0_2mem_raw`
19
+
20
+ This model was trained by Wangyou Zhang using the universal_se recipe in [espnet](https://github.com/espnet/espnet/).
21
+
22
+ ### Demo: How to use in ESPnet2
23
+
24
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
25
+ if you haven't done that already.
26
+
27
+ ```bash
28
+ cd espnet
29
+
30
+ pip install -e .
31
+ cd egs2/universal_se/enh1
32
+ ./run.sh --skip_data_prep false --skip_train true --is_tse_task false --download_model espnet/Wangyou_Zhang_universal_train_enh_uses_refch0_2mem_raw
33
+ ```
34
+
35
+ <!-- Generated by scripts/utils/show_enh_score.sh -->
36
+ # RESULTS
37
+ ## Environments
38
+ - date: `Sat Jul 15 12:50:47 CST 2023`
39
+ - python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
40
+ - espnet version: `espnet 202301`
41
+ - pytorch version: `pytorch 2.0.1`
42
+ - Git hash: ``
43
+ - Commit date: ``
44
+
45
+
46
+ ## USES (ref_channel=0, 2 groups of memory tokens)
47
+
48
+ |dataset|condition|PESQ_WB|STOI|SAR|SDR|SIR|SI_SNR|OVRL|SIG|BAK|P808_MOS|
49
+ |---|---|---|---|---|---|---|---|---|---|---|---|
50
+ |vctk_noisy_tt_2spk|1ch, 48kHz||93.05|10.97|10.97|0.00|8.36|3.14|3.39|4.05|3.57|
51
+ |vctk_noisy_tt_2spk_16k|1ch, 16kHz|3.11|95.03|21.51|21.51|0.00|19.45|3.19|3.46|4.06|3.57|
52
+ |dns20_tt_synthetic_no_reverb|1ch, 16kHz|3.23|97.77|19.63|19.63|0.00|19.72|3.32|3.56|4.10|4.04|
53
+ |dns20_tt_synthetic_with_reverb|1ch, 16kHz|2.75|89.87|13.40|13.40|0.00|12.90|2.36|2.85|3.21|3.37|
54
+ |chime4_et05_simu_isolated_6ch_track|5ch, 16kHz|2.95|97.82|18.30|18.30|0.00|17.24|3.22|3.47|4.07|3.75|
55
+ |reverb_et_simu_8ch_multich|8ch, 16kHz|2.09|89.83|11.94|11.94|0.00|-10.12|2.98|3.35|3.79|3.90|
56
+ |whamr_tt_mix_single_anechoic_max_16k|2ch, 16kHz|2.55|96.36|15.78|15.78|0.00|15.46|3.33|3.55|4.16|3.86|
57
+ |whamr_tt_mix_single_reverb_max_16k|2ch, 16kHz|2.51|95.98|13.75|13.75|0.00|12.51|3.32|3.54|4.15|3.86|
58
+ |chime4_et05_real_isolated_6ch_track_1ch|5ch, 16kHz|1.23|55.11|-2.34|-2.34|0.00|-30.45|3.07|3.36|3.98|3.75|
59
+ |reverb_et_real_8ch_multich|8ch, 16kHz|1.17|75.30|4.39|4.39|0.00|1.62|3.11|3.42|3.97|3.99|
60
+
61
+ ## ENH config
62
+
63
+ <details><summary>expand</summary>
64
+
65
+ ```
66
+ config: conf/tuning/train_enh_uses_refch0_2mem.yaml
67
+ print_config: false
68
+ log_level: INFO
69
+ dry_run: false
70
+ iterator_type: chunk
71
+ output_dir: exp/enh_train_enh_uses_refch0_2mem_raw
72
+ ngpu: 1
73
+ seed: 0
74
+ num_workers: 4
75
+ num_att_plot: 3
76
+ dist_backend: nccl
77
+ dist_init_method: env://
78
+ dist_world_size: 4
79
+ dist_rank: 0
80
+ local_rank: 0
81
+ dist_master_addr: localhost
82
+ dist_master_port: 33702
83
+ dist_launcher: null
84
+ multiprocessing_distributed: true
85
+ unused_parameters: true
86
+ sharded_ddp: false
87
+ cudnn_enabled: true
88
+ cudnn_benchmark: false
89
+ cudnn_deterministic: true
90
+ collect_stats: false
91
+ write_collected_feats: false
92
+ skip_stats_npz: false
93
+ max_epoch: 150
94
+ patience: 20
95
+ val_scheduler_criterion:
96
+ - valid
97
+ - loss
98
+ early_stopping_criterion:
99
+ - valid
100
+ - loss
101
+ - min
102
+ best_model_criterion:
103
+ - - valid
104
+ - loss
105
+ - min
106
+ keep_nbest_models: 1
107
+ nbest_averaging_interval: 0
108
+ grad_clip: 5.0
109
+ grad_clip_type: 2.0
110
+ grad_noise: false
111
+ accum_grad: 1
112
+ no_forward_run: false
113
+ resume: true
114
+ train_dtype: float32
115
+ use_amp: false
116
+ log_interval: null
117
+ use_matplotlib: true
118
+ use_tensorboard: true
119
+ create_graph_in_tensorboard: false
120
+ use_wandb: false
121
+ wandb_project: null
122
+ wandb_id: null
123
+ wandb_entity: null
124
+ wandb_name: null
125
+ wandb_model_log_interval: -1
126
+ detect_anomaly: false
127
+ pretrain_path: null
128
+ init_param: []
129
+ ignore_init_mismatch: false
130
+ freeze_param: []
131
+ num_iters_per_epoch: 8000
132
+ batch_size: 4
133
+ valid_batch_size: null
134
+ batch_bins: 1000000
135
+ valid_batch_bins: null
136
+ train_shape_file:
137
+ - exp/enh_stats_16k/train/speech_mix_shape
138
+ - exp/enh_stats_16k/train/speech_ref1_shape
139
+ - exp/enh_stats_16k/train/dereverb_ref1_shape
140
+ valid_shape_file:
141
+ - exp/enh_stats_16k/valid/speech_mix_shape
142
+ - exp/enh_stats_16k/valid/speech_ref1_shape
143
+ - exp/enh_stats_16k/valid/dereverb_ref1_shape
144
+ batch_type: folded
145
+ valid_batch_type: null
146
+ fold_length:
147
+ - 80000
148
+ - 80000
149
+ - 80000
150
+ sort_in_batch: descending
151
+ sort_batch: descending
152
+ multiple_iterator: false
153
+ chunk_length: 32000
154
+ chunk_shift_ratio: 0.5
155
+ num_cache_chunks: 1024
156
+ chunk_excluded_key_prefixes: []
157
+ chunk_discard_short_samples: false
158
+ train_data_path_and_name_and_type:
159
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/wav.scp
160
+ - speech_mix
161
+ - sound
162
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/spk1.scp
163
+ - speech_ref1
164
+ - sound
165
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/dereverb1.scp
166
+ - dereverb_ref1
167
+ - sound
168
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/utt2category
169
+ - category
170
+ - text
171
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/utt2fs
172
+ - fs
173
+ - text_int
174
+ valid_data_path_and_name_and_type:
175
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/wav.scp
176
+ - speech_mix
177
+ - sound
178
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/spk1.scp
179
+ - speech_ref1
180
+ - sound
181
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/dereverb1.scp
182
+ - dereverb_ref1
183
+ - sound
184
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/utt2category
185
+ - category
186
+ - text
187
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/utt2fs
188
+ - fs
189
+ - text_int
190
+ allow_variable_data_keys: false
191
+ max_cache_size: 0.0
192
+ max_cache_fd: 32
193
+ allow_multi_rates: true
194
+ valid_max_cache_size: null
195
+ exclude_weight_decay: false
196
+ exclude_weight_decay_conf: {}
197
+ optim: adam
198
+ optim_conf:
199
+ lr: 0.0004
200
+ eps: 1.0e-08
201
+ weight_decay: 1.0e-05
202
+ scheduler: warmupreducelronplateau
203
+ scheduler_conf:
204
+ warmup_steps: 25000
205
+ mode: min
206
+ factor: 0.5
207
+ patience: 2
208
+ init: null
209
+ model_conf:
210
+ normalize_variance: true
211
+ categories:
212
+ - 1ch_48k
213
+ - 1ch_16k
214
+ - 1ch_16k_r
215
+ - 2ch_16k
216
+ - 2ch_16k_r
217
+ - 5ch_16k
218
+ - 8ch_16k_r
219
+ criterions:
220
+ - name: mr_l1_tfd
221
+ conf:
222
+ window_sz:
223
+ - 256
224
+ - 512
225
+ - 768
226
+ - 1024
227
+ hop_sz: null
228
+ eps: 1.0e-08
229
+ time_domain_weight: 0.5
230
+ normalize_variance: true
231
+ wrapper: fixed_order
232
+ wrapper_conf:
233
+ weight: 1.0
234
+ - name: si_snr
235
+ conf:
236
+ eps: 1.0e-07
237
+ wrapper: fixed_order
238
+ wrapper_conf:
239
+ weight: 0.0
240
+ speech_volume_normalize: null
241
+ rir_scp: null
242
+ rir_apply_prob: 1.0
243
+ noise_scp: null
244
+ noise_apply_prob: 1.0
245
+ noise_db_range: '13_15'
246
+ short_noise_thres: 0.5
247
+ use_reverberant_ref: false
248
+ num_spk: 1
249
+ num_noise_type: 1
250
+ sample_rate: 8000
251
+ force_single_channel: false
252
+ channel_reordering: true
253
+ categories:
254
+ - 1ch_48k
255
+ - 1ch_16k
256
+ - 1ch_16k_r
257
+ - 2ch_16k
258
+ - 2ch_16k_r
259
+ - 5ch_16k
260
+ - 8ch_16k_r
261
+ dynamic_mixing: false
262
+ utt2spk: null
263
+ dynamic_mixing_gain_db: 0.0
264
+ encoder: stft
265
+ encoder_conf:
266
+ n_fft: 256
267
+ hop_length: 128
268
+ use_builtin_complex: false
269
+ separator: uses
270
+ separator_conf:
271
+ num_spk: 1
272
+ enc_channels: 256
273
+ bottleneck_size: 64
274
+ num_blocks: 6
275
+ num_spatial_blocks: 3
276
+ segment_size: 64
277
+ memory_size: 20
278
+ memory_types: 2
279
+ rnn_type: lstm
280
+ bidirectional: true
281
+ hidden_size: 128
282
+ att_heads: 4
283
+ dropout: 0.0
284
+ norm_type: cLN
285
+ activation: relu
286
+ ch_mode: tac
287
+ ch_att_dim: 256
288
+ eps: 1.0e-05
289
+ ref_channel: 0
290
+ decoder: stft
291
+ decoder_conf:
292
+ n_fft: 256
293
+ hop_length: 128
294
+ mask_module: multi_mask
295
+ mask_module_conf: {}
296
+ preprocessor: enh
297
+ preprocessor_conf: {}
298
+ required:
299
+ - output_dir
300
+ version: '202301'
301
+ distributed: true
302
+ ```
303
+
304
+ </details>
305
+
306
+
307
+
308
+ ### Citing ESPnet
309
+
310
+ ```bibtex
311
+ @inproceedings{watanabe2018espnet,
312
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
313
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
314
+ year={2018},
315
+ booktitle={Proceedings of Interspeech},
316
+ pages={2207--2211},
317
+ doi={10.21437/Interspeech.2018-1456},
318
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
319
+ }
320
+
321
+
322
+ @inproceedings{ESPnet-SE,
323
+ author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
324
+ Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
325
+ title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
326
+ booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
327
+ pages = {785--792},
328
+ publisher = {{IEEE}},
329
+ year = {2021},
330
+ url = {https://doi.org/10.1109/SLT48900.2021.9383615},
331
+ doi = {10.1109/SLT48900.2021.9383615},
332
+ timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
333
+ biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
334
+ bibsource = {dblp computer science bibliography, https://dblp.org}
335
+ }
336
+
337
+
338
+ ```
339
+
340
+ or arXiv:
341
+
342
+ ```bibtex
343
+ @misc{watanabe2018espnet,
344
+ title={ESPnet: End-to-End Speech Processing Toolkit},
345
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
346
+ year={2018},
347
+ eprint={1804.00015},
348
+ archivePrefix={arXiv},
349
+ primaryClass={cs.CL}
350
+ }
351
+ ```
exp/enh_train_enh_uses_refch0_2mem_raw/20epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be693ffb237fe0e44dc647d4df4f0ca8795833cdba63b914235918854402e03c
3
+ size 12296793
exp/enh_train_enh_uses_refch0_2mem_raw/RESULTS.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Sat Jul 15 12:50:47 CST 2023`
5
+ - python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
6
+ - espnet version: `espnet 202301`
7
+ - pytorch version: `pytorch 2.0.1`
8
+ - Git hash: ``
9
+ - Commit date: ``
10
+
11
+
12
+ ## USES (ref_channel=0, 2 groups of memory tokens)
13
+
14
+ |dataset|condition|PESQ_WB|STOI|SAR|SDR|SIR|SI_SNR|OVRL|SIG|BAK|P808_MOS|
15
+ |---|---|---|---|---|---|---|---|---|---|---|---|
16
+ |vctk_noisy_tt_2spk|1ch, 48kHz||93.05|10.97|10.97|0.00|8.36|3.14|3.39|4.05|3.57|
17
+ |vctk_noisy_tt_2spk_16k|1ch, 16kHz|3.11|95.03|21.51|21.51|0.00|19.45|3.19|3.46|4.06|3.57|
18
+ |dns20_tt_synthetic_no_reverb|1ch, 16kHz|3.23|97.77|19.63|19.63|0.00|19.72|3.32|3.56|4.10|4.04|
19
+ |dns20_tt_synthetic_with_reverb|1ch, 16kHz|2.75|89.87|13.40|13.40|0.00|12.90|2.36|2.85|3.21|3.37|
20
+ |chime4_et05_simu_isolated_6ch_track|5ch, 16kHz|2.95|97.82|18.30|18.30|0.00|17.24|3.22|3.47|4.07|3.75|
21
+ |reverb_et_simu_8ch_multich|8ch, 16kHz|2.09|89.83|11.94|11.94|0.00|-10.12|2.98|3.35|3.79|3.90|
22
+ |whamr_tt_mix_single_anechoic_max_16k|2ch, 16kHz|2.55|96.36|15.78|15.78|0.00|15.46|3.33|3.55|4.16|3.86|
23
+ |whamr_tt_mix_single_reverb_max_16k|2ch, 16kHz|2.51|95.98|13.75|13.75|0.00|12.51|3.32|3.54|4.15|3.86|
24
+ |chime4_et05_real_isolated_6ch_track_1ch|5ch, 16kHz|1.23|55.11|-2.34|-2.34|0.00|-30.45|3.07|3.36|3.98|3.75|
25
+ |reverb_et_real_8ch_multich|8ch, 16kHz|1.17|75.30|4.39|4.39|0.00|1.62|3.11|3.42|3.97|3.99|
exp/enh_train_enh_uses_refch0_2mem_raw/config.yaml ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_uses_refch0_2mem.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/enh_train_enh_uses_refch0_2mem_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 33702
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ skip_stats_npz: false
28
+ max_epoch: 150
29
+ patience: 20
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 1
42
+ nbest_averaging_interval: 0
43
+ grad_clip: 5.0
44
+ grad_clip_type: 2.0
45
+ grad_noise: false
46
+ accum_grad: 1
47
+ no_forward_run: false
48
+ resume: true
49
+ train_dtype: float32
50
+ use_amp: false
51
+ log_interval: null
52
+ use_matplotlib: true
53
+ use_tensorboard: true
54
+ create_graph_in_tensorboard: false
55
+ use_wandb: false
56
+ wandb_project: null
57
+ wandb_id: null
58
+ wandb_entity: null
59
+ wandb_name: null
60
+ wandb_model_log_interval: -1
61
+ detect_anomaly: false
62
+ pretrain_path: null
63
+ init_param: []
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 8000
67
+ batch_size: 4
68
+ valid_batch_size: null
69
+ batch_bins: 1000000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/enh_stats_16k/train/speech_mix_shape
73
+ - exp/enh_stats_16k/train/speech_ref1_shape
74
+ - exp/enh_stats_16k/train/dereverb_ref1_shape
75
+ valid_shape_file:
76
+ - exp/enh_stats_16k/valid/speech_mix_shape
77
+ - exp/enh_stats_16k/valid/speech_ref1_shape
78
+ - exp/enh_stats_16k/valid/dereverb_ref1_shape
79
+ batch_type: folded
80
+ valid_batch_type: null
81
+ fold_length:
82
+ - 80000
83
+ - 80000
84
+ - 80000
85
+ sort_in_batch: descending
86
+ sort_batch: descending
87
+ multiple_iterator: false
88
+ chunk_length: 32000
89
+ chunk_shift_ratio: 0.5
90
+ num_cache_chunks: 1024
91
+ chunk_excluded_key_prefixes: []
92
+ chunk_discard_short_samples: false
93
+ train_data_path_and_name_and_type:
94
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/wav.scp
95
+ - speech_mix
96
+ - sound
97
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/spk1.scp
98
+ - speech_ref1
99
+ - sound
100
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/dereverb1.scp
101
+ - dereverb_ref1
102
+ - sound
103
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/utt2category
104
+ - category
105
+ - text
106
+ - - dump/raw/train_dns20_vctk_whamr_chime4_reverb/utt2fs
107
+ - fs
108
+ - text_int
109
+ valid_data_path_and_name_and_type:
110
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/wav.scp
111
+ - speech_mix
112
+ - sound
113
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/spk1.scp
114
+ - speech_ref1
115
+ - sound
116
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/dereverb1.scp
117
+ - dereverb_ref1
118
+ - sound
119
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/utt2category
120
+ - category
121
+ - text
122
+ - - dump/raw/valid_dns20_vctk_whamr_chime4/utt2fs
123
+ - fs
124
+ - text_int
125
+ allow_variable_data_keys: false
126
+ max_cache_size: 0.0
127
+ max_cache_fd: 32
128
+ allow_multi_rates: true
129
+ valid_max_cache_size: null
130
+ exclude_weight_decay: false
131
+ exclude_weight_decay_conf: {}
132
+ optim: adam
133
+ optim_conf:
134
+ lr: 0.0004
135
+ eps: 1.0e-08
136
+ weight_decay: 1.0e-05
137
+ scheduler: warmupreducelronplateau
138
+ scheduler_conf:
139
+ warmup_steps: 25000
140
+ mode: min
141
+ factor: 0.5
142
+ patience: 2
143
+ init: null
144
+ model_conf:
145
+ normalize_variance: true
146
+ categories:
147
+ - 1ch_48k
148
+ - 1ch_16k
149
+ - 1ch_16k_r
150
+ - 2ch_16k
151
+ - 2ch_16k_r
152
+ - 5ch_16k
153
+ - 8ch_16k_r
154
+ criterions:
155
+ - name: mr_l1_tfd
156
+ conf:
157
+ window_sz:
158
+ - 256
159
+ - 512
160
+ - 768
161
+ - 1024
162
+ hop_sz: null
163
+ eps: 1.0e-08
164
+ time_domain_weight: 0.5
165
+ normalize_variance: true
166
+ wrapper: fixed_order
167
+ wrapper_conf:
168
+ weight: 1.0
169
+ - name: si_snr
170
+ conf:
171
+ eps: 1.0e-07
172
+ wrapper: fixed_order
173
+ wrapper_conf:
174
+ weight: 0.0
175
+ speech_volume_normalize: null
176
+ rir_scp: null
177
+ rir_apply_prob: 1.0
178
+ noise_scp: null
179
+ noise_apply_prob: 1.0
180
+ noise_db_range: '13_15'
181
+ short_noise_thres: 0.5
182
+ use_reverberant_ref: false
183
+ num_spk: 1
184
+ num_noise_type: 1
185
+ sample_rate: 8000
186
+ force_single_channel: false
187
+ channel_reordering: true
188
+ categories:
189
+ - 1ch_48k
190
+ - 1ch_16k
191
+ - 1ch_16k_r
192
+ - 2ch_16k
193
+ - 2ch_16k_r
194
+ - 5ch_16k
195
+ - 8ch_16k_r
196
+ dynamic_mixing: false
197
+ utt2spk: null
198
+ dynamic_mixing_gain_db: 0.0
199
+ encoder: stft
200
+ encoder_conf:
201
+ n_fft: 256
202
+ hop_length: 128
203
+ use_builtin_complex: false
204
+ separator: uses
205
+ separator_conf:
206
+ num_spk: 1
207
+ enc_channels: 256
208
+ bottleneck_size: 64
209
+ num_blocks: 6
210
+ num_spatial_blocks: 3
211
+ segment_size: 64
212
+ memory_size: 20
213
+ memory_types: 2
214
+ rnn_type: lstm
215
+ bidirectional: true
216
+ hidden_size: 128
217
+ att_heads: 4
218
+ dropout: 0.0
219
+ norm_type: cLN
220
+ activation: relu
221
+ ch_mode: tac
222
+ ch_att_dim: 256
223
+ eps: 1.0e-05
224
+ ref_channel: 0
225
+ decoder: stft
226
+ decoder_conf:
227
+ n_fft: 256
228
+ hop_length: 128
229
+ mask_module: multi_mask
230
+ mask_module_conf: {}
231
+ preprocessor: enh
232
+ preprocessor_conf: {}
233
+ required:
234
+ - output_dir
235
+ version: '202301'
236
+ distributed: true
exp/enh_train_enh_uses_refch0_2mem_raw/images/backward_time.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/clip.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/forward_time.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/grad_norm.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/iter_time.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_1ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_1ch_48k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_2ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_2ch_16k_r.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_5ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/l1_timedomain+magspec_loss_8ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/loss.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/loss_scale.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/optim0_lr0.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/optim_step_time.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_1ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_1ch_48k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_2ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_2ch_16k_r.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_5ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/si_snr_loss_8ch_16k.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/images/train_time.png ADDED
exp/enh_train_enh_uses_refch0_2mem_raw/latest.pth ADDED
@@ -0,0 +1 @@
 
 
1
+ 20epoch.pth
exp/enh_train_enh_uses_refch0_2mem_raw/valid.loss.ave.pth ADDED
@@ -0,0 +1 @@
 
 
1
+ 20epoch.pth
exp/enh_train_enh_uses_refch0_2mem_raw/valid.loss.best.pth ADDED
@@ -0,0 +1 @@
 
 
1
+ 20epoch.pth
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202301'
2
+ files:
3
+ model_file: exp/enh_train_enh_uses_refch0_2mem_raw/20epoch.pth
4
+ python: "3.8.16 (default, Mar 2 2023, 03:21:46) \n[GCC 11.2.0]"
5
+ timestamp: 1696137339.402181
6
+ torch: 2.0.1
7
+ yaml_files:
8
+ train_config: exp/enh_train_enh_uses_refch0_2mem_raw/config.yaml