Wangyou Zhang committed
Commit f1a767f
1 Parent(s): 6a99087

Add model files

README.md ADDED
---
tags:
- espnet
- audio
- audio-to-audio
language:
datasets:
- chime4
license: cc-by-4.0
---

## ESPnet2 ENH model

### `espnet/Wangyou_Zhang_wsj0_2mix_enh_dc_crn_mapping_snr_raw`

This model was trained by Wangyou Zhang using the chime4 recipe in [espnet](https://github.com/espnet/espnet/).

### Demo: How to use in ESPnet2

```bash
cd espnet

pip install -e .
cd egs2/chime4/enh1
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_wsj0_2mix_enh_dc_crn_mapping_snr_raw
```
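
The recipe entry point above downloads the model and runs the full enhancement stage. For quick script-level inference, the packed model can also be loaded from Python. The sketch below is not part of the original card: it assumes `espnet_model_zoo` and `soundfile` are installed, and that `SeparateSpeech` accepts the `train_config`/`model_file` paths returned by `ModelDownloader.download_and_unpack` (matching the keys declared in this repository's `meta.yaml`).

```python
# Minimal sketch (assumptions: espnet_model_zoo and soundfile are installed;
# keyword names follow this repo's meta.yaml entries).
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech

d = ModelDownloader()
# download_and_unpack returns a dict like {"train_config": ..., "model_file": ...}
separate_speech = SeparateSpeech(
    **d.download_and_unpack("espnet/Wangyou_Zhang_wsj0_2mix_enh_dc_crn_mapping_snr_raw"),
    device="cpu",
)

# The model was trained on 8 kHz two-speaker mixtures (see the config below).
mixture, fs = sf.read("mixture_8k.wav")          # hypothetical input file
waves = separate_speech(mixture[None, :], fs=fs)  # list with one waveform per speaker
for i, w in enumerate(waves, 1):
    sf.write(f"separated_spk{i}.wav", w.squeeze(), fs)
```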

## ENH config

<details><summary>expand</summary>

```
config: conf/tuning/train_enh_dc_crn_mapping_snr.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: chunk
output_dir: exp/enh_train_enh_dc_crn_mapping_snr_raw
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 200
patience: 10
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - si_snr
  - max
- - valid
  - loss
  - min
keep_nbest_models: 1
nbest_averaging_interval: 0
grad_clip: 5
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_matplotlib: true
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
train_shape_file:
- exp/enh_stats_8k/train/speech_mix_shape
- exp/enh_stats_8k/train/speech_ref1_shape
- exp/enh_stats_8k/train/speech_ref2_shape
valid_shape_file:
- exp/enh_stats_8k/valid/speech_mix_shape
- exp/enh_stats_8k/valid/speech_ref1_shape
- exp/enh_stats_8k/valid/speech_ref2_shape
batch_type: folded
valid_batch_type: null
fold_length:
- 80000
- 80000
- 80000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 32000
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/tr_min_8k/wav.scp
  - speech_mix
  - sound
- - dump/raw/tr_min_8k/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/tr_min_8k/spk2.scp
  - speech_ref2
  - sound
valid_data_path_and_name_and_type:
- - dump/raw/cv_min_8k/wav.scp
  - speech_mix
  - sound
- - dump/raw/cv_min_8k/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/cv_min_8k/spk2.scp
  - speech_ref2
  - sound
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
  lr: 0.001
  eps: 1.0e-08
  weight_decay: 1.0e-07
  amsgrad: true
scheduler: steplr
scheduler_conf:
  step_size: 2
  gamma: 0.98
init: xavier_uniform
model_conf:
  stft_consistency: false
  loss_type: mask_mse
  mask_type: null
criterions:
- name: si_snr
  conf:
    eps: 1.0e-07
  wrapper: pit
  wrapper_conf:
    weight: 1.0
use_preprocessor: false
encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
separator: dc_crn
separator_conf:
  num_spk: 2
  input_channels:
  - 2
  - 16
  - 32
  - 64
  - 128
  - 256
  enc_hid_channels: 8
  enc_layers: 5
  glstm_groups: 2
  glstm_layers: 2
  glstm_bidirectional: true
  glstm_rearrange: false
  mode: mapping
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
required:
- output_dir
version: 0.10.7a1
distributed: false
```

</details>
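
As configured above, training uses a negative SI-SNR objective wrapped in a permutation-invariant (PIT) criterion over the two estimated speakers (`criterions` with `wrapper: pit`). The snippet below is an illustrative sketch of that computation, not the ESPnet implementation itself; the helper names are chosen for the example.

```python
# Illustrative sketch of the configured loss: negative SI-SNR with
# permutation-invariant training (PIT) over num_spk=2 sources.
from itertools import permutations

import torch


def si_snr(est: torch.Tensor, ref: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
    """Scale-invariant SNR in dB for (batch, samples) tensors."""
    est = est - est.mean(dim=-1, keepdim=True)
    ref = ref - ref.mean(dim=-1, keepdim=True)
    # Project the estimate onto the reference to get the target component.
    proj = (est * ref).sum(-1, keepdim=True) * ref / (ref.pow(2).sum(-1, keepdim=True) + eps)
    noise = est - proj
    return 10 * torch.log10(proj.pow(2).sum(-1) / (noise.pow(2).sum(-1) + eps) + eps)


def pit_si_snr_loss(est_list, ref_list):
    """Pick the speaker permutation with the best mean SI-SNR; return its negative."""
    losses = []
    for perm in permutations(range(len(ref_list))):
        snr = torch.stack([si_snr(est_list[p], ref_list[r]) for r, p in enumerate(perm)])
        losses.append(-snr.mean(dim=0))  # (batch,)
    # Best (lowest-loss) permutation per batch element, averaged over the batch.
    return torch.stack(losses, dim=0).min(dim=0).values.mean()
```

The `run.sh` recipe uses ESPnet's own `si_snr` criterion and `pit` wrapper; this sketch only mirrors their behaviour for two sources.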

### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456},
  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}

@inproceedings{li2021espnetse,
  title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
  booktitle={Proc. IEEE Spoken Language Technology Workshop (SLT)},
  pages={785--792},
  year={2021},
}
```

or arXiv:

```bibtex
@misc{watanabe2018espnet,
  title={ESPnet: End-to-End Speech Processing Toolkit},
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  year={2018},
  eprint={1804.00015},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@inproceedings{li2021espnetse,
  title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
  year={2020},
  eprint={2011.03706},
  archivePrefix={arXiv},
  primaryClass={eess.AS}
}
```
exp/enh_stats_8k/train/feats_stats.npz ADDED
Binary file (778 Bytes)
exp/enh_train_enh_dc_crn_mapping_snr_raw/200epoch.pth ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:718c8c9d64e63fb3cd8f869a68ecd1eba2d04152fd53de3c8d9eac62bc5479ba
size 33731881
exp/enh_train_enh_dc_crn_mapping_snr_raw/RESULTS.md ADDED
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
# RESULTS
## Environments
- date: `Tue Mar 29 03:04:38 CST 2022`
- python version: `3.8.12 (default, Oct 12 2021, 13:49:34) [GCC 7.5.0]`
- espnet version: `espnet 0.10.7a1`
- pytorch version: `pytorch 1.10.2+cu102`
- Git hash: `9c24b3adddbde3402530080cb58ae08a6f4dd642`
  - Commit date: `Wed Feb 23 14:49:15 2022 -0500`

## enh_train_enh_dc_crn_mapping_snr_raw

config: conf/tuning/train_enh_dc_crn_mapping_snr.yaml

|dataset|PESQ|STOI|SAR|SDR|SIR|SI_SNR|
|---|---|---|---|---|---|---|
|enhanced_cv_min_8k|3.181|0.931|14.862|14.164|24.021|13.734|
|enhanced_tt_min_8k|3.108|0.935|14.250|13.462|23.132|13.010|
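
The scores above are produced by `./scripts/utils/show_enh_score.sh` inside the recipe. To spot-check a single enhanced/reference pair outside the recipe, a minimal sketch using the third-party `pesq` and `pystoi` packages (an assumption; they are not part of this repository) could look like this:

```python
# Minimal sketch for spot-checking one enhanced/reference pair at 8 kHz.
# Assumes `pip install pesq pystoi soundfile`; file names are hypothetical.
import soundfile as sf
from pesq import pesq    # ITU-T P.862; narrow-band mode for 8 kHz audio
from pystoi import stoi  # short-time objective intelligibility

ref, fs = sf.read("reference_spk1.wav")  # clean reference
enh, _ = sf.read("separated_spk1.wav")   # enhanced/separated output
n = min(len(ref), len(enh))
ref, enh = ref[:n], enh[:n]              # align lengths before scoring

print("PESQ (nb):", pesq(fs, ref, enh, "nb"))
print("STOI:", stoi(ref, enh, fs, extended=False))
```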
exp/enh_train_enh_dc_crn_mapping_snr_raw/config.yaml ADDED
config: conf/tuning/train_enh_dc_crn_mapping_snr.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: chunk
output_dir: exp/enh_train_enh_dc_crn_mapping_snr_raw
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 200
patience: 10
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - si_snr
  - max
- - valid
  - loss
  - min
keep_nbest_models: 1
nbest_averaging_interval: 0
grad_clip: 5
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_matplotlib: true
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
train_shape_file:
- exp/enh_stats_8k/train/speech_mix_shape
- exp/enh_stats_8k/train/speech_ref1_shape
- exp/enh_stats_8k/train/speech_ref2_shape
valid_shape_file:
- exp/enh_stats_8k/valid/speech_mix_shape
- exp/enh_stats_8k/valid/speech_ref1_shape
- exp/enh_stats_8k/valid/speech_ref2_shape
batch_type: folded
valid_batch_type: null
fold_length:
- 80000
- 80000
- 80000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 32000
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/tr_min_8k/wav.scp
  - speech_mix
  - sound
- - dump/raw/tr_min_8k/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/tr_min_8k/spk2.scp
  - speech_ref2
  - sound
valid_data_path_and_name_and_type:
- - dump/raw/cv_min_8k/wav.scp
  - speech_mix
  - sound
- - dump/raw/cv_min_8k/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/cv_min_8k/spk2.scp
  - speech_ref2
  - sound
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
  lr: 0.001
  eps: 1.0e-08
  weight_decay: 1.0e-07
  amsgrad: true
scheduler: steplr
scheduler_conf:
  step_size: 2
  gamma: 0.98
init: xavier_uniform
model_conf:
  stft_consistency: false
  loss_type: mask_mse
  mask_type: null
criterions:
- name: si_snr
  conf:
    eps: 1.0e-07
  wrapper: pit
  wrapper_conf:
    weight: 1.0
use_preprocessor: false
encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
separator: dc_crn
separator_conf:
  num_spk: 2
  input_channels:
  - 2
  - 16
  - 32
  - 64
  - 128
  - 256
  enc_hid_channels: 8
  enc_layers: 5
  glstm_groups: 2
  glstm_layers: 2
  glstm_bidirectional: true
  glstm_rearrange: false
  mode: mapping
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
required:
- output_dir
version: 0.10.7a1
distributed: false
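
For orientation, the `encoder`/`decoder` entries in this config describe a 256-point STFT with a 128-sample hop, i.e. 32 ms windows shifted by 16 ms at the 8 kHz sampling rate, giving 129 frequency bins whose real and imaginary parts form the 2 input channels listed first in `input_channels`. The following is a small sketch with plain `torch.stft` (ESPnet uses its own STFT encoder class; the Hann window here is an assumption):

```python
# Sketch of the STFT front-end parameters in encoder_conf/decoder_conf above.
import torch

n_fft, hop = 256, 128       # 32 ms window / 16 ms hop at 8 kHz
x = torch.randn(1, 32000)   # one training chunk (chunk_length: 32000 samples)

spec = torch.stft(x, n_fft=n_fft, hop_length=hop,
                  window=torch.hann_window(n_fft),
                  return_complex=True)            # (1, 129, frames)
# The DC-CRN separator sees real/imag parts stacked as 2 input channels.
feats = torch.stack([spec.real, spec.imag], dim=1)  # (1, 2, 129, frames)

# The decoder inverts the same transform back to a waveform.
y = torch.istft(spec, n_fft=n_fft, hop_length=hop,
                window=torch.hann_window(n_fft), length=x.shape[-1])
print(feats.shape, torch.allclose(x, y, atol=1e-4))
```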
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/backward_time.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/forward_time.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/iter_time.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/loss.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/optim0_lr0.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/optim_step_time.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/si_snr_loss.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/images/train_time.png ADDED
exp/enh_train_enh_dc_crn_mapping_snr_raw/valid.loss.best.pth ADDED
200epoch.pth
meta.yaml ADDED
espnet: 0.10.7a1
files:
  model_file: exp/enh_train_enh_dc_crn_mapping_snr_raw/200epoch.pth
python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
timestamp: 1649684621.845351
torch: 1.10.2+cu102
yaml_files:
  train_config: exp/enh_train_enh_dc_crn_mapping_snr_raw/config.yaml