---
tags:
- espnet
- audio
- audio-to-audio
language:
datasets:
- chime4
license: cc-by-4.0
---

## ESPnet2 ENH model

### `espnet/Wangyou_Zhang_chime4_enh_train_enh_dc_crn_mapping_snr_raw`

This model was trained by Wangyou Zhang using the chime4 recipe in [espnet](https://github.com/espnet/espnet/).

### Demo: How to use in ESPnet2

```bash
cd espnet

pip install -e .
cd egs2/chime4/enh1
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_chime4_enh_train_enh_dc_crn_mapping_snr_raw
```
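
Alternatively, the model can be used directly from Python via the `espnet_model_zoo` downloader and the `SeparateSpeech` inference class. The snippet below is a minimal sketch rather than the recipe's official usage: the keyword names returned by `download_and_unpack`, the `fs` argument, and the file name `mixture.wav` are assumptions, so adjust them to your installed ESPnet version.

```python
# Minimal inference sketch (assumes `espnet` and `espnet_model_zoo` are installed).
# Argument names and shapes may differ slightly across ESPnet versions.
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech

d = ModelDownloader()
# download_and_unpack returns a dict of config/model paths that SeparateSpeech accepts
separate_speech = SeparateSpeech(
    **d.download_and_unpack("espnet/Wangyou_Zhang_chime4_enh_train_enh_dc_crn_mapping_snr_raw"),
    device="cpu",
)

# The separator was trained on 6-channel CHiME-4 simulated data at 16 kHz,
# so the mixture should be a multi-channel 16 kHz recording.
mixture, fs = sf.read("mixture.wav")                   # shape: (num_samples, num_channels)
enhanced = separate_speech(mixture[None, ...], fs=fs)  # add a leading batch dimension
sf.write("enhanced.wav", enhanced[0].squeeze(), fs)    # single-speaker output
```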

## ENH config

<details><summary>expand</summary>

```
config: conf/tuning/train_enh_dc_crn_mapping_snr.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: chunk
output_dir: exp/enh_train_enh_dc_crn_mapping_snr_raw
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: 2
dist_rank: 0
local_rank: 0
dist_master_addr: localhost
dist_master_port: 43524
dist_launcher: null
multiprocessing_distributed: true
unused_parameters: false
sharded_ddp: false
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 200
patience: 10
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - si_snr
  - max
- - valid
  - loss
  - min
keep_nbest_models: 1
nbest_averaging_interval: 0
grad_clip: 5
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_matplotlib: true
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
train_shape_file:
- exp/enh_stats_16k/train/speech_mix_shape
- exp/enh_stats_16k/train/speech_ref1_shape
valid_shape_file:
- exp/enh_stats_16k/valid/speech_mix_shape
- exp/enh_stats_16k/valid/speech_ref1_shape
batch_type: folded
valid_batch_type: null
fold_length:
- 80000
- 80000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 32000
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/tr05_simu_isolated_6ch_track/wav.scp
  - speech_mix
  - sound
- - dump/raw/tr05_simu_isolated_6ch_track/spk1.scp
  - speech_ref1
  - sound
valid_data_path_and_name_and_type:
- - dump/raw/dt05_simu_isolated_6ch_track/wav.scp
  - speech_mix
  - sound
- - dump/raw/dt05_simu_isolated_6ch_track/spk1.scp
  - speech_ref1
  - sound
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
  lr: 0.001
  eps: 1.0e-08
  weight_decay: 1.0e-07
  amsgrad: true
scheduler: steplr
scheduler_conf:
  step_size: 2
  gamma: 0.98
init: xavier_uniform
model_conf:
  stft_consistency: false
  loss_type: mask_mse
  mask_type: null
criterions:
- name: snr
  conf:
    eps: 1.0e-07
  wrapper: pit
  wrapper_conf:
    weight: 1.0
use_preprocessor: false
encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
separator: dc_crn
separator_conf:
  num_spk: 1
  input_channels:
  - 10
  - 16
  - 32
  - 64
  - 128
  - 256
  enc_hid_channels: 8
  enc_layers: 5
  glstm_groups: 2
  glstm_layers: 2
  glstm_bidirectional: true
  glstm_rearrange: false
  mode: mapping
  ref_channel: 3
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
required:
- output_dir
version: 0.10.7a1
distributed: true
```

</details>
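
For orientation, the snippet below works out the time-frequency geometry implied by the encoder/decoder settings above (`n_fft: 256`, `hop_length: 128`) and the 2-second training chunks (`chunk_length: 32000`); the 16 kHz sampling rate is inferred from the `enh_stats_16k` paths. This is an illustrative back-of-the-envelope check, not code from the recipe.

```python
# Back-of-the-envelope check of the STFT/chunk geometry in the config above.
# Assumption: 16 kHz audio, as suggested by the exp/enh_stats_16k paths.
fs = 16000                      # sampling rate (assumed)
n_fft, hop = 256, 128           # encoder_conf / decoder_conf
chunk_length = 32000            # training chunk in samples (2 s at 16 kHz)

window_ms = 1000 * n_fft / fs                           # 16 ms analysis window
hop_ms = 1000 * hop / fs                                # 8 ms frame shift
freq_bins = n_fft // 2 + 1                              # 129 frequency bins per frame
frames_per_chunk = 1 + (chunk_length - n_fft) // hop    # ~249 frames (padding ignored)

print(f"{window_ms} ms window, {hop_ms} ms hop, "
      f"{freq_bins} bins x {frames_per_chunk} frames per 2 s chunk")
```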

### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456},
  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}

@inproceedings{li2021espnetse,
  title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
  booktitle={Proc. IEEE Spoken Language Technology Workshop (SLT)},
  pages={785--792},
  year={2021},
}
```

or arXiv:

```bibtex
@misc{watanabe2018espnet,
  title={ESPnet: End-to-End Speech Processing Toolkit},
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  year={2018},
  eprint={1804.00015},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@inproceedings{li2021espnetse,
  title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
  year={2020},
  eprint={2011.03706},
  archivePrefix={arXiv},
  primaryClass={eess.AS}
}
```