---
tags:
- espnet
- audio
- audio-to-audio
language:
datasets:
- chime4
license: cc-by-4.0
---

## ESPnet2 ENH model

### `espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw`

This model was trained by Wangyou Zhang using the chime4 recipe in [espnet](https://github.com/espnet/espnet/).

### Demo: How to use in ESPnet2

```bash
cd espnet

pip install -e .
cd egs2/chime4/enh1
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw
```
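
If you prefer to call the enhancement model from Python rather than through the recipe, the following is a minimal sketch (not taken from this repository). It assumes the `espnet_model_zoo` package and the `SeparateSpeech` interface in `espnet2.bin.enh_inference`; the exact keyword arguments returned by `download_and_unpack`, and support for the Hugging Face model name, may vary across ESPnet versions. `noisy_example.wav` is a hypothetical input file.

```python
# Minimal sketch: run the enhancement model on a single wav file from Python.
# Assumes `espnet_model_zoo` and `soundfile` are installed; argument names and
# model-name resolution may differ slightly across ESPnet versions.
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech

d = ModelDownloader()
# Downloads and unpacks the packed model, returning the config/checkpoint
# paths expected by SeparateSpeech.
enh_model = SeparateSpeech(
    **d.download_and_unpack("espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"),
    device="cpu",
)

speech_mix, fs = sf.read("noisy_example.wav")  # hypothetical 16 kHz mono input
enhanced = enh_model(speech_mix[None, :], fs=fs)  # list with one waveform per speaker
sf.write("enhanced_example.wav", enhanced[0].squeeze(), fs)
```

Since the separator is configured with `num_spk: 1` (see the config below), the returned list contains a single enhanced waveform.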

## ENH config

<details><summary>expand</summary>

```
config: conf/tuning/train_enh_conv_tasnet.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: chunk
output_dir: exp/enh_train_enh_conv_tasnet_raw
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: 2
dist_rank: 0
local_rank: 0
dist_master_addr: localhost
dist_master_port: 57680
dist_launcher: null
multiprocessing_distributed: true
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: 4
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - si_snr
  - max
- - valid
  - loss
  - min
keep_nbest_models: 1
grad_clip: 5.0
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
unused_parameters: false
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
pretrain_path: null
init_param: []
freeze_param: []
num_iters_per_epoch: null
batch_size: 8
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
train_shape_file:
- exp/enh_stats_16k/train/speech_mix_shape
- exp/enh_stats_16k/train/speech_ref1_shape
valid_shape_file:
- exp/enh_stats_16k/valid/speech_mix_shape
- exp/enh_stats_16k/valid/speech_ref1_shape
batch_type: folded
valid_batch_type: null
fold_length:
- 80000
- 80000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 32000
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/tr05_simu_isolated_1ch_track/wav.scp
  - speech_mix
  - sound
- - dump/raw/tr05_simu_isolated_1ch_track/spk1.scp
  - speech_ref1
  - sound
valid_data_path_and_name_and_type:
- - dump/raw/dt05_simu_isolated_1ch_track/wav.scp
  - speech_mix
  - sound
- - dump/raw/dt05_simu_isolated_1ch_track/spk1.scp
  - speech_ref1
  - sound
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
  lr: 0.001
  eps: 1.0e-08
  weight_decay: 1.0e-05
scheduler: reducelronplateau
scheduler_conf:
  mode: min
  factor: 0.5
  patience: 3
init: xavier_uniform
model_conf:
  loss_type: si_snr
use_preprocessor: false
encoder: conv
encoder_conf:
  channel: 256
  kernel_size: 20
  stride: 10
separator: tcn
separator_conf:
  num_spk: 1
  layer: 8
  stack: 4
  bottleneck_dim: 256
  hidden_dim: 512
  kernel: 3
  causal: false
  norm_type: gLN
  nonlinear: relu
decoder: conv
decoder_conf:
  channel: 256
  kernel_size: 20
  stride: 10
required:
- output_dir
version: 0.9.7
distributed: true
```

</details>
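
The model is trained with `loss_type: si_snr` and the best checkpoint is selected by validation SI-SNR (`best_model_criterion`). As a reference, the sketch below is an illustrative NumPy implementation of scale-invariant SNR; it is not code from ESPnet and is included only to clarify what the criterion measures.

```python
# Illustrative sketch of scale-invariant SNR (SI-SNR), the training loss and
# model-selection metric in the config above. Not taken from ESPnet.
import numpy as np

def si_snr(estimate: np.ndarray, reference: np.ndarray, eps: float = 1e-8) -> float:
    """Return SI-SNR in dB between 1-D estimate and reference signals."""
    # Remove the mean so the measure is invariant to DC offset.
    estimate = estimate - estimate.mean()
    reference = reference - reference.mean()
    # Project the estimate onto the reference (scale-invariant target).
    s_target = (np.dot(estimate, reference) / (np.dot(reference, reference) + eps)) * reference
    e_noise = estimate - s_target
    return 10 * np.log10((np.sum(s_target**2) + eps) / (np.sum(e_noise**2) + eps))

# Example: a clean tone vs. the same tone with additive noise.
rng = np.random.default_rng(0)
clean = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
noisy = clean + 0.1 * rng.standard_normal(16000)
print(f"SI-SNR of noisy vs. clean: {si_snr(noisy, clean):.2f} dB")
```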

### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456},
  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}

@inproceedings{li2021espnetse,
  title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
  booktitle={Proc. IEEE Spoken Language Technology Workshop (SLT)},
  pages={785--792},
  year={2021},
}
```

or arXiv:

```bibtex
@misc{watanabe2018espnet,
  title={ESPnet: End-to-End Speech Processing Toolkit},
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  year={2018},
  eprint={1804.00015},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@inproceedings{li2021espnetse,
  title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
  year={2020},
  eprint={2011.03706},
  archivePrefix={arXiv},
  primaryClass={eess.AS}
}
```