wyz97 committed on
Commit
db03c34
1 Parent(s): e3a60b5

Update model

README.md CHANGED
@@ -1,3 +1,339 @@
  ---
+ tags:
+ - espnet
+ - audio
+ - audio-to-audio
+ language: en
+ datasets:
+ - librimix
  license: cc-by-4.0
  ---
+
+ ## ESPnet2 ENH model
+
+ ### `espnet/Wangyou_Zhang_librimix_train_enh_tse_td_speakerbeam_raw`
+
+ This model was trained by Wangyou Zhang using the librimix recipe in [espnet](https://github.com/espnet/espnet/).
+
+ ### Demo: How to use in ESPnet2
+
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
+ if you haven't done that already.
+
+ ```bash
+ cd espnet
+ pip install -e .
+ cd egs2/librimix/tse1
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_librimix_train_enh_tse_td_speakerbeam_raw
+ ```
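For programmatic use, the packed model can also be fetched from the Hub with the `espnet_model_zoo` downloader. The snippet below is a minimal sketch (not part of this model card): it assumes `espnet_model_zoo` is installed (`pip install espnet_model_zoo`) and only downloads and unpacks the files; the returned paths are what ESPnet2 inference code (or the `run.sh` call above) consumes.

```python
# Minimal sketch; assumes `pip install espnet_model_zoo` has been run.
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
# Downloads the packed archive from the Hugging Face Hub and unpacks it locally.
# The returned dict maps names such as "train_config" and "model_file" to local paths.
files = d.download_and_unpack(
    "espnet/Wangyou_Zhang_librimix_train_enh_tse_td_speakerbeam_raw"
)
for name, path in files.items():
    print(name, "->", path)
```

The exact target-speaker-extraction inference entry point differs across ESPnet versions, so the recipe-level `run.sh` call above remains the reference usage.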
+
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
+ # RESULTS
+ ## Environments
+ - date: `Mon Jun 5 22:42:07 CST 2023`
+ - python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
+ - espnet version: `espnet 202301`
+ - pytorch version: `pytorch 2.0.1`
+ - Git hash: ``
+ - Commit date: ``
+
+
+ ## enh_train_raw
+
+ config: ./conf/train.yaml
+
+ |dataset|PESQ_WB|STOI|SAR|SDR|SIR|SI_SNR|OVRL|SIG|BAK|P808_MOS|
+ |---|---|---|---|---|---|---|---|---|---|---|
+ |enhanced_dev|1.73|86.50|12.50|11.40|24.83|10.58|2.95|3.24|3.92|3.23|
+ |enhanced_test|1.73|87.36|12.34|11.47|24.51|10.74|2.99|3.29|3.91|3.25|
+
+ ## ENH config
+
+ <details><summary>expand</summary>
+
+ ```
+ config: ./conf/train.yaml
+ print_config: false
+ log_level: INFO
+ dry_run: false
+ iterator_type: chunk
+ output_dir: exp/enh_train_raw
+ ngpu: 1
+ seed: 0
+ num_workers: 2
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: 4
+ dist_rank: 0
+ local_rank: 0
+ dist_master_addr: localhost
+ dist_master_port: 43837
+ dist_launcher: null
+ multiprocessing_distributed: true
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: true
+ collect_stats: false
+ write_collected_feats: false
+ skip_stats_npz: false
+ max_epoch: 100
+ patience: 20
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+   - snr
+   - max
+ - - valid
+   - loss
+   - min
+ keep_nbest_models: 1
+ nbest_averaging_interval: 0
+ grad_clip: 5.0
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: null
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param: []
+ num_iters_per_epoch: null
+ batch_size: 16
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/enh_stats_train_dev_16k/train/speech_mix_shape
+ - exp/enh_stats_train_dev_16k/train/speech_ref1_shape
+ - exp/enh_stats_train_dev_16k/train/enroll_ref1_shape
+ - exp/enh_stats_train_dev_16k/train/speech_ref2_shape
+ - exp/enh_stats_train_dev_16k/train/enroll_ref2_shape
+ valid_shape_file:
+ - exp/enh_stats_train_dev_16k/valid/speech_mix_shape
+ - exp/enh_stats_train_dev_16k/valid/speech_ref1_shape
+ - exp/enh_stats_train_dev_16k/valid/enroll_ref1_shape
+ - exp/enh_stats_train_dev_16k/valid/speech_ref2_shape
+ - exp/enh_stats_train_dev_16k/valid/enroll_ref2_shape
+ batch_type: folded
+ valid_batch_type: null
+ fold_length:
+ - 80000
+ - 80000
+ - 80000
+ - 80000
+ - 80000
+ sort_in_batch: descending
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 48000
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ chunk_excluded_key_prefixes:
+ - enroll_ref
+ train_data_path_and_name_and_type:
+ - - dump/raw/train/wav.scp
+   - speech_mix
+   - sound
+ - - dump/raw/train/spk1.scp
+   - speech_ref1
+   - sound
+ - - dump/raw/train/enroll_spk1.scp
+   - enroll_ref1
+   - text
+ - - dump/raw/train/spk2.scp
+   - speech_ref2
+   - sound
+ - - dump/raw/train/enroll_spk2.scp
+   - enroll_ref2
+   - text
+ valid_data_path_and_name_and_type:
+ - - dump/raw/dev/wav.scp
+   - speech_mix
+   - sound
+ - - dump/raw/dev/spk1.scp
+   - speech_ref1
+   - sound
+ - - dump/raw/dev/enroll_spk1.scp
+   - enroll_ref1
+   - text
+ - - dump/raw/dev/spk2.scp
+   - speech_ref2
+   - sound
+ - - dump/raw/dev/enroll_spk2.scp
+   - enroll_ref2
+   - text
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adam
+ optim_conf:
+   lr: 0.001
+   eps: 1.0e-08
+   weight_decay: 0
+ scheduler: reducelronplateau
+ scheduler_conf:
+   mode: min
+   factor: 0.7
+   patience: 3
+ init: null
+ model_conf:
+   num_spk: 2
+   share_encoder: true
+ criterions:
+ - name: snr
+   conf:
+     eps: 1.0e-07
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 1.0
+ - name: l1_fd
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ - name: l1_td
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ - name: mse_fd
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ - name: mse_td
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ train_spk2enroll: data/train-100/spk2enroll.json
+ enroll_segment: 48000
+ load_spk_embedding: false
+ load_all_speakers: false
+ rir_scp: null
+ rir_apply_prob: 1.0
+ noise_scp: null
+ noise_apply_prob: 1.0
+ noise_db_range: '13_15'
+ short_noise_thres: 0.5
+ speech_volume_normalize: null
+ use_reverberant_ref: false
+ num_spk: 1
+ num_noise_type: 1
+ sample_rate: 8000
+ force_single_channel: false
+ channel_reordering: false
+ categories: []
+ encoder: conv
+ encoder_conf:
+   channel: 256
+   kernel_size: 32
+   stride: 16
+ extractor: td_speakerbeam
+ extractor_conf:
+   layer: 8
+   stack: 4
+   bottleneck_dim: 256
+   hidden_dim: 512
+   skip_dim: 256
+   kernel: 3
+   causal: false
+   norm_type: gLN
+   pre_nonlinear: prelu
+   nonlinear: relu
+   i_adapt_layer: 7
+   adapt_layer_type: mul
+   adapt_enroll_dim: 256
+   use_spk_emb: false
+ decoder: conv
+ decoder_conf:
+   channel: 256
+   kernel_size: 32
+   stride: 16
+ preprocessor: tse
+ preprocessor_conf: {}
+ required:
+ - output_dir
+ version: '202301'
+ distributed: true
+ ```
+
+ </details>
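As a quick sanity check of the configuration above, the dumped config can be parsed with PyYAML and the architecture-defining fields inspected directly. This is an illustrative sketch only; the path below assumes the file has been unpacked to `exp/enh_train_raw/config.yaml`, as in this repository.

```python
import yaml

# Assumes the unpacked training config sits at this repository-relative path.
with open("exp/enh_train_raw/config.yaml") as f:
    cfg = yaml.safe_load(f)

# Conv encoder/decoder: 256 filters, 32-sample kernel, 16-sample stride.
print(cfg["encoder"], cfg["encoder_conf"])

# TD-SpeakerBeam extractor: 8 conv blocks per stack, 4 stacks,
# multiplicative speaker adaptation at block index 7.
ext = cfg["extractor_conf"]
print(cfg["extractor"], ext["layer"], "blocks x", ext["stack"], "stacks,",
      "adapt layer", ext["i_adapt_layer"], ext["adapt_layer_type"])

# Training criterion: time-domain SNR with fixed speaker order; the L1/MSE
# entries are evaluation-only (only_for_test with weight 0.0).
print([c["name"] for c in cfg["criterions"]])
```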
+
+
+ ### Citing ESPnet
+
+ ```bibtex
+ @inproceedings{watanabe2018espnet,
+   author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+   title={{ESPnet}: End-to-End Speech Processing Toolkit},
+   year={2018},
+   booktitle={Proceedings of Interspeech},
+   pages={2207--2211},
+   doi={10.21437/Interspeech.2018-1456},
+   url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
+ }
+
+ @inproceedings{ESPnet-SE,
+   author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
+             Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
+   title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
+   booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
+   pages = {785--792},
+   publisher = {{IEEE}},
+   year = {2021},
+   url = {https://doi.org/10.1109/SLT48900.2021.9383615},
+   doi = {10.1109/SLT48900.2021.9383615},
+   timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
+   biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
+   bibsource = {dblp computer science bibliography, https://dblp.org}
+ }
+ ```
+
+ or arXiv:
+
+ ```bibtex
+ @misc{watanabe2018espnet,
+   title={ESPnet: End-to-End Speech Processing Toolkit},
+   author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+   year={2018},
+   eprint={1804.00015},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ ```
exp/enh_train_raw/99epoch.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e1d6831d0df187ae8b23ed4d6804d07d0ec314336266e99f7da41a2f8ad055f
+ size 65038857
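The file above is only a Git LFS pointer; the roughly 65 MB checkpoint itself lives on the Hugging Face Hub. A minimal sketch for resolving it directly in Python, assuming the `huggingface_hub` package is installed:

```python
from huggingface_hub import hf_hub_download

# Downloads (or reuses from the local cache) the actual checkpoint behind the LFS pointer.
ckpt_path = hf_hub_download(
    repo_id="espnet/Wangyou_Zhang_librimix_train_enh_tse_td_speakerbeam_raw",
    filename="exp/enh_train_raw/99epoch.pth",
)
print(ckpt_path)
```

Cloning the repository with `git lfs` enabled achieves the same result.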
exp/enh_train_raw/RESULTS.md ADDED
@@ -0,0 +1,20 @@
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
+ # RESULTS
+ ## Environments
+ - date: `Mon Jun 5 22:42:07 CST 2023`
+ - python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
+ - espnet version: `espnet 202301`
+ - pytorch version: `pytorch 2.0.1`
+ - Git hash: ``
+ - Commit date: ``
+
+
+ ## enh_train_raw
+
+ config: ./conf/train.yaml
+
+ |dataset|PESQ_WB|STOI|SAR|SDR|SIR|SI_SNR|OVRL|SIG|BAK|P808_MOS|
+ |---|---|---|---|---|---|---|---|---|---|---|
+ |enhanced_dev|1.73|86.50|12.50|11.40|24.83|10.58|2.95|3.24|3.92|3.23|
+ |enhanced_test|1.73|87.36|12.34|11.47|24.51|10.74|2.99|3.29|3.91|3.25|
+
exp/enh_train_raw/config.yaml ADDED
@@ -0,0 +1,234 @@
+ config: ./conf/train.yaml
+ print_config: false
+ log_level: INFO
+ dry_run: false
+ iterator_type: chunk
+ output_dir: exp/enh_train_raw
+ ngpu: 1
+ seed: 0
+ num_workers: 2
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: 4
+ dist_rank: 0
+ local_rank: 0
+ dist_master_addr: localhost
+ dist_master_port: 43837
+ dist_launcher: null
+ multiprocessing_distributed: true
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: true
+ collect_stats: false
+ write_collected_feats: false
+ skip_stats_npz: false
+ max_epoch: 100
+ patience: 20
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+   - snr
+   - max
+ - - valid
+   - loss
+   - min
+ keep_nbest_models: 1
+ nbest_averaging_interval: 0
+ grad_clip: 5.0
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: null
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param: []
+ num_iters_per_epoch: null
+ batch_size: 16
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/enh_stats_train_dev_16k/train/speech_mix_shape
+ - exp/enh_stats_train_dev_16k/train/speech_ref1_shape
+ - exp/enh_stats_train_dev_16k/train/enroll_ref1_shape
+ - exp/enh_stats_train_dev_16k/train/speech_ref2_shape
+ - exp/enh_stats_train_dev_16k/train/enroll_ref2_shape
+ valid_shape_file:
+ - exp/enh_stats_train_dev_16k/valid/speech_mix_shape
+ - exp/enh_stats_train_dev_16k/valid/speech_ref1_shape
+ - exp/enh_stats_train_dev_16k/valid/enroll_ref1_shape
+ - exp/enh_stats_train_dev_16k/valid/speech_ref2_shape
+ - exp/enh_stats_train_dev_16k/valid/enroll_ref2_shape
+ batch_type: folded
+ valid_batch_type: null
+ fold_length:
+ - 80000
+ - 80000
+ - 80000
+ - 80000
+ - 80000
+ sort_in_batch: descending
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 48000
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ chunk_excluded_key_prefixes:
+ - enroll_ref
+ train_data_path_and_name_and_type:
+ - - dump/raw/train/wav.scp
+   - speech_mix
+   - sound
+ - - dump/raw/train/spk1.scp
+   - speech_ref1
+   - sound
+ - - dump/raw/train/enroll_spk1.scp
+   - enroll_ref1
+   - text
+ - - dump/raw/train/spk2.scp
+   - speech_ref2
+   - sound
+ - - dump/raw/train/enroll_spk2.scp
+   - enroll_ref2
+   - text
+ valid_data_path_and_name_and_type:
+ - - dump/raw/dev/wav.scp
+   - speech_mix
+   - sound
+ - - dump/raw/dev/spk1.scp
+   - speech_ref1
+   - sound
+ - - dump/raw/dev/enroll_spk1.scp
+   - enroll_ref1
+   - text
+ - - dump/raw/dev/spk2.scp
+   - speech_ref2
+   - sound
+ - - dump/raw/dev/enroll_spk2.scp
+   - enroll_ref2
+   - text
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adam
+ optim_conf:
+   lr: 0.001
+   eps: 1.0e-08
+   weight_decay: 0
+ scheduler: reducelronplateau
+ scheduler_conf:
+   mode: min
+   factor: 0.7
+   patience: 3
+ init: null
+ model_conf:
+   num_spk: 2
+   share_encoder: true
+ criterions:
+ - name: snr
+   conf:
+     eps: 1.0e-07
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 1.0
+ - name: l1_fd
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ - name: l1_td
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ - name: mse_fd
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ - name: mse_td
+   conf:
+     only_for_test: true
+   wrapper: fixed_order
+   wrapper_conf:
+     weight: 0.0
+ train_spk2enroll: data/train-100/spk2enroll.json
+ enroll_segment: 48000
+ load_spk_embedding: false
+ load_all_speakers: false
+ rir_scp: null
+ rir_apply_prob: 1.0
+ noise_scp: null
+ noise_apply_prob: 1.0
+ noise_db_range: '13_15'
+ short_noise_thres: 0.5
+ speech_volume_normalize: null
+ use_reverberant_ref: false
+ num_spk: 1
+ num_noise_type: 1
+ sample_rate: 8000
+ force_single_channel: false
+ channel_reordering: false
+ categories: []
+ encoder: conv
+ encoder_conf:
+   channel: 256
+   kernel_size: 32
+   stride: 16
+ extractor: td_speakerbeam
+ extractor_conf:
+   layer: 8
+   stack: 4
+   bottleneck_dim: 256
+   hidden_dim: 512
+   skip_dim: 256
+   kernel: 3
+   causal: false
+   norm_type: gLN
+   pre_nonlinear: prelu
+   nonlinear: relu
+   i_adapt_layer: 7
+   adapt_layer_type: mul
+   adapt_enroll_dim: 256
+   use_spk_emb: false
+ decoder: conv
+ decoder_conf:
+   channel: 256
+   kernel_size: 32
+   stride: 16
+ preprocessor: tse
+ preprocessor_conf: {}
+ required:
+ - output_dir
+ version: '202301'
+ distributed: true
exp/enh_train_raw/images/L1_on_Spec.png ADDED
exp/enh_train_raw/images/MSE_on_Spec.png ADDED
exp/enh_train_raw/images/TD_L1_loss.png ADDED
exp/enh_train_raw/images/TD_MSE_loss.png ADDED
exp/enh_train_raw/images/backward_time.png ADDED
exp/enh_train_raw/images/clip.png ADDED
exp/enh_train_raw/images/forward_time.png ADDED
exp/enh_train_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_raw/images/grad_norm.png ADDED
exp/enh_train_raw/images/iter_time.png ADDED
exp/enh_train_raw/images/loss.png ADDED
exp/enh_train_raw/images/loss_scale.png ADDED
exp/enh_train_raw/images/optim0_lr0.png ADDED
exp/enh_train_raw/images/optim_step_time.png ADDED
exp/enh_train_raw/images/snr_loss.png ADDED
exp/enh_train_raw/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
+ espnet: '202301'
+ files:
+   model_file: exp/enh_train_raw/99epoch.pth
+ python: "3.8.16 (default, Mar 2 2023, 03:21:46) \n[GCC 11.2.0]"
+ timestamp: 1685992283.505222
+ torch: 2.0.1
+ yaml_files:
+   train_config: exp/enh_train_raw/config.yaml
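`meta.yaml` ties the packed model together: it records the ESPnet and PyTorch versions and points at the checkpoint and training config. A small hedged sketch for reading it and loading the referenced weights, assuming the repository has been cloned with LFS files resolved and a compatible `torch` is installed:

```python
import yaml
import torch

with open("meta.yaml") as f:
    meta = yaml.safe_load(f)

print("espnet", meta["espnet"], "| torch", meta["torch"])

# 99epoch.pth is expected to hold the model's state dict (~65 MB).
state = torch.load(meta["files"]["model_file"], map_location="cpu")
print(len(state), "entries in the saved state dict")
```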