YushiUeda committed on
Commit
b621d87
1 Parent(s): bcaf81a

Update model

Browse files
Files changed (24) hide show
  1. README.md +322 -0
  2. exp/diar_enh_stats_8k/train/feats_stats.npz +0 -0
  3. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/12epoch.pth +3 -0
  4. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/RESULTS.md +19 -0
  5. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/config.yaml +235 -0
  6. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/acc.png +0 -0
  7. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/backward_time.png +0 -0
  8. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/cf.png +0 -0
  9. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/der.png +0 -0
  10. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/fa.png +0 -0
  11. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/forward_time.png +0 -0
  12. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/gpu_max_cached_mem_GB.png +0 -0
  13. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/iter_time.png +0 -0
  14. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/loss.png +0 -0
  15. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/loss_att.png +0 -0
  16. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/loss_diar.png +0 -0
  17. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/mi.png +0 -0
  18. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/optim0_lr0.png +0 -0
  19. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/optim_step_time.png +0 -0
  20. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/sad_fr.png +0 -0
  21. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/sad_mr.png +0 -0
  22. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/si_snr_loss.png +0 -0
  23. exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/train_time.png +0 -0
  24. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - diarization
6
+ language: noinfo
7
+ datasets:
8
+ - librimix
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 DIAR model
13
+
14
+ ### `espnet/YushiUeda_librimix_diar_enh_2_3_spk_lmf`
15
+
16
+ This model was trained by YushiUeda using the librimix recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 4f0f9a2435549211ef670354d09eb45883441b2d
23
+ pip install -e .
24
+ cd egs2/librimix/diar_enh2
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/YushiUeda_librimix_diar_enh_2_3_spk_lmf
26
+ ```
27
+
28
+ <!-- Generated by local/show_enh_score.sh -->
29
+ # RESULTS
30
+ ## Environments
31
+ - date: `Sat Mar 26 08:47:28 EDT 2022`
32
+ - python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
33
+ - espnet version: `espnet 0.10.7a1`
34
+ - pytorch version: `pytorch 1.10.1+cu102`
35
+ - Git hash: `4f0f9a2435549211ef670354d09eb45883441b2d`
36
+ - Commit date: `Tue Mar 15 10:52:24 2022 -0400`
37
+
38
+
39
+ ## ..
40
+
41
+ config: conf/tuning/train_diar_enh_convtasnet_lmf_adapt.yaml
42
+
43
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|DER|
44
+ |---|---|---|---|---|---|---|
45
+ |diarized_enhanced_test|0.7667|8.1685|6.6069|15.2114|5.4204|6.04|
46
+
47
+ ## DIAR config
48
+
49
+ <details><summary>expand</summary>
50
+
51
+ ```
52
+ config: conf/tuning/train_diar_enh_convtasnet_lmf_adapt.yaml
53
+ print_config: false
54
+ log_level: INFO
55
+ dry_run: false
56
+ iterator_type: chunk
57
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt
58
+ ngpu: 1
59
+ seed: 0
60
+ num_workers: 4
61
+ num_att_plot: 3
62
+ dist_backend: nccl
63
+ dist_init_method: env://
64
+ dist_world_size: 4
65
+ dist_rank: 0
66
+ local_rank: 0
67
+ dist_master_addr: localhost
68
+ dist_master_port: 38467
69
+ dist_launcher: null
70
+ multiprocessing_distributed: true
71
+ unused_parameters: false
72
+ sharded_ddp: false
73
+ cudnn_enabled: true
74
+ cudnn_benchmark: false
75
+ cudnn_deterministic: true
76
+ collect_stats: false
77
+ write_collected_feats: false
78
+ max_epoch: 100
79
+ patience: 4
80
+ val_scheduler_criterion:
81
+ - valid
82
+ - loss
83
+ early_stopping_criterion:
84
+ - valid
85
+ - loss
86
+ - min
87
+ best_model_criterion:
88
+ - - valid
89
+ - si_snr_loss
90
+ - min
91
+ keep_nbest_models: 1
92
+ nbest_averaging_interval: 0
93
+ grad_clip: 5.0
94
+ grad_clip_type: 2.0
95
+ grad_noise: false
96
+ accum_grad: 4
97
+ no_forward_run: false
98
+ resume: true
99
+ train_dtype: float32
100
+ use_amp: false
101
+ log_interval: null
102
+ use_matplotlib: true
103
+ use_tensorboard: true
104
+ use_wandb: false
105
+ wandb_project: null
106
+ wandb_id: null
107
+ wandb_entity: null
108
+ wandb_name: null
109
+ wandb_model_log_interval: -1
110
+ detect_anomaly: false
111
+ pretrain_path: null
112
+ init_param:
113
+ - exp/diar_enh_train_diar_enh_convtasnet_lmf/valid.si_snr_loss.best.pth
114
+ ignore_init_mismatch: false
115
+ freeze_param: []
116
+ num_iters_per_epoch: null
117
+ batch_size: 4
118
+ valid_batch_size: null
119
+ batch_bins: 1000000
120
+ valid_batch_bins: null
121
+ train_shape_file:
122
+ - exp/diar_enh_stats_8k/train/speech_mix_shape
123
+ - exp/diar_enh_stats_8k/train/spk_labels_shape
124
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
125
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
126
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
127
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
128
+ valid_shape_file:
129
+ - exp/diar_enh_stats_8k/valid/speech_mix_shape
130
+ - exp/diar_enh_stats_8k/valid/spk_labels_shape
131
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
132
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
133
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
134
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
135
+ batch_type: folded
136
+ valid_batch_type: null
137
+ fold_length:
138
+ - 800
139
+ - 80000
140
+ - 80000
141
+ - 80000
142
+ - 80000
143
+ - 80000
144
+ sort_in_batch: descending
145
+ sort_batch: descending
146
+ multiple_iterator: false
147
+ chunk_length: 24000
148
+ chunk_shift_ratio: 0.5
149
+ num_cache_chunks: 1024
150
+ train_data_path_and_name_and_type:
151
+ - - dump/raw/train/wav.scp
152
+ - speech_mix
153
+ - sound
154
+ - - dump/raw/train/espnet_rttm
155
+ - spk_labels
156
+ - rttm
157
+ - - dump/raw/train/spk1.scp
158
+ - speech_ref1
159
+ - sound
160
+ - - dump/raw/train/spk2.scp
161
+ - speech_ref2
162
+ - sound
163
+ - - dump/raw/train/spk3.scp
164
+ - speech_ref3
165
+ - sound
166
+ - - dump/raw/train/noise1.scp
167
+ - noise_ref1
168
+ - sound
169
+ valid_data_path_and_name_and_type:
170
+ - - dump/raw/dev/wav.scp
171
+ - speech_mix
172
+ - sound
173
+ - - dump/raw/dev/espnet_rttm
174
+ - spk_labels
175
+ - rttm
176
+ - - dump/raw/dev/spk1.scp
177
+ - speech_ref1
178
+ - sound
179
+ - - dump/raw/dev/spk2.scp
180
+ - speech_ref2
181
+ - sound
182
+ - - dump/raw/dev/spk3.scp
183
+ - speech_ref3
184
+ - sound
185
+ - - dump/raw/dev/noise1.scp
186
+ - noise_ref1
187
+ - sound
188
+ allow_variable_data_keys: false
189
+ max_cache_size: 0.0
190
+ max_cache_fd: 32
191
+ valid_max_cache_size: null
192
+ optim: adam
193
+ optim_conf:
194
+ lr: 0.001
195
+ eps: 1.0e-07
196
+ weight_decay: 0
197
+ scheduler: reducelronplateau
198
+ scheduler_conf:
199
+ mode: min
200
+ factor: 0.5
201
+ patience: 1
202
+ num_spk: 3
203
+ init: xavier_uniform
204
+ model_conf:
205
+ loss_type: si_snr
206
+ diar_weight: 0.2
207
+ attractor_weight: 0.2
208
+ use_preprocessor: true
209
+ criterions:
210
+ - name: si_snr
211
+ conf:
212
+ eps: 1.0e-07
213
+ wrapper: pit2
214
+ wrapper_conf:
215
+ weight: 1.0
216
+ independent_perm: true
217
+ frontend: default
218
+ frontend_conf:
219
+ fs: 8k
220
+ hop_length: 64
221
+ specaug: specaug
222
+ specaug_conf:
223
+ apply_time_warp: false
224
+ apply_freq_mask: true
225
+ freq_mask_width_range:
226
+ - 0
227
+ - 30
228
+ num_freq_mask: 2
229
+ apply_time_mask: true
230
+ time_mask_width_range:
231
+ - 0
232
+ - 40
233
+ num_time_mask: 2
234
+ normalize: null
235
+ normalize_conf: {}
236
+ diar_encoder: transformer
237
+ diar_encoder_conf:
238
+ input_size: 208
239
+ input_layer: conv2d8
240
+ num_blocks: 4
241
+ linear_units: 512
242
+ dropout_rate: 0.1
243
+ output_size: 256
244
+ attention_heads: 4
245
+ attention_dropout_rate: 0.1
246
+ diar_decoder: linear
247
+ diar_decoder_conf: {}
248
+ label_aggregator: label_aggregator
249
+ label_aggregator_conf:
250
+ win_length: 256
251
+ hop_length: 64
252
+ attractor: rnn
253
+ attractor_conf:
254
+ unit: 256
255
+ layer: 1
256
+ dropout: 0.1
257
+ attractor_grad: true
258
+ enh_encoder: conv
259
+ enh_encoder_conf:
260
+ channel: 512
261
+ kernel_size: 16
262
+ stride: 8
263
+ separator: tcn
264
+ separator_conf:
265
+ layer: 8
266
+ stack: 3
267
+ bottleneck_dim: 128
268
+ hidden_dim: 512
269
+ kernel: 3
270
+ causal: false
271
+ norm_type: gLN
272
+ mask_module: mask
273
+ mask_module_conf:
274
+ max_num_spk: 3
275
+ mask_nonlinear: relu
276
+ input_dim: 512
277
+ bottleneck_dim: 128
278
+ enh_decoder: conv
279
+ enh_decoder_conf:
280
+ channel: 512
281
+ kernel_size: 16
282
+ stride: 8
283
+ required:
284
+ - output_dir
285
+ version: 0.10.7a1
286
+ distributed: true
287
+ ```
288
+
289
+ </details>
290
+
291
+
292
+
293
+ ### Citing ESPnet
294
+
295
+ ```bibtex
296
+ @inproceedings{watanabe2018espnet,
297
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
298
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
299
+ year={2018},
300
+ booktitle={Proceedings of Interspeech},
301
+ pages={2207--2211},
302
+ doi={10.21437/Interspeech.2018-1456},
303
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
304
+ }
305
+
306
+
307
+
308
+
309
+ ```
310
+
311
+ or arXiv:
312
+
313
+ ```bibtex
314
+ @misc{watanabe2018espnet,
315
+ title={ESPnet: End-to-End Speech Processing Toolkit},
316
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
317
+ year={2018},
318
+ eprint={1804.00015},
319
+ archivePrefix={arXiv},
320
+ primaryClass={cs.CL}
321
+ }
322
+ ```
exp/diar_enh_stats_8k/train/feats_stats.npz ADDED
Binary file (778 Bytes). View file
 
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/12epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e9c9fd2d4197aaf37ecedd1e4fe8da499fb2a1715d09ce470a7ec2576f24a58
3
+ size 38969763
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/RESULTS.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by local/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Sat Mar 26 08:47:28 EDT 2022`
5
+ - python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
6
+ - espnet version: `espnet 0.10.7a1`
7
+ - pytorch version: `pytorch 1.10.1+cu102`
8
+ - Git hash: `4f0f9a2435549211ef670354d09eb45883441b2d`
9
+ - Commit date: `Tue Mar 15 10:52:24 2022 -0400`
10
+
11
+
12
+ ## ..
13
+
14
+ config: conf/tuning/train_diar_enh_convtasnet_lmf_adapt.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|DER|
17
+ |---|---|---|---|---|---|---|
18
+ |diarized_enhanced_test|0.7667|8.1685|6.6069|15.2114|5.4204|6.04|
19
+
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/config.yaml ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_diar_enh_convtasnet_lmf_adapt.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 38467
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: 4
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - si_snr_loss
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 4
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - exp/diar_enh_train_diar_enh_convtasnet_lmf/valid.si_snr_loss.best.pth
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 4
67
+ valid_batch_size: null
68
+ batch_bins: 1000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/diar_enh_stats_8k/train/speech_mix_shape
72
+ - exp/diar_enh_stats_8k/train/spk_labels_shape
73
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
74
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
75
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
76
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
77
+ valid_shape_file:
78
+ - exp/diar_enh_stats_8k/valid/speech_mix_shape
79
+ - exp/diar_enh_stats_8k/valid/spk_labels_shape
80
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
81
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
82
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
83
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
84
+ batch_type: folded
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 800
88
+ - 80000
89
+ - 80000
90
+ - 80000
91
+ - 80000
92
+ - 80000
93
+ sort_in_batch: descending
94
+ sort_batch: descending
95
+ multiple_iterator: false
96
+ chunk_length: 24000
97
+ chunk_shift_ratio: 0.5
98
+ num_cache_chunks: 1024
99
+ train_data_path_and_name_and_type:
100
+ - - dump/raw/train/wav.scp
101
+ - speech_mix
102
+ - sound
103
+ - - dump/raw/train/espnet_rttm
104
+ - spk_labels
105
+ - rttm
106
+ - - dump/raw/train/spk1.scp
107
+ - speech_ref1
108
+ - sound
109
+ - - dump/raw/train/spk2.scp
110
+ - speech_ref2
111
+ - sound
112
+ - - dump/raw/train/spk3.scp
113
+ - speech_ref3
114
+ - sound
115
+ - - dump/raw/train/noise1.scp
116
+ - noise_ref1
117
+ - sound
118
+ valid_data_path_and_name_and_type:
119
+ - - dump/raw/dev/wav.scp
120
+ - speech_mix
121
+ - sound
122
+ - - dump/raw/dev/espnet_rttm
123
+ - spk_labels
124
+ - rttm
125
+ - - dump/raw/dev/spk1.scp
126
+ - speech_ref1
127
+ - sound
128
+ - - dump/raw/dev/spk2.scp
129
+ - speech_ref2
130
+ - sound
131
+ - - dump/raw/dev/spk3.scp
132
+ - speech_ref3
133
+ - sound
134
+ - - dump/raw/dev/noise1.scp
135
+ - noise_ref1
136
+ - sound
137
+ allow_variable_data_keys: false
138
+ max_cache_size: 0.0
139
+ max_cache_fd: 32
140
+ valid_max_cache_size: null
141
+ optim: adam
142
+ optim_conf:
143
+ lr: 0.001
144
+ eps: 1.0e-07
145
+ weight_decay: 0
146
+ scheduler: reducelronplateau
147
+ scheduler_conf:
148
+ mode: min
149
+ factor: 0.5
150
+ patience: 1
151
+ num_spk: 3
152
+ init: xavier_uniform
153
+ model_conf:
154
+ loss_type: si_snr
155
+ diar_weight: 0.2
156
+ attractor_weight: 0.2
157
+ use_preprocessor: true
158
+ criterions:
159
+ - name: si_snr
160
+ conf:
161
+ eps: 1.0e-07
162
+ wrapper: pit2
163
+ wrapper_conf:
164
+ weight: 1.0
165
+ independent_perm: true
166
+ frontend: default
167
+ frontend_conf:
168
+ fs: 8k
169
+ hop_length: 64
170
+ specaug: specaug
171
+ specaug_conf:
172
+ apply_time_warp: false
173
+ apply_freq_mask: true
174
+ freq_mask_width_range:
175
+ - 0
176
+ - 30
177
+ num_freq_mask: 2
178
+ apply_time_mask: true
179
+ time_mask_width_range:
180
+ - 0
181
+ - 40
182
+ num_time_mask: 2
183
+ normalize: null
184
+ normalize_conf: {}
185
+ diar_encoder: transformer
186
+ diar_encoder_conf:
187
+ input_size: 208
188
+ input_layer: conv2d8
189
+ num_blocks: 4
190
+ linear_units: 512
191
+ dropout_rate: 0.1
192
+ output_size: 256
193
+ attention_heads: 4
194
+ attention_dropout_rate: 0.1
195
+ diar_decoder: linear
196
+ diar_decoder_conf: {}
197
+ label_aggregator: label_aggregator
198
+ label_aggregator_conf:
199
+ win_length: 256
200
+ hop_length: 64
201
+ attractor: rnn
202
+ attractor_conf:
203
+ unit: 256
204
+ layer: 1
205
+ dropout: 0.1
206
+ attractor_grad: true
207
+ enh_encoder: conv
208
+ enh_encoder_conf:
209
+ channel: 512
210
+ kernel_size: 16
211
+ stride: 8
212
+ separator: tcn
213
+ separator_conf:
214
+ layer: 8
215
+ stack: 3
216
+ bottleneck_dim: 128
217
+ hidden_dim: 512
218
+ kernel: 3
219
+ causal: false
220
+ norm_type: gLN
221
+ mask_module: mask
222
+ mask_module_conf:
223
+ max_num_spk: 3
224
+ mask_nonlinear: relu
225
+ input_dim: 512
226
+ bottleneck_dim: 128
227
+ enh_decoder: conv
228
+ enh_decoder_conf:
229
+ channel: 512
230
+ kernel_size: 16
231
+ stride: 8
232
+ required:
233
+ - output_dir
234
+ version: 0.10.7a1
235
+ distributed: true
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/acc.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/backward_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/cf.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/der.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/fa.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/forward_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/gpu_max_cached_mem_GB.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/iter_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/loss.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/loss_att.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/loss_diar.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/mi.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/optim0_lr0.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/optim_step_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/sad_fr.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/sad_mr.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/si_snr_loss.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/12epoch.pth
4
+ python: "3.7.11 (default, Jul 27 2021, 14:32:16) \n[GCC 7.5.0]"
5
+ timestamp: 1650047915.718722
6
+ torch: 1.10.1+cu102
7
+ yaml_files:
8
+ train_config: exp/diar_enh_train_diar_enh_convtasnet_lmf_adapt/config.yaml