ESPnet
audio
diarization
YushiUeda committed on
Commit
01ebc78
1 Parent(s): 79ef8cd

Update model

Browse files
Files changed (24) hide show
  1. README.md +310 -0
  2. exp/diar_enh_stats_8k/train/feats_stats.npz +0 -0
  3. exp/diar_enh_train_diar_enh_convtasnet_adapt/27epoch.pth +3 -0
  4. exp/diar_enh_train_diar_enh_convtasnet_adapt/RESULTS.md +19 -0
  5. exp/diar_enh_train_diar_enh_convtasnet_adapt/config.yaml +223 -0
  6. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/acc.png +0 -0
  7. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/backward_time.png +0 -0
  8. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/cf.png +0 -0
  9. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/der.png +0 -0
  10. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/fa.png +0 -0
  11. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/forward_time.png +0 -0
  12. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/gpu_max_cached_mem_GB.png +0 -0
  13. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/iter_time.png +0 -0
  14. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/loss.png +0 -0
  15. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/loss_att.png +0 -0
  16. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/loss_diar.png +0 -0
  17. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/mi.png +0 -0
  18. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/optim0_lr0.png +0 -0
  19. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/optim_step_time.png +0 -0
  20. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/sad_fr.png +0 -0
  21. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/sad_mr.png +0 -0
  22. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/si_snr_loss.png +0 -0
  23. exp/diar_enh_train_diar_enh_convtasnet_adapt/images/train_time.png +0 -0
  24. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - diarization
6
+ language: noinfo
7
+ datasets:
8
+ - librimix
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 DIAR model
13
+
14
+ ### `espnet/YushiUeda_librimix_diar_enh_2_3_spk`
15
+
16
+ This model was trained by YushiUeda using the librimix recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 4f0f9a2435549211ef670354d09eb45883441b2d
23
+ pip install -e .
24
+ cd egs2/librimix/diar_enh1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/YushiUeda_librimix_diar_enh_2_3_spk
26
+ ```
27
+
28
+ <!-- Generated by local/show_enh_score.sh -->
29
+ # RESULTS
30
+ ## Environments
31
+ - date: `Fri Mar 25 17:40:43 EDT 2022`
32
+ - python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
33
+ - espnet version: `espnet 0.10.7a1`
34
+ - pytorch version: `pytorch 1.10.1+cu102`
35
+ - Git hash: `4f0f9a2435549211ef670354d09eb45883441b2d`
36
+ - Commit date: `Tue Mar 15 10:52:24 2022 -0400`
37
+
38
+
39
+ ## ..
40
+
41
+ config: conf/tuning/train_diar_enh_convtasnet_adapt.yaml
42
+
43
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|DER|
44
+ |---|---|---|---|---|---|---|
45
+ |diarized_enhanced_test|0.7602|7.3687|5.9088|15.0722|4.3856|6.27|
46
+
47
+ ## DIAR config
48
+
49
+ <details><summary>expand</summary>
50
+
51
+ ```
52
+ config: conf/tuning/train_diar_enh_convtasnet_adapt.yaml
53
+ print_config: false
54
+ log_level: INFO
55
+ dry_run: false
56
+ iterator_type: chunk
57
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_adapt
58
+ ngpu: 1
59
+ seed: 0
60
+ num_workers: 4
61
+ num_att_plot: 3
62
+ dist_backend: nccl
63
+ dist_init_method: env://
64
+ dist_world_size: 4
65
+ dist_rank: 0
66
+ local_rank: 0
67
+ dist_master_addr: localhost
68
+ dist_master_port: 47601
69
+ dist_launcher: null
70
+ multiprocessing_distributed: true
71
+ unused_parameters: false
72
+ sharded_ddp: false
73
+ cudnn_enabled: true
74
+ cudnn_benchmark: false
75
+ cudnn_deterministic: true
76
+ collect_stats: false
77
+ write_collected_feats: false
78
+ max_epoch: 50
79
+ patience: 4
80
+ val_scheduler_criterion:
81
+ - valid
82
+ - loss
83
+ early_stopping_criterion:
84
+ - valid
85
+ - loss
86
+ - min
87
+ best_model_criterion:
88
+ - - valid
89
+ - si_snr_loss
90
+ - min
91
+ keep_nbest_models: 1
92
+ nbest_averaging_interval: 0
93
+ grad_clip: 5.0
94
+ grad_clip_type: 2.0
95
+ grad_noise: false
96
+ accum_grad: 4
97
+ no_forward_run: false
98
+ resume: true
99
+ train_dtype: float32
100
+ use_amp: false
101
+ log_interval: null
102
+ use_matplotlib: true
103
+ use_tensorboard: true
104
+ use_wandb: false
105
+ wandb_project: null
106
+ wandb_id: null
107
+ wandb_entity: null
108
+ wandb_name: null
109
+ wandb_model_log_interval: -1
110
+ detect_anomaly: false
111
+ pretrain_path: null
112
+ init_param:
113
+ - exp/diar_enh_train_diar_enh_convtasnet_2_raw/valid.si_snr_loss.best.pth
114
+ ignore_init_mismatch: false
115
+ freeze_param: []
116
+ num_iters_per_epoch: null
117
+ batch_size: 4
118
+ valid_batch_size: null
119
+ batch_bins: 1000000
120
+ valid_batch_bins: null
121
+ train_shape_file:
122
+ - exp/diar_enh_stats_8k/train/speech_mix_shape
123
+ - exp/diar_enh_stats_8k/train/spk_labels_shape
124
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
125
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
126
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
127
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
128
+ valid_shape_file:
129
+ - exp/diar_enh_stats_8k/valid/speech_mix_shape
130
+ - exp/diar_enh_stats_8k/valid/spk_labels_shape
131
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
132
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
133
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
134
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
135
+ batch_type: folded
136
+ valid_batch_type: null
137
+ fold_length:
138
+ - 800
139
+ - 80000
140
+ - 80000
141
+ - 80000
142
+ - 80000
143
+ - 80000
144
+ sort_in_batch: descending
145
+ sort_batch: descending
146
+ multiple_iterator: false
147
+ chunk_length: 24000
148
+ chunk_shift_ratio: 0.5
149
+ num_cache_chunks: 1024
150
+ train_data_path_and_name_and_type:
151
+ - - dump/raw/train/wav.scp
152
+ - speech_mix
153
+ - sound
154
+ - - dump/raw/train/espnet_rttm
155
+ - spk_labels
156
+ - rttm
157
+ - - dump/raw/train/spk1.scp
158
+ - speech_ref1
159
+ - sound
160
+ - - dump/raw/train/spk2.scp
161
+ - speech_ref2
162
+ - sound
163
+ - - dump/raw/train/spk3.scp
164
+ - speech_ref3
165
+ - sound
166
+ - - dump/raw/train/noise1.scp
167
+ - noise_ref1
168
+ - sound
169
+ valid_data_path_and_name_and_type:
170
+ - - dump/raw/dev/wav.scp
171
+ - speech_mix
172
+ - sound
173
+ - - dump/raw/dev/espnet_rttm
174
+ - spk_labels
175
+ - rttm
176
+ - - dump/raw/dev/spk1.scp
177
+ - speech_ref1
178
+ - sound
179
+ - - dump/raw/dev/spk2.scp
180
+ - speech_ref2
181
+ - sound
182
+ - - dump/raw/dev/spk3.scp
183
+ - speech_ref3
184
+ - sound
185
+ - - dump/raw/dev/noise1.scp
186
+ - noise_ref1
187
+ - sound
188
+ allow_variable_data_keys: false
189
+ max_cache_size: 0.0
190
+ max_cache_fd: 32
191
+ valid_max_cache_size: null
192
+ optim: adam
193
+ optim_conf:
194
+ lr: 0.0003
195
+ weight_decay: 0
196
+ scheduler: reducelronplateau
197
+ scheduler_conf:
198
+ mode: min
199
+ factor: 0.5
200
+ patience: 1
201
+ num_spk: 3
202
+ init: xavier_uniform
203
+ model_conf:
204
+ loss_type: si_snr
205
+ diar_weight: 0.2
206
+ attractor_weight: 0.2
207
+ use_preprocessor: true
208
+ criterions:
209
+ - name: si_snr
210
+ conf:
211
+ eps: 1.0e-07
212
+ wrapper: pit2
213
+ wrapper_conf:
214
+ weight: 1.0
215
+ independent_perm: true
216
+ frontend: null
217
+ frontend_conf:
218
+ fs: 8k
219
+ hop_length: 64
220
+ specaug: null
221
+ specaug_conf: {}
222
+ normalize: null
223
+ normalize_conf: {}
224
+ diar_encoder: transformer
225
+ diar_encoder_conf:
226
+ input_size: 128
227
+ input_layer: conv2d8
228
+ num_blocks: 4
229
+ linear_units: 512
230
+ dropout_rate: 0.1
231
+ output_size: 256
232
+ attention_heads: 4
233
+ attention_dropout_rate: 0.1
234
+ diar_decoder: linear
235
+ diar_decoder_conf: {}
236
+ label_aggregator: label_aggregator
237
+ label_aggregator_conf:
238
+ win_length: 256
239
+ hop_length: 64
240
+ attractor: rnn
241
+ attractor_conf:
242
+ unit: 256
243
+ layer: 1
244
+ dropout: 0.1
245
+ attractor_grad: true
246
+ enh_encoder: conv
247
+ enh_encoder_conf:
248
+ channel: 512
249
+ kernel_size: 16
250
+ stride: 8
251
+ separator: tcn
252
+ separator_conf:
253
+ layer: 8
254
+ stack: 3
255
+ bottleneck_dim: 128
256
+ hidden_dim: 512
257
+ kernel: 3
258
+ causal: false
259
+ norm_type: gLN
260
+ mask_module: mask
261
+ mask_module_conf:
262
+ max_num_spk: 3
263
+ mask_nonlinear: relu
264
+ input_dim: 512
265
+ bottleneck_dim: 128
266
+ enh_decoder: conv
267
+ enh_decoder_conf:
268
+ channel: 512
269
+ kernel_size: 16
270
+ stride: 8
271
+ required:
272
+ - output_dir
273
+ version: 0.10.7a1
274
+ distributed: true
275
+ ```
276
+
277
+ </details>
278
+
279
+
280
+
281
+ ### Citing ESPnet
282
+
283
+ ```bibtex
284
+ @inproceedings{watanabe2018espnet,
285
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
286
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
287
+ year={2018},
288
+ booktitle={Proceedings of Interspeech},
289
+ pages={2207--2211},
290
+ doi={10.21437/Interspeech.2018-1456},
291
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
292
+ }
293
+
294
+
295
+
296
+
297
+ ```
298
+
299
+ or arXiv:
300
+
301
+ ```bibtex
302
+ @misc{watanabe2018espnet,
303
+ title={ESPnet: End-to-End Speech Processing Toolkit},
304
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
305
+ year={2018},
306
+ eprint={1804.00015},
307
+ archivePrefix={arXiv},
308
+ primaryClass={cs.CL}
309
+ }
310
+ ```
exp/diar_enh_stats_8k/train/feats_stats.npz ADDED
Binary file (778 Bytes). View file
 
exp/diar_enh_train_diar_enh_convtasnet_adapt/27epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:661fe7b1e7897dab4d193fa1de8c2aca9c7cc4fe9319c64e7592c3139dba8cc9
3
+ size 36265509
exp/diar_enh_train_diar_enh_convtasnet_adapt/RESULTS.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by local/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Fri Mar 25 17:40:43 EDT 2022`
5
+ - python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
6
+ - espnet version: `espnet 0.10.7a1`
7
+ - pytorch version: `pytorch 1.10.1+cu102`
8
+ - Git hash: `4f0f9a2435549211ef670354d09eb45883441b2d`
9
+ - Commit date: `Tue Mar 15 10:52:24 2022 -0400`
10
+
11
+
12
+ ## ..
13
+
14
+ config: conf/tuning/train_diar_enh_convtasnet_adapt.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|DER|
17
+ |---|---|---|---|---|---|---|
18
+ |diarized_enhanced_test|0.7602|7.3687|5.9088|15.0722|4.3856|6.27|
19
+
exp/diar_enh_train_diar_enh_convtasnet_adapt/config.yaml ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_diar_enh_convtasnet_adapt.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_adapt
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 47601
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 50
28
+ patience: 4
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - si_snr_loss
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 4
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - exp/diar_enh_train_diar_enh_convtasnet_2_raw/valid.si_snr_loss.best.pth
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 4
67
+ valid_batch_size: null
68
+ batch_bins: 1000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/diar_enh_stats_8k/train/speech_mix_shape
72
+ - exp/diar_enh_stats_8k/train/spk_labels_shape
73
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
74
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
75
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
76
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
77
+ valid_shape_file:
78
+ - exp/diar_enh_stats_8k/valid/speech_mix_shape
79
+ - exp/diar_enh_stats_8k/valid/spk_labels_shape
80
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
81
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
82
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
83
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
84
+ batch_type: folded
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 800
88
+ - 80000
89
+ - 80000
90
+ - 80000
91
+ - 80000
92
+ - 80000
93
+ sort_in_batch: descending
94
+ sort_batch: descending
95
+ multiple_iterator: false
96
+ chunk_length: 24000
97
+ chunk_shift_ratio: 0.5
98
+ num_cache_chunks: 1024
99
+ train_data_path_and_name_and_type:
100
+ - - dump/raw/train/wav.scp
101
+ - speech_mix
102
+ - sound
103
+ - - dump/raw/train/espnet_rttm
104
+ - spk_labels
105
+ - rttm
106
+ - - dump/raw/train/spk1.scp
107
+ - speech_ref1
108
+ - sound
109
+ - - dump/raw/train/spk2.scp
110
+ - speech_ref2
111
+ - sound
112
+ - - dump/raw/train/spk3.scp
113
+ - speech_ref3
114
+ - sound
115
+ - - dump/raw/train/noise1.scp
116
+ - noise_ref1
117
+ - sound
118
+ valid_data_path_and_name_and_type:
119
+ - - dump/raw/dev/wav.scp
120
+ - speech_mix
121
+ - sound
122
+ - - dump/raw/dev/espnet_rttm
123
+ - spk_labels
124
+ - rttm
125
+ - - dump/raw/dev/spk1.scp
126
+ - speech_ref1
127
+ - sound
128
+ - - dump/raw/dev/spk2.scp
129
+ - speech_ref2
130
+ - sound
131
+ - - dump/raw/dev/spk3.scp
132
+ - speech_ref3
133
+ - sound
134
+ - - dump/raw/dev/noise1.scp
135
+ - noise_ref1
136
+ - sound
137
+ allow_variable_data_keys: false
138
+ max_cache_size: 0.0
139
+ max_cache_fd: 32
140
+ valid_max_cache_size: null
141
+ optim: adam
142
+ optim_conf:
143
+ lr: 0.0003
144
+ weight_decay: 0
145
+ scheduler: reducelronplateau
146
+ scheduler_conf:
147
+ mode: min
148
+ factor: 0.5
149
+ patience: 1
150
+ num_spk: 3
151
+ init: xavier_uniform
152
+ model_conf:
153
+ loss_type: si_snr
154
+ diar_weight: 0.2
155
+ attractor_weight: 0.2
156
+ use_preprocessor: true
157
+ criterions:
158
+ - name: si_snr
159
+ conf:
160
+ eps: 1.0e-07
161
+ wrapper: pit2
162
+ wrapper_conf:
163
+ weight: 1.0
164
+ independent_perm: true
165
+ frontend: null
166
+ frontend_conf:
167
+ fs: 8k
168
+ hop_length: 64
169
+ specaug: null
170
+ specaug_conf: {}
171
+ normalize: null
172
+ normalize_conf: {}
173
+ diar_encoder: transformer
174
+ diar_encoder_conf:
175
+ input_size: 128
176
+ input_layer: conv2d8
177
+ num_blocks: 4
178
+ linear_units: 512
179
+ dropout_rate: 0.1
180
+ output_size: 256
181
+ attention_heads: 4
182
+ attention_dropout_rate: 0.1
183
+ diar_decoder: linear
184
+ diar_decoder_conf: {}
185
+ label_aggregator: label_aggregator
186
+ label_aggregator_conf:
187
+ win_length: 256
188
+ hop_length: 64
189
+ attractor: rnn
190
+ attractor_conf:
191
+ unit: 256
192
+ layer: 1
193
+ dropout: 0.1
194
+ attractor_grad: true
195
+ enh_encoder: conv
196
+ enh_encoder_conf:
197
+ channel: 512
198
+ kernel_size: 16
199
+ stride: 8
200
+ separator: tcn
201
+ separator_conf:
202
+ layer: 8
203
+ stack: 3
204
+ bottleneck_dim: 128
205
+ hidden_dim: 512
206
+ kernel: 3
207
+ causal: false
208
+ norm_type: gLN
209
+ mask_module: mask
210
+ mask_module_conf:
211
+ max_num_spk: 3
212
+ mask_nonlinear: relu
213
+ input_dim: 512
214
+ bottleneck_dim: 128
215
+ enh_decoder: conv
216
+ enh_decoder_conf:
217
+ channel: 512
218
+ kernel_size: 16
219
+ stride: 8
220
+ required:
221
+ - output_dir
222
+ version: 0.10.7a1
223
+ distributed: true
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/acc.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/backward_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/cf.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/der.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/fa.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/forward_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/gpu_max_cached_mem_GB.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/iter_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/loss.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/loss_att.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/loss_diar.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/mi.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/optim0_lr0.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/optim_step_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/sad_fr.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/sad_mr.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/si_snr_loss.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_adapt/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/diar_enh_train_diar_enh_convtasnet_adapt/27epoch.pth
4
+ python: "3.7.11 (default, Jul 27 2021, 14:32:16) \n[GCC 7.5.0]"
5
+ timestamp: 1650047110.334787
6
+ torch: 1.10.1+cu102
7
+ yaml_files:
8
+ train_config: exp/diar_enh_train_diar_enh_convtasnet_adapt/config.yaml