ESPnet
audio
diarization
jaekookang committed on
Commit
704b675
1 Parent(s): f411f5d

Update model

Browse files
README.md CHANGED
@@ -1,3 +1,237 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - diarization
6
+ language: noinfo
7
+ datasets:
8
+ - mini_librispeech
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 DIAR model
13
+
14
+ ### `jkang/espnet2_mini_librispeech_diar`
15
+
16
+ This model was trained by jaekookang using the mini_librispeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout e08a89e0a43db7fc12bec835c62a000ad10bd417
23
+ pip install -e .
24
+ cd egs2/mini_librispeech/diar1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model jkang/espnet2_mini_librispeech_diar
26
+ ```
27
+
28
+ <!-- Generated by scripts/utils/show_diar_result.sh -->
29
+ # RESULTS
30
+ ## Environments
31
+ - date: `Tue Feb 8 16:41:16 KST 2022`
32
+ - python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]`
33
+ - espnet version: `espnet 0.10.6a1`
34
+ - pytorch version: `pytorch 1.10.1`
35
+ - Git hash: `e08a89e0a43db7fc12bec835c62a000ad10bd417`
36
+ - Commit date: `Sun Feb 6 18:54:20 2022 -0500`
37
+
38
+ ## diar_train_diar_raw
39
+ ### DER
40
+ dev_clean_2_ns2_beta2_500
41
+ |threshold_median_collar|DER|
42
+ |---|---|
43
+ |result_th0.3_med11_collar0.0|31.39|
44
+ |result_th0.3_med1_collar0.0|31.78|
45
+ |result_th0.4_med11_collar0.0|29.99|
46
+ |result_th0.4_med1_collar0.0|30.61|
47
+ |result_th0.5_med11_collar0.0|29.28|
48
+ |result_th0.5_med1_collar0.0|30.19|
49
+ |result_th0.6_med11_collar0.0|29.50|
50
+ |result_th0.6_med1_collar0.0|30.66|
51
+ |result_th0.7_med11_collar0.0|30.90|
52
+ |result_th0.7_med1_collar0.0|32.38|
53
+
54
+ ## DIAR config
55
+
56
+ <details><summary>expand</summary>
57
+
58
+ ```
59
+ config: conf/train_diar.yaml
60
+ print_config: false
61
+ log_level: INFO
62
+ dry_run: false
63
+ iterator_type: chunk
64
+ output_dir: exp/diar_train_diar_raw
65
+ ngpu: 1
66
+ seed: 0
67
+ num_workers: 1
68
+ num_att_plot: 3
69
+ dist_backend: nccl
70
+ dist_init_method: env://
71
+ dist_world_size: null
72
+ dist_rank: null
73
+ local_rank: 0
74
+ dist_master_addr: null
75
+ dist_master_port: null
76
+ dist_launcher: null
77
+ multiprocessing_distributed: false
78
+ unused_parameters: false
79
+ sharded_ddp: false
80
+ cudnn_enabled: true
81
+ cudnn_benchmark: false
82
+ cudnn_deterministic: true
83
+ collect_stats: false
84
+ write_collected_feats: false
85
+ max_epoch: 100
86
+ patience: 3
87
+ val_scheduler_criterion:
88
+ - valid
89
+ - loss
90
+ early_stopping_criterion:
91
+ - valid
92
+ - loss
93
+ - min
94
+ best_model_criterion:
95
+ - - valid
96
+ - acc
97
+ - max
98
+ keep_nbest_models: 3
99
+ nbest_averaging_interval: 0
100
+ grad_clip: 5
101
+ grad_clip_type: 2.0
102
+ grad_noise: false
103
+ accum_grad: 2
104
+ no_forward_run: false
105
+ resume: true
106
+ train_dtype: float32
107
+ use_amp: false
108
+ log_interval: null
109
+ use_matplotlib: true
110
+ use_tensorboard: true
111
+ use_wandb: false
112
+ wandb_project: null
113
+ wandb_id: null
114
+ wandb_entity: null
115
+ wandb_name: null
116
+ wandb_model_log_interval: -1
117
+ detect_anomaly: false
118
+ pretrain_path: null
119
+ init_param: []
120
+ ignore_init_mismatch: false
121
+ freeze_param: []
122
+ num_iters_per_epoch: null
123
+ batch_size: 16
124
+ valid_batch_size: null
125
+ batch_bins: 1000000
126
+ valid_batch_bins: null
127
+ train_shape_file:
128
+ - exp/diar_stats_8k/train/speech_shape
129
+ - exp/diar_stats_8k/train/spk_labels_shape
130
+ valid_shape_file:
131
+ - exp/diar_stats_8k/valid/speech_shape
132
+ - exp/diar_stats_8k/valid/spk_labels_shape
133
+ batch_type: folded
134
+ valid_batch_type: null
135
+ fold_length:
136
+ - 80000
137
+ - 800
138
+ sort_in_batch: descending
139
+ sort_batch: descending
140
+ multiple_iterator: false
141
+ chunk_length: 200000
142
+ chunk_shift_ratio: 0.5
143
+ num_cache_chunks: 64
144
+ train_data_path_and_name_and_type:
145
+ - - dump/raw/simu/data/train_clean_5_ns2_beta2_500/wav.scp
146
+ - speech
147
+ - sound
148
+ - - dump/raw/simu/data/train_clean_5_ns2_beta2_500/espnet_rttm
149
+ - spk_labels
150
+ - rttm
151
+ valid_data_path_and_name_and_type:
152
+ - - dump/raw/simu/data/dev_clean_2_ns2_beta2_500/wav.scp
153
+ - speech
154
+ - sound
155
+ - - dump/raw/simu/data/dev_clean_2_ns2_beta2_500/espnet_rttm
156
+ - spk_labels
157
+ - rttm
158
+ allow_variable_data_keys: false
159
+ max_cache_size: 0.0
160
+ max_cache_fd: 32
161
+ valid_max_cache_size: null
162
+ optim: adam
163
+ optim_conf:
164
+ lr: 0.01
165
+ scheduler: noamlr
166
+ scheduler_conf:
167
+ warmup_steps: 1000
168
+ num_spk: 2
169
+ init: xavier_uniform
170
+ input_size: null
171
+ model_conf:
172
+ attractor_weight: 1.0
173
+ use_preprocessor: true
174
+ frontend: default
175
+ frontend_conf:
176
+ fs: 8k
177
+ hop_length: 128
178
+ specaug: null
179
+ specaug_conf: {}
180
+ normalize: global_mvn
181
+ normalize_conf:
182
+ stats_file: exp/diar_stats_8k/train/feats_stats.npz
183
+ encoder: transformer
184
+ encoder_conf:
185
+ input_layer: linear
186
+ num_blocks: 2
187
+ linear_units: 512
188
+ dropout_rate: 0.1
189
+ output_size: 256
190
+ attention_heads: 4
191
+ attention_dropout_rate: 0.0
192
+ decoder: linear
193
+ decoder_conf: {}
194
+ label_aggregator: label_aggregator
195
+ label_aggregator_conf: {}
196
+ attractor: null
197
+ attractor_conf: {}
198
+ required:
199
+ - output_dir
200
+ version: 0.10.6a1
201
+ distributed: false
202
+ ```
203
+
204
+ </details>
205
+
206
+
207
+
208
+ ### Citing ESPnet
209
+
210
+ ```bibtex
211
+ @inproceedings{watanabe2018espnet,
212
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
213
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
214
+ year={2018},
215
+ booktitle={Proceedings of Interspeech},
216
+ pages={2207--2211},
217
+ doi={10.21437/Interspeech.2018-1456},
218
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
219
+ }
220
+
221
+
222
+
223
+
224
+ ```
225
+
226
+ or arXiv:
227
+
228
+ ```bibtex
229
+ @misc{watanabe2018espnet,
230
+ title={ESPnet: End-to-End Speech Processing Toolkit},
231
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
232
+ year={2018},
233
+ eprint={1804.00015},
234
+ archivePrefix={arXiv},
235
+ primaryClass={cs.CL}
236
+ }
237
+ ```
exp/diar_stats_8k/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
exp/diar_train_diar_raw/30epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b25262282cb313a95cb7b817031798c6c9b1e02c68483fb1f9676d74b3dcdb43
3
+ size 4404388
exp/diar_train_diar_raw/RESULTS.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_diar_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Tue Feb 8 16:41:16 KST 2022`
5
+ - python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 0.10.6a1`
7
+ - pytorch version: `pytorch 1.10.1`
8
+ - Git hash: `e08a89e0a43db7fc12bec835c62a000ad10bd417`
9
+ - Commit date: `Sun Feb 6 18:54:20 2022 -0500`
10
+
11
+ ## diar_train_diar_raw
12
+ ### DER
13
+ dev_clean_2_ns2_beta2_500
14
+ |threshold_median_collar|DER|
15
+ |---|---|
16
+ |result_th0.3_med11_collar0.0|31.39|
17
+ |result_th0.3_med1_collar0.0|31.78|
18
+ |result_th0.4_med11_collar0.0|29.99|
19
+ |result_th0.4_med1_collar0.0|30.61|
20
+ |result_th0.5_med11_collar0.0|29.28|
21
+ |result_th0.5_med1_collar0.0|30.19|
22
+ |result_th0.6_med11_collar0.0|29.50|
23
+ |result_th0.6_med1_collar0.0|30.66|
24
+ |result_th0.7_med11_collar0.0|30.90|
25
+ |result_th0.7_med1_collar0.0|32.38|
exp/diar_train_diar_raw/config.yaml ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_diar.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_train_diar_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: 3
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ keep_nbest_models: 3
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: null
65
+ batch_size: 16
66
+ valid_batch_size: null
67
+ batch_bins: 1000000
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp/diar_stats_8k/train/speech_shape
71
+ - exp/diar_stats_8k/train/spk_labels_shape
72
+ valid_shape_file:
73
+ - exp/diar_stats_8k/valid/speech_shape
74
+ - exp/diar_stats_8k/valid/spk_labels_shape
75
+ batch_type: folded
76
+ valid_batch_type: null
77
+ fold_length:
78
+ - 80000
79
+ - 800
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 200000
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 64
86
+ train_data_path_and_name_and_type:
87
+ - - dump/raw/simu/data/train_clean_5_ns2_beta2_500/wav.scp
88
+ - speech
89
+ - sound
90
+ - - dump/raw/simu/data/train_clean_5_ns2_beta2_500/espnet_rttm
91
+ - spk_labels
92
+ - rttm
93
+ valid_data_path_and_name_and_type:
94
+ - - dump/raw/simu/data/dev_clean_2_ns2_beta2_500/wav.scp
95
+ - speech
96
+ - sound
97
+ - - dump/raw/simu/data/dev_clean_2_ns2_beta2_500/espnet_rttm
98
+ - spk_labels
99
+ - rttm
100
+ allow_variable_data_keys: false
101
+ max_cache_size: 0.0
102
+ max_cache_fd: 32
103
+ valid_max_cache_size: null
104
+ optim: adam
105
+ optim_conf:
106
+ lr: 0.01
107
+ scheduler: noamlr
108
+ scheduler_conf:
109
+ warmup_steps: 1000
110
+ num_spk: 2
111
+ init: xavier_uniform
112
+ input_size: null
113
+ model_conf:
114
+ attractor_weight: 1.0
115
+ use_preprocessor: true
116
+ frontend: default
117
+ frontend_conf:
118
+ fs: 8k
119
+ hop_length: 128
120
+ specaug: null
121
+ specaug_conf: {}
122
+ normalize: global_mvn
123
+ normalize_conf:
124
+ stats_file: exp/diar_stats_8k/train/feats_stats.npz
125
+ encoder: transformer
126
+ encoder_conf:
127
+ input_layer: linear
128
+ num_blocks: 2
129
+ linear_units: 512
130
+ dropout_rate: 0.1
131
+ output_size: 256
132
+ attention_heads: 4
133
+ attention_dropout_rate: 0.0
134
+ decoder: linear
135
+ decoder_conf: {}
136
+ label_aggregator: label_aggregator
137
+ label_aggregator_conf: {}
138
+ attractor: null
139
+ attractor_conf: {}
140
+ required:
141
+ - output_dir
142
+ version: 0.10.6a1
143
+ distributed: false
exp/diar_train_diar_raw/images/acc.png ADDED
exp/diar_train_diar_raw/images/backward_time.png ADDED
exp/diar_train_diar_raw/images/cf.png ADDED
exp/diar_train_diar_raw/images/der.png ADDED
exp/diar_train_diar_raw/images/fa.png ADDED
exp/diar_train_diar_raw/images/forward_time.png ADDED
exp/diar_train_diar_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/diar_train_diar_raw/images/iter_time.png ADDED
exp/diar_train_diar_raw/images/loss.png ADDED
exp/diar_train_diar_raw/images/loss_att.png ADDED
exp/diar_train_diar_raw/images/loss_pit.png ADDED
exp/diar_train_diar_raw/images/mi.png ADDED
exp/diar_train_diar_raw/images/optim0_lr0.png ADDED
exp/diar_train_diar_raw/images/optim_step_time.png ADDED
exp/diar_train_diar_raw/images/sad_fr.png ADDED
exp/diar_train_diar_raw/images/sad_mr.png ADDED
exp/diar_train_diar_raw/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.6a1
2
+ files:
3
+ model_file: exp/diar_train_diar_raw/30epoch.pth
4
+ python: "3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]"
5
+ timestamp: 1644309231.467943
6
+ torch: 1.10.1
7
+ yaml_files:
8
+ train_config: exp/diar_train_diar_raw/config.yaml