“siddhu001” committed
Commit a9b08cf · 1 Parent(s): 996ddfc

Update model

README.md CHANGED
@@ -1,3 +1,290 @@
- ---
- license: cc-by-4.0
- ---
+ ---
+ tags:
+ - espnet
+ - audio
+ - automatic-speech-recognition
+ language: en
+ datasets:
+ - swbd
+ license: cc-by-4.0
+ ---
+
+ ## ESPnet2 ASR model
+
+ ### `espnet/Turn_taking_prediction_SWBD`
+
+ This model was trained by “siddhu001” using the swbd recipe in [espnet](https://github.com/espnet/espnet/).
+
+ ### Demo: How to use in ESPnet2
+
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
+ if you haven't done that already.
+
+ ```bash
+ cd espnet
+ git checkout cea64abdeea5fa4f3da1a898be396e8c95c6e3ae
+ pip install -e .
+ cd egs2/swbd/asr1
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/Turn_taking_prediction_SWBD
+ ```
+
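If you only want to fetch the released files and peek at the training configuration without running the recipe, a minimal sketch along the following lines should work. It is not part of the swbd recipe and assumes the `huggingface_hub` and `PyYAML` packages are installed; the keys it reads come from this repo's `meta.yaml`.

```python
# Minimal sketch (not part of the swbd recipe): download this repo and inspect its
# training configuration. Assumes `pip install huggingface_hub pyyaml`.
import os
import yaml
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="espnet/Turn_taking_prediction_SWBD")

# meta.yaml records where the checkpoint and training config live inside the repo.
with open(os.path.join(local_dir, "meta.yaml")) as f:
    meta = yaml.safe_load(f)
print(meta["files"]["asr_model_file"])         # exp/.../valid.loss.ave.pth
print(meta["yaml_files"]["asr_train_config"])  # exp/.../config.yaml

# The training config is the same content as the "ASR config" block below.
with open(os.path.join(local_dir, meta["yaml_files"]["asr_train_config"])) as f:
    config = yaml.safe_load(f)
print(config["encoder"], config["encoder_conf"]["whisper_model"])  # whisper, medium
print(config["token_list"])  # turn-taking label inventory
```
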
+ # RESULTS
+
+ ## asr_train_asr_whisper_turn_taking_target_raw_en_word
+ ### ROC_AUC
+
+ |dataset|Continuation|Backchannel|Turn change|Interruption|Silence|Overall|
+ |---|---|---|---|---|---|---|
+ |decode_asr_chunk_asr_model_valid.loss.ave/test|93.3|89.4|90.8|91.3|95.1|92.0|
+
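The table above reports per-class and overall ROC AUC for the five turn-taking targets. As a rough illustration of how such numbers can be computed from per-chunk class probabilities, here is a hedged sketch using scikit-learn; the class ordering and the macro average used for "Overall" are assumptions, not taken from the recipe's evaluation script.

```python
# Illustrative only: per-class one-vs-rest ROC AUC from predicted class probabilities.
# The class order below and the macro average for "Overall" are assumptions; the
# actual scoring in the recipe may differ.
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

classes = ["Continuation", "Backchannel", "Turn change", "Interruption", "Silence"]

# Stand-in data: y_true holds integer class ids, y_prob holds softmax outputs of shape (N, 5).
rng = np.random.default_rng(0)
y_true = rng.integers(0, len(classes), size=1000)
y_prob = rng.dirichlet(np.ones(len(classes)), size=1000)

y_true_bin = label_binarize(y_true, classes=range(len(classes)))
per_class = {c: roc_auc_score(y_true_bin[:, i], y_prob[:, i]) for i, c in enumerate(classes)}
per_class["Overall"] = float(np.mean(list(per_class.values())))  # macro average

for name, auc in per_class.items():
    print(f"{name}: {100 * auc:.1f}")
```
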
+ ## ASR config
+
+ <details><summary>expand</summary>
+
+ ```
+ config: conf/train_asr_whisper_3_uselast.yaml
+ print_config: false
+ log_level: INFO
+ drop_last_iter: false
+ dry_run: false
+ iterator_type: sequence
+ valid_iterator_type: null
+ output_dir: exp/asr_train_asr_whisper_3_uselast_raw_en_word
+ ngpu: 1
+ seed: 0
+ num_workers: 1
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: 8
+ dist_rank: 0
+ local_rank: 0
+ dist_master_addr: localhost
+ dist_master_port: 33429
+ dist_launcher: null
+ multiprocessing_distributed: true
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: true
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 32
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+ - loss
+ - min
+ keep_nbest_models: 10
+ nbest_averaging_interval: 0
+ grad_clip: 5.0
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: null
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ use_adapter: false
+ adapter: lora
+ save_strategy: all
+ adapter_conf: {}
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param:
+ - encoder
+ num_iters_per_epoch: 750
+ batch_size: 4000
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/asr_stats_raw_en_word/train/speech_shape
+ - exp/asr_stats_raw_en_word/train/text_shape.word
+ valid_shape_file:
+ - exp/asr_stats_raw_en_word/valid/speech_shape
+ - exp/asr_stats_raw_en_word/valid/text_shape.word
+ batch_type: folded
+ valid_batch_type: null
+ fold_length:
+ - 80000
+ - 150
+ sort_in_batch: descending
+ shuffle_within_batch: false
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 500
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ chunk_excluded_key_prefixes: []
+ chunk_default_fs: null
+ train_data_path_and_name_and_type:
+ - - dump/raw/train/wav.scp
+ - speech
+ - kaldi_ark
+ - - dump/raw/train/text
+ - text
+ - text
+ valid_data_path_and_name_and_type:
+ - - dump/raw/valid/wav.scp
+ - speech
+ - kaldi_ark
+ - - dump/raw/valid/text
+ - text
+ - text
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ allow_multi_rates: false
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adam
+ optim_conf:
+ lr: 0.0005
+ scheduler: warmuplr
+ scheduler_conf:
+ warmup_steps: 500
+ token_list:
+ - <blank>
+ - <unk>
+ - C
+ - NA
+ - I
+ - BC
+ - T
+ - <sos/eos>
+ init: null
+ input_size: 1
+ ctc_conf:
+ dropout_rate: 0.0
+ ctc_type: builtin
+ reduce: true
+ ignore_nan_grad: null
+ zero_infinity: true
+ brctc_risk_strategy: exp
+ brctc_group_strategy: end
+ brctc_risk_factor: 0.0
+ joint_net_conf: null
+ use_preprocessor: true
+ use_lang_prompt: false
+ use_nlp_prompt: false
+ token_type: word
+ bpemodel: null
+ non_linguistic_symbols: null
+ cleaner: null
+ g2p: null
+ speech_volume_normalize: null
+ rir_scp: null
+ rir_apply_prob: 1.0
+ noise_scp: null
+ noise_apply_prob: 1.0
+ noise_db_range: '13_15'
+ short_noise_thres: 0.5
+ aux_ctc_tasks: []
+ frontend: null
+ frontend_conf: {}
+ specaug: null
+ specaug_conf: {}
+ normalize: null
+ normalize_conf: {}
+ model: espnet
+ model_conf:
+ ctc_weight: 0.0
+ lsm_weight: 0.1
+ length_normalized_loss: false
+ superb_setup: true
+ num_class: 5
+ ssl_input_size: 1024
+ extract_feats_in_collect_stats: false
+ use_only_last_correct: true
+ preencoder: null
+ preencoder_conf: {}
+ encoder: whisper
+ encoder_conf:
+ whisper_model: medium
+ dropout_rate: 0.0
+ use_specaug: false
+ specaug_conf:
+ apply_time_warp: true
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 40
+ num_freq_mask: 2
+ apply_time_mask: true
+ time_mask_width_ratio_range:
+ - 0.0
+ - 0.12
+ num_time_mask: 5
+ postencoder: null
+ postencoder_conf: {}
+ decoder: null
+ decoder_conf: {}
+ preprocessor: default
+ preprocessor_conf: {}
+ required:
+ - output_dir
+ - token_list
+ version: '202402'
+ distributed: true
+ ```
+
+ </details>
+
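In this configuration the Whisper-medium encoder is listed under `freeze_param`, so only the small turn-taking prediction head on top of the frozen encoder is updated during training. The sketch below illustrates what freezing a named submodule amounts to in plain PyTorch; the `Dummy` model is a made-up stand-in, not the actual ESPnet model class.

```python
# Sketch of what `freeze_param: [encoder]` amounts to: parameters whose names start
# with "encoder" are excluded from gradient updates. The Dummy model below is a
# stand-in; it is not the ESPnet model definition.
import torch.nn as nn

def freeze_params_by_prefix(model: nn.Module, prefix: str = "encoder") -> int:
    """Disable gradients for all parameters whose name starts with `prefix`."""
    n_frozen = 0
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
            n_frozen += param.numel()
    return n_frozen

class Dummy(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.encoder = nn.Linear(1024, 1024)  # stand-in for the Whisper encoder
        self.head = nn.Linear(1024, 5)        # stand-in for the 5-class turn-taking head

model = Dummy()
print(freeze_params_by_prefix(model), "parameters frozen")  # only `head` keeps gradients
```
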
+
+ ### Citing ESPnet
+
+ ```bibtex
+ @inproceedings{watanabe2018espnet,
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
+ year={2018},
+ booktitle={Proceedings of Interspeech},
+ pages={2207--2211},
+ doi={10.21437/Interspeech.2018-1456},
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
+ }
+ ```
+
+ or arXiv:
+
+ ```bibtex
+ @misc{watanabe2018espnet,
+ title={ESPnet: End-to-End Speech Processing Toolkit},
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+ year={2018},
+ eprint={1804.00015},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }
+ ```
exp/asr_train_asr_whisper_turn_taking_raw_en_word/RESULTS.md ADDED
@@ -0,0 +1,9 @@
+ # RESULTS
+
+ ## asr_train_asr_whisper_turn_taking_target_raw_en_word
+ ### ROC_AUC
+
+ |dataset|Continuation|Backchannel|Turn change|Interruption|Silence|Overall|
+ |---|---|---|---|---|---|---|
+ |decode_asr_chunk_asr_model_valid.loss.ave/test|93.3|89.4|90.8|91.3|95.1|92.0|
+
exp/asr_train_asr_whisper_turn_taking_raw_en_word/config.yaml ADDED
@@ -0,0 +1,208 @@
+ config: conf/train_asr_whisper_3_uselast.yaml
+ print_config: false
+ log_level: INFO
+ drop_last_iter: false
+ dry_run: false
+ iterator_type: sequence
+ valid_iterator_type: null
+ output_dir: exp/asr_train_asr_whisper_3_uselast_raw_en_word
+ ngpu: 1
+ seed: 0
+ num_workers: 1
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: 8
+ dist_rank: 0
+ local_rank: 0
+ dist_master_addr: localhost
+ dist_master_port: 33429
+ dist_launcher: null
+ multiprocessing_distributed: true
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: true
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 32
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+ - loss
+ - min
+ keep_nbest_models: 10
+ nbest_averaging_interval: 0
+ grad_clip: 5.0
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: null
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ use_adapter: false
+ adapter: lora
+ save_strategy: all
+ adapter_conf: {}
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param:
+ - encoder
+ num_iters_per_epoch: 750
+ batch_size: 4000
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/asr_stats_raw_en_word/train/speech_shape
+ - exp/asr_stats_raw_en_word/train/text_shape.word
+ valid_shape_file:
+ - exp/asr_stats_raw_en_word/valid/speech_shape
+ - exp/asr_stats_raw_en_word/valid/text_shape.word
+ batch_type: folded
+ valid_batch_type: null
+ fold_length:
+ - 80000
+ - 150
+ sort_in_batch: descending
+ shuffle_within_batch: false
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 500
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ chunk_excluded_key_prefixes: []
+ chunk_default_fs: null
+ train_data_path_and_name_and_type:
+ - - dump/raw/train/wav.scp
+ - speech
+ - kaldi_ark
+ - - dump/raw/train/text
+ - text
+ - text
+ valid_data_path_and_name_and_type:
+ - - dump/raw/valid/wav.scp
+ - speech
+ - kaldi_ark
+ - - dump/raw/valid/text
+ - text
+ - text
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ allow_multi_rates: false
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adam
+ optim_conf:
+ lr: 0.0005
+ scheduler: warmuplr
+ scheduler_conf:
+ warmup_steps: 500
+ token_list:
+ - <blank>
+ - <unk>
+ - C
+ - NA
+ - I
+ - BC
+ - T
+ - <sos/eos>
+ init: null
+ input_size: 1
+ ctc_conf:
+ dropout_rate: 0.0
+ ctc_type: builtin
+ reduce: true
+ ignore_nan_grad: null
+ zero_infinity: true
+ brctc_risk_strategy: exp
+ brctc_group_strategy: end
+ brctc_risk_factor: 0.0
+ joint_net_conf: null
+ use_preprocessor: true
+ use_lang_prompt: false
+ use_nlp_prompt: false
+ token_type: word
+ bpemodel: null
+ non_linguistic_symbols: null
+ cleaner: null
+ g2p: null
+ speech_volume_normalize: null
+ rir_scp: null
+ rir_apply_prob: 1.0
+ noise_scp: null
+ noise_apply_prob: 1.0
+ noise_db_range: '13_15'
+ short_noise_thres: 0.5
+ aux_ctc_tasks: []
+ frontend: null
+ frontend_conf: {}
+ specaug: null
+ specaug_conf: {}
+ normalize: null
+ normalize_conf: {}
+ model: espnet
+ model_conf:
+ ctc_weight: 0.0
+ lsm_weight: 0.1
+ length_normalized_loss: false
+ superb_setup: true
+ num_class: 5
+ ssl_input_size: 1024
+ extract_feats_in_collect_stats: false
+ use_only_last_correct: true
+ preencoder: null
+ preencoder_conf: {}
+ encoder: whisper
+ encoder_conf:
+ whisper_model: medium
+ dropout_rate: 0.0
+ use_specaug: false
+ specaug_conf:
+ apply_time_warp: true
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 40
+ num_freq_mask: 2
+ apply_time_mask: true
+ time_mask_width_ratio_range:
+ - 0.0
+ - 0.12
+ num_time_mask: 5
+ postencoder: null
+ postencoder_conf: {}
+ decoder: null
+ decoder_conf: {}
+ preprocessor: default
+ preprocessor_conf: {}
+ required:
+ - output_dir
+ - token_list
+ version: '202402'
+ distributed: true
exp/asr_train_asr_whisper_turn_taking_raw_en_word/valid.loss.ave.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62c6cf9a1db90bf0f8c43c51cdace3b5bcb4ae42275a038fbbf2e3d3545864f5
+ size 1233224014
meta.yaml ADDED
@@ -0,0 +1,8 @@
+ espnet: '202412'
+ files:
+ asr_model_file: exp/asr_train_asr_whisper_turn_taking_raw_en_word/valid.loss.ave.pth
+ python: "3.9.13 (main, Aug 25 2022, 23:26:10) \n[GCC 11.2.0]"
+ timestamp: 1741218801.877928
+ torch: 2.0.0+cu117
+ yaml_files:
+ asr_train_config: exp/asr_train_asr_whisper_turn_taking_raw_en_word/config.yaml