Yoshiki committed on
Commit
570a094
1 Parent(s): d82e13d

Update model

Browse files
Files changed (25) hide show
  1. README.md +406 -1
  2. data/nlsyms.txt +3 -0
  3. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/RESULTS.md +33 -0
  4. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/RESULTS_enh.md +22 -0
  5. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/config.yaml +303 -0
  6. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/acc.png +0 -0
  7. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/backward_time.png +0 -0
  8. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/cer.png +0 -0
  9. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/cer_ctc.png +0 -0
  10. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/forward_time.png +0 -0
  11. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/gpu_max_cached_mem_GB.png +0 -0
  12. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/iter_time.png +0 -0
  13. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss.png +0 -0
  14. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_asr.png +0 -0
  15. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_att.png +0 -0
  16. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_ctc.png +0 -0
  17. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_enh.png +0 -0
  18. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/optim0_lr0.png +0 -0
  19. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/optim_step_time.png +0 -0
  20. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/train_time.png +0 -0
  21. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/wer.png +0 -0
  22. exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/valid.acc.ave_10best.pth +3 -0
  23. exp/lm_train_lm_transformer_en_char/12epoch.pth +3 -0
  24. exp/lm_train_lm_transformer_en_char/config.yaml +189 -0
  25. meta.yaml +10 -0
README.md CHANGED
@@ -1,3 +1,408 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - speech-enhancement-recognition
6
+ language: en
7
+ datasets:
8
+ - chime4
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 EnhS2T model
13
+
14
+ ### `Yoshiki/chime4_enh_asr1_wpd_wavlm_conformer`
15
+
16
+ This model was trained by Yoshiki using the chime4 recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 8ed83f45d5aa2ca6b3635e44b9c29afb9b5fb600
26
+ pip install -e .
27
+ cd egs2/chime4/enh_asr1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model Yoshiki/chime4_enh_asr1_wpd_wavlm_conformer
29
+ ```
30
+
31
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
32
+ # RESULTS
33
+ ## Environments
34
+ - date: `Tue Oct 11 02:40:53 UTC 2022`
35
+ - python version: `3.7.4 (default, Aug 13 2019, 20:35:49) [GCC 7.3.0]`
36
+ - espnet version: `espnet 202207`
37
+ - pytorch version: `pytorch 1.10.1+cu111`
38
+ - Git hash: ``
39
+ - Commit date: ``
40
+
41
+ ## enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char
42
+ ### WER
43
+
44
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
+ |---|---|---|---|---|---|---|---|---|
46
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_real_isolated_6ch_track|1640|27119|98.8|0.9|0.2|0.2|1.3|16.2|
47
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_simu_isolated_6ch_track|1640|27120|98.9|0.9|0.2|0.1|1.3|15.2|
48
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_real_isolated_6ch_track|1320|21409|98.4|1.4|0.2|0.2|1.8|20.6|
49
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_simu_isolated_6ch_track|1320|21416|98.9|1.0|0.2|0.1|1.2|15.2|
50
+
51
+ ### CER
52
+
53
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
54
+ |---|---|---|---|---|---|---|---|---|
55
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_real_isolated_6ch_track|1640|160390|99.7|0.1|0.2|0.2|0.5|16.2|
56
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_simu_isolated_6ch_track|1640|160400|99.7|0.1|0.2|0.1|0.5|15.2|
57
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_real_isolated_6ch_track|1320|126796|99.5|0.2|0.3|0.2|0.7|20.6|
58
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_simu_isolated_6ch_track|1320|126812|99.7|0.2|0.2|0.1|0.5|15.2|
59
+
60
+ ### TER
61
+
62
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
63
+ |---|---|---|---|---|---|---|---|---|
64
+
65
+ ## EnhS2T config
66
+
67
+ <details><summary>expand</summary>
68
+
69
+ ```
70
+ config: conf/tuning/train_enh_asr_wpd_init_noenhloss_wavlm_conformer.yaml
71
+ print_config: false
72
+ log_level: INFO
73
+ dry_run: false
74
+ iterator_type: sequence
75
+ output_dir: exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char
76
+ ngpu: 1
77
+ seed: 0
78
+ num_workers: 1
79
+ num_att_plot: 3
80
+ dist_backend: nccl
81
+ dist_init_method: env://
82
+ dist_world_size: null
83
+ dist_rank: null
84
+ local_rank: 0
85
+ dist_master_addr: null
86
+ dist_master_port: null
87
+ dist_launcher: null
88
+ multiprocessing_distributed: false
89
+ unused_parameters: true
90
+ sharded_ddp: false
91
+ cudnn_enabled: true
92
+ cudnn_benchmark: false
93
+ cudnn_deterministic: true
94
+ collect_stats: false
95
+ write_collected_feats: false
96
+ max_epoch: 31
97
+ patience: 10
98
+ val_scheduler_criterion:
99
+ - valid
100
+ - loss
101
+ early_stopping_criterion:
102
+ - valid
103
+ - loss
104
+ - min
105
+ best_model_criterion:
106
+ - - valid
107
+ - acc
108
+ - max
109
+ - - train
110
+ - loss
111
+ - min
112
+ keep_nbest_models: 10
113
+ nbest_averaging_interval: 0
114
+ grad_clip: 1
115
+ grad_clip_type: 2.0
116
+ grad_noise: false
117
+ accum_grad: 2
118
+ no_forward_run: false
119
+ resume: true
120
+ train_dtype: float32
121
+ use_amp: false
122
+ log_interval: null
123
+ use_matplotlib: true
124
+ use_tensorboard: true
125
+ create_graph_in_tensorboard: false
126
+ use_wandb: false
127
+ wandb_project: null
128
+ wandb_id: null
129
+ wandb_entity: null
130
+ wandb_name: null
131
+ wandb_model_log_interval: -1
132
+ detect_anomaly: false
133
+ pretrain_path: null
134
+ init_param:
135
+ - ../enh1/exp/enh_train_enh_beamformer_wpd_ci_sdr_shorttap_raw/valid.loss.best.pth:separator:enh_model.separator
136
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:frontend:s2t_model.frontend
137
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:preencoder:s2t_model.preencoder
138
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:encoder:s2t_model.encoder
139
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:ctc:s2t_model.ctc
140
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:decoder:s2t_model.decoder
141
+ ignore_init_mismatch: false
142
+ freeze_param:
143
+ - s2t_model.frontend.upstream
144
+ num_iters_per_epoch: null
145
+ batch_size: 16
146
+ valid_batch_size: null
147
+ batch_bins: 1000000
148
+ valid_batch_bins: null
149
+ train_shape_file:
150
+ - exp/enh_asr_stats_raw_en_char/train/speech_shape
151
+ - exp/enh_asr_stats_raw_en_char/train/speech_ref1_shape
152
+ - exp/enh_asr_stats_raw_en_char/train/text_spk1_shape.char
153
+ valid_shape_file:
154
+ - exp/enh_asr_stats_raw_en_char/valid/speech_shape
155
+ - exp/enh_asr_stats_raw_en_char/valid/speech_ref1_shape
156
+ - exp/enh_asr_stats_raw_en_char/valid/text_spk1_shape.char
157
+ batch_type: folded
158
+ valid_batch_type: null
159
+ fold_length:
160
+ - 80000
161
+ - 80000
162
+ - 150
163
+ sort_in_batch: descending
164
+ sort_batch: descending
165
+ multiple_iterator: false
166
+ chunk_length: 500
167
+ chunk_shift_ratio: 0.5
168
+ num_cache_chunks: 1024
169
+ train_data_path_and_name_and_type:
170
+ - - dump/raw/tr05_multi_isolated_6ch_track/wav.scp
171
+ - speech
172
+ - sound
173
+ - - dump/raw/tr05_multi_isolated_6ch_track/spk1.scp
174
+ - speech_ref1
175
+ - sound
176
+ - - dump/raw/tr05_multi_isolated_6ch_track/text_spk1
177
+ - text_spk1
178
+ - text
179
+ valid_data_path_and_name_and_type:
180
+ - - dump/raw/dt05_multi_isolated_6ch_track/wav.scp
181
+ - speech
182
+ - sound
183
+ - - dump/raw/dt05_multi_isolated_6ch_track/spk1.scp
184
+ - speech_ref1
185
+ - sound
186
+ - - dump/raw/dt05_multi_isolated_6ch_track/text_spk1
187
+ - text_spk1
188
+ - text
189
+ allow_variable_data_keys: false
190
+ max_cache_size: 0.0
191
+ max_cache_fd: 32
192
+ valid_max_cache_size: null
193
+ optim: sgd
194
+ optim_conf:
195
+ lr: 0.001
196
+ momentum: 0.9
197
+ scheduler: null
198
+ scheduler_conf: {}
199
+ token_list: data/en_token_list/char/tokens.txt
200
+ src_token_list: null
201
+ init: xavier_uniform
202
+ input_size: null
203
+ ctc_conf:
204
+ dropout_rate: 0.0
205
+ ctc_type: builtin
206
+ reduce: true
207
+ ignore_nan_grad: null
208
+ zero_infinity: true
209
+ enh_criterions:
210
+ - name: ci_sdr
211
+ conf:
212
+ filter_length: 512
213
+ wrapper: fixed_order
214
+ wrapper_conf:
215
+ weight: 0.1
216
+ diar_num_spk: null
217
+ diar_input_size: null
218
+ enh_model_conf:
219
+ stft_consistency: false
220
+ loss_type: mask_mse
221
+ mask_type: null
222
+ asr_model_conf:
223
+ ctc_weight: 0.3
224
+ lsm_weight: 0.1
225
+ length_normalized_loss: false
226
+ extract_feats_in_collect_stats: false
227
+ st_model_conf:
228
+ stft_consistency: false
229
+ loss_type: mask_mse
230
+ mask_type: null
231
+ diar_model_conf:
232
+ diar_weight: 1.0
233
+ attractor_weight: 1.0
234
+ subtask_series:
235
+ - enh
236
+ - asr
237
+ model_conf:
238
+ calc_enh_loss: false
239
+ bypass_enh_prob: 0.0
240
+ use_preprocessor: true
241
+ token_type: char
242
+ bpemodel: null
243
+ src_token_type: bpe
244
+ src_bpemodel: null
245
+ non_linguistic_symbols: data/nlsyms.txt
246
+ cleaner: null
247
+ g2p: null
248
+ text_name:
249
+ - text_spk1
250
+ enh_encoder: stft
251
+ enh_encoder_conf:
252
+ n_fft: 512
253
+ win_length: 400
254
+ hop_length: 128
255
+ use_builtin_complex: false
256
+ enh_separator: wpe_beamformer
257
+ enh_separator_conf:
258
+ num_spk: 1
259
+ loss_type: spectrum
260
+ use_wpe: false
261
+ wnet_type: blstmp
262
+ wlayers: 3
263
+ wunits: 512
264
+ wprojs: 512
265
+ wdropout_rate: 0.0
266
+ taps: 3
267
+ delay: 3
268
+ use_dnn_mask_for_wpe: true
269
+ use_beamformer: true
270
+ bnet_type: blstmp
271
+ blayers: 3
272
+ bunits: 512
273
+ bprojs: 512
274
+ badim: 320
275
+ ref_channel: 4
276
+ use_noise_mask: true
277
+ beamformer_type: wpd_souden
278
+ bdropout_rate: 0.0
279
+ enh_decoder: stft
280
+ enh_decoder_conf:
281
+ n_fft: 512
282
+ win_length: 400
283
+ hop_length: 128
284
+ enh_mask_module: multi_mask
285
+ enh_mask_module_conf: {}
286
+ frontend: s3prl
287
+ frontend_conf:
288
+ frontend_conf:
289
+ upstream: wavlm_large
290
+ download_dir: ./hub
291
+ multilayer_feature: true
292
+ fs: 16k
293
+ specaug: specaug
294
+ specaug_conf:
295
+ apply_time_warp: true
296
+ time_warp_window: 5
297
+ time_warp_mode: bicubic
298
+ apply_freq_mask: true
299
+ freq_mask_width_range:
300
+ - 0
301
+ - 100
302
+ num_freq_mask: 4
303
+ apply_time_mask: true
304
+ time_mask_width_range:
305
+ - 0
306
+ - 40
307
+ num_time_mask: 2
308
+ normalize: utterance_mvn
309
+ normalize_conf: {}
310
+ asr_preencoder: linear
311
+ asr_preencoder_conf:
312
+ input_size: 1024
313
+ output_size: 80
314
+ asr_encoder: conformer
315
+ asr_encoder_conf:
316
+ output_size: 256
317
+ attention_heads: 4
318
+ linear_units: 2048
319
+ num_blocks: 12
320
+ dropout_rate: 0.1
321
+ positional_dropout_rate: 0.1
322
+ attention_dropout_rate: 0.0
323
+ input_layer: conv2d2
324
+ normalize_before: true
325
+ macaron_style: true
326
+ pos_enc_layer_type: rel_pos
327
+ selfattention_layer_type: rel_selfattn
328
+ activation_type: swish
329
+ use_cnn_module: true
330
+ cnn_module_kernel: 15
331
+ asr_postencoder: null
332
+ asr_postencoder_conf: {}
333
+ asr_decoder: transformer
334
+ asr_decoder_conf:
335
+ input_layer: embed
336
+ attention_heads: 4
337
+ linear_units: 2048
338
+ num_blocks: 6
339
+ dropout_rate: 0.1
340
+ positional_dropout_rate: 0.1
341
+ self_attention_dropout_rate: 0.0
342
+ src_attention_dropout_rate: 0.0
343
+ st_preencoder: null
344
+ st_preencoder_conf: {}
345
+ st_encoder: rnn
346
+ st_encoder_conf: {}
347
+ st_postencoder: null
348
+ st_postencoder_conf: {}
349
+ st_decoder: rnn
350
+ st_decoder_conf: {}
351
+ st_extra_asr_decoder: rnn
352
+ st_extra_asr_decoder_conf: {}
353
+ st_extra_mt_decoder: rnn
354
+ st_extra_mt_decoder_conf: {}
355
+ diar_frontend: default
356
+ diar_frontend_conf: {}
357
+ diar_specaug: null
358
+ diar_specaug_conf: {}
359
+ diar_normalize: utterance_mvn
360
+ diar_normalize_conf: {}
361
+ diar_encoder: transformer
362
+ diar_encoder_conf: {}
363
+ diar_decoder: linear
364
+ diar_decoder_conf: {}
365
+ label_aggregator: label_aggregator
366
+ label_aggregator_conf: {}
367
+ diar_attractor: null
368
+ diar_attractor_conf: {}
369
+ required:
370
+ - output_dir
371
+ version: '202207'
372
+ distributed: false
373
+ ```
374
+
375
+ </details>
376
+
377
+
378
+
379
+ ### Citing ESPnet
380
+
381
+ ```bibtex
382
+ @inproceedings{watanabe2018espnet,
383
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
384
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
385
+ year={2018},
386
+ booktitle={Proceedings of Interspeech},
387
+ pages={2207--2211},
388
+ doi={10.21437/Interspeech.2018-1456},
389
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
390
+ }
391
+
392
+
393
+
394
+
395
+ ```
396
+
397
+ or arXiv:
398
+
399
+ ```bibtex
400
+ @misc{watanabe2018espnet,
401
+ title={ESPnet: End-to-End Speech Processing Toolkit},
402
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
403
+ year={2018},
404
+ eprint={1804.00015},
405
+ archivePrefix={arXiv},
406
+ primaryClass={cs.CL}
407
+ }
408
+ ```
data/nlsyms.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ <*IN*>
2
+ <*MR.*>
3
+ <NOISE>
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/RESULTS.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Tue Oct 11 02:40:53 UTC 2022`
5
+ - python version: `3.7.4 (default, Aug 13 2019, 20:35:49) [GCC 7.3.0]`
6
+ - espnet version: `espnet 202207`
7
+ - pytorch version: `pytorch 1.10.1+cu111`
8
+ - Git hash: ``
9
+ - Commit date: ``
10
+
11
+ ## enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_real_isolated_6ch_track|1640|27119|98.8|0.9|0.2|0.2|1.3|16.2|
17
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_simu_isolated_6ch_track|1640|27120|98.9|0.9|0.2|0.1|1.3|15.2|
18
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_real_isolated_6ch_track|1320|21409|98.4|1.4|0.2|0.2|1.8|20.6|
19
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_simu_isolated_6ch_track|1320|21416|98.9|1.0|0.2|0.1|1.2|15.2|
20
+
21
+ ### CER
22
+
23
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
24
+ |---|---|---|---|---|---|---|---|---|
25
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_real_isolated_6ch_track|1640|160390|99.7|0.1|0.2|0.2|0.5|16.2|
26
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/dt05_simu_isolated_6ch_track|1640|160400|99.7|0.1|0.2|0.1|0.5|15.2|
27
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_real_isolated_6ch_track|1320|126796|99.5|0.2|0.3|0.2|0.7|20.6|
28
+ |decode_asr_transformer_largelm_normalize_output_wavtrue_lm_lm_train_lm_transformer_en_char_valid.loss.ave_enh_asr_model_valid.acc.ave_10best/et05_simu_isolated_6ch_track|1320|126812|99.7|0.2|0.2|0.1|0.5|15.2|
29
+
30
+ ### TER
31
+
32
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
33
+ |---|---|---|---|---|---|---|---|---|
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/RESULTS_enh.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Tue Oct 11 02:43:26 UTC 2022`
5
+ - python version: `3.7.4 (default, Aug 13 2019, 20:35:49) [GCC 7.3.0]`
6
+ - espnet version: `espnet 202207`
7
+ - pytorch version: `pytorch 1.10.1+cu111`
8
+ - Git hash: ``
9
+ - Commit date: ``
10
+
11
+
12
+ ## enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char
13
+
14
+ config: conf/tuning/train_enh_asr_wpd_init_noenhloss_wavlm_conformer.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
17
+ |---|---|---|---|---|---|
18
+ |enhanced_dt05_real_isolated_6ch_track|59.38|-0.54|-0.54|0.00|-28.31|
19
+ |enhanced_dt05_simu_isolated_6ch_track|94.48|14.95|14.95|0.00|12.43|
20
+ |enhanced_et05_real_isolated_6ch_track|51.17|-2.56|-2.56|0.00|-30.17|
21
+ |enhanced_et05_simu_isolated_6ch_track|94.93|16.08|16.08|0.00|13.98|
22
+
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/config.yaml ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_asr_wpd_init_noenhloss_wavlm_conformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 31
28
+ patience: 10
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 10
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 2
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param:
66
+ - ../enh1/exp/enh_train_enh_beamformer_wpd_ci_sdr_shorttap_raw/valid.loss.best.pth:separator:enh_model.separator
67
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:frontend:s2t_model.frontend
68
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:preencoder:s2t_model.preencoder
69
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:encoder:s2t_model.encoder
70
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:ctc:s2t_model.ctc
71
+ - ../asr1/exp/asr_train_asr_conformer_wavlm2_raw_en_char/valid.acc.best.pth:decoder:s2t_model.decoder
72
+ ignore_init_mismatch: false
73
+ freeze_param:
74
+ - s2t_model.frontend.upstream
75
+ num_iters_per_epoch: null
76
+ batch_size: 16
77
+ valid_batch_size: null
78
+ batch_bins: 1000000
79
+ valid_batch_bins: null
80
+ train_shape_file:
81
+ - exp/enh_asr_stats_raw_en_char/train/speech_shape
82
+ - exp/enh_asr_stats_raw_en_char/train/speech_ref1_shape
83
+ - exp/enh_asr_stats_raw_en_char/train/text_spk1_shape.char
84
+ valid_shape_file:
85
+ - exp/enh_asr_stats_raw_en_char/valid/speech_shape
86
+ - exp/enh_asr_stats_raw_en_char/valid/speech_ref1_shape
87
+ - exp/enh_asr_stats_raw_en_char/valid/text_spk1_shape.char
88
+ batch_type: folded
89
+ valid_batch_type: null
90
+ fold_length:
91
+ - 80000
92
+ - 80000
93
+ - 150
94
+ sort_in_batch: descending
95
+ sort_batch: descending
96
+ multiple_iterator: false
97
+ chunk_length: 500
98
+ chunk_shift_ratio: 0.5
99
+ num_cache_chunks: 1024
100
+ train_data_path_and_name_and_type:
101
+ - - dump/raw/tr05_multi_isolated_6ch_track/wav.scp
102
+ - speech
103
+ - sound
104
+ - - dump/raw/tr05_multi_isolated_6ch_track/spk1.scp
105
+ - speech_ref1
106
+ - sound
107
+ - - dump/raw/tr05_multi_isolated_6ch_track/text_spk1
108
+ - text_spk1
109
+ - text
110
+ valid_data_path_and_name_and_type:
111
+ - - dump/raw/dt05_multi_isolated_6ch_track/wav.scp
112
+ - speech
113
+ - sound
114
+ - - dump/raw/dt05_multi_isolated_6ch_track/spk1.scp
115
+ - speech_ref1
116
+ - sound
117
+ - - dump/raw/dt05_multi_isolated_6ch_track/text_spk1
118
+ - text_spk1
119
+ - text
120
+ allow_variable_data_keys: false
121
+ max_cache_size: 0.0
122
+ max_cache_fd: 32
123
+ valid_max_cache_size: null
124
+ optim: sgd
125
+ optim_conf:
126
+ lr: 0.001
127
+ momentum: 0.9
128
+ scheduler: null
129
+ scheduler_conf: {}
130
+ token_list: data/en_token_list/char/tokens.txt
131
+ src_token_list: null
132
+ init: xavier_uniform
133
+ input_size: null
134
+ ctc_conf:
135
+ dropout_rate: 0.0
136
+ ctc_type: builtin
137
+ reduce: true
138
+ ignore_nan_grad: null
139
+ zero_infinity: true
140
+ enh_criterions:
141
+ - name: ci_sdr
142
+ conf:
143
+ filter_length: 512
144
+ wrapper: fixed_order
145
+ wrapper_conf:
146
+ weight: 0.1
147
+ diar_num_spk: null
148
+ diar_input_size: null
149
+ enh_model_conf:
150
+ stft_consistency: false
151
+ loss_type: mask_mse
152
+ mask_type: null
153
+ asr_model_conf:
154
+ ctc_weight: 0.3
155
+ lsm_weight: 0.1
156
+ length_normalized_loss: false
157
+ extract_feats_in_collect_stats: false
158
+ st_model_conf:
159
+ stft_consistency: false
160
+ loss_type: mask_mse
161
+ mask_type: null
162
+ diar_model_conf:
163
+ diar_weight: 1.0
164
+ attractor_weight: 1.0
165
+ subtask_series:
166
+ - enh
167
+ - asr
168
+ model_conf:
169
+ calc_enh_loss: false
170
+ bypass_enh_prob: 0.0
171
+ use_preprocessor: true
172
+ token_type: char
173
+ bpemodel: null
174
+ src_token_type: bpe
175
+ src_bpemodel: null
176
+ non_linguistic_symbols: data/nlsyms.txt
177
+ cleaner: null
178
+ g2p: null
179
+ text_name:
180
+ - text_spk1
181
+ enh_encoder: stft
182
+ enh_encoder_conf:
183
+ n_fft: 512
184
+ win_length: 400
185
+ hop_length: 128
186
+ use_builtin_complex: false
187
+ enh_separator: wpe_beamformer
188
+ enh_separator_conf:
189
+ num_spk: 1
190
+ loss_type: spectrum
191
+ use_wpe: false
192
+ wnet_type: blstmp
193
+ wlayers: 3
194
+ wunits: 512
195
+ wprojs: 512
196
+ wdropout_rate: 0.0
197
+ taps: 3
198
+ delay: 3
199
+ use_dnn_mask_for_wpe: true
200
+ use_beamformer: true
201
+ bnet_type: blstmp
202
+ blayers: 3
203
+ bunits: 512
204
+ bprojs: 512
205
+ badim: 320
206
+ ref_channel: 4
207
+ use_noise_mask: true
208
+ beamformer_type: wpd_souden
209
+ bdropout_rate: 0.0
210
+ enh_decoder: stft
211
+ enh_decoder_conf:
212
+ n_fft: 512
213
+ win_length: 400
214
+ hop_length: 128
215
+ enh_mask_module: multi_mask
216
+ enh_mask_module_conf: {}
217
+ frontend: s3prl
218
+ frontend_conf:
219
+ frontend_conf:
220
+ upstream: wavlm_large
221
+ download_dir: ./hub
222
+ multilayer_feature: true
223
+ fs: 16k
224
+ specaug: specaug
225
+ specaug_conf:
226
+ apply_time_warp: true
227
+ time_warp_window: 5
228
+ time_warp_mode: bicubic
229
+ apply_freq_mask: true
230
+ freq_mask_width_range:
231
+ - 0
232
+ - 100
233
+ num_freq_mask: 4
234
+ apply_time_mask: true
235
+ time_mask_width_range:
236
+ - 0
237
+ - 40
238
+ num_time_mask: 2
239
+ normalize: utterance_mvn
240
+ normalize_conf: {}
241
+ asr_preencoder: linear
242
+ asr_preencoder_conf:
243
+ input_size: 1024
244
+ output_size: 80
245
+ asr_encoder: conformer
246
+ asr_encoder_conf:
247
+ output_size: 256
248
+ attention_heads: 4
249
+ linear_units: 2048
250
+ num_blocks: 12
251
+ dropout_rate: 0.1
252
+ positional_dropout_rate: 0.1
253
+ attention_dropout_rate: 0.0
254
+ input_layer: conv2d2
255
+ normalize_before: true
256
+ macaron_style: true
257
+ pos_enc_layer_type: rel_pos
258
+ selfattention_layer_type: rel_selfattn
259
+ activation_type: swish
260
+ use_cnn_module: true
261
+ cnn_module_kernel: 15
262
+ asr_postencoder: null
263
+ asr_postencoder_conf: {}
264
+ asr_decoder: transformer
265
+ asr_decoder_conf:
266
+ input_layer: embed
267
+ attention_heads: 4
268
+ linear_units: 2048
269
+ num_blocks: 6
270
+ dropout_rate: 0.1
271
+ positional_dropout_rate: 0.1
272
+ self_attention_dropout_rate: 0.0
273
+ src_attention_dropout_rate: 0.0
274
+ st_preencoder: null
275
+ st_preencoder_conf: {}
276
+ st_encoder: rnn
277
+ st_encoder_conf: {}
278
+ st_postencoder: null
279
+ st_postencoder_conf: {}
280
+ st_decoder: rnn
281
+ st_decoder_conf: {}
282
+ st_extra_asr_decoder: rnn
283
+ st_extra_asr_decoder_conf: {}
284
+ st_extra_mt_decoder: rnn
285
+ st_extra_mt_decoder_conf: {}
286
+ diar_frontend: default
287
+ diar_frontend_conf: {}
288
+ diar_specaug: null
289
+ diar_specaug_conf: {}
290
+ diar_normalize: utterance_mvn
291
+ diar_normalize_conf: {}
292
+ diar_encoder: transformer
293
+ diar_encoder_conf: {}
294
+ diar_decoder: linear
295
+ diar_decoder_conf: {}
296
+ label_aggregator: label_aggregator
297
+ label_aggregator_conf: {}
298
+ diar_attractor: null
299
+ diar_attractor_conf: {}
300
+ required:
301
+ - output_dir
302
+ version: '202207'
303
+ distributed: false
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/acc.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/backward_time.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/cer.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/cer_ctc.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/forward_time.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/iter_time.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_asr.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_att.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_ctc.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/loss_enh.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/optim0_lr0.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/optim_step_time.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/train_time.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/images/wer.png ADDED
exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f39ec424fc0424d9ba4fe166e2a9da957ed7e98b92bd3a3336daf646f3cfbce
3
+ size 1492909697
exp/lm_train_lm_transformer_en_char/12epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13dc5af025af5be3560922587402820207a78d258eb78fe1d47553a62b4b5895
3
+ size 202293743
exp/lm_train_lm_transformer_en_char/config.yaml ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_lm_transformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp_real2/lm_train_lm_transformer_en_char
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 30
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 4
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: null
65
+ batch_size: 20
66
+ valid_batch_size: null
67
+ batch_bins: 150000
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp_real2/lm_stats_en_char/train/text_shape.char
71
+ valid_shape_file:
72
+ - exp_real2/lm_stats_en_char/valid/text_shape.char
73
+ batch_type: numel
74
+ valid_batch_type: null
75
+ fold_length:
76
+ - 150
77
+ sort_in_batch: descending
78
+ sort_batch: descending
79
+ multiple_iterator: false
80
+ chunk_length: 500
81
+ chunk_shift_ratio: 0.5
82
+ num_cache_chunks: 1024
83
+ train_data_path_and_name_and_type:
84
+ - - dump/raw/lm_train.txt
85
+ - text
86
+ - text
87
+ valid_data_path_and_name_and_type:
88
+ - - dump/raw/dt05_multi_isolated_1ch_track/text
89
+ - text
90
+ - text
91
+ allow_variable_data_keys: false
92
+ max_cache_size: 0.0
93
+ max_cache_fd: 32
94
+ valid_max_cache_size: null
95
+ optim: adam
96
+ optim_conf:
97
+ lr: 0.001
98
+ scheduler: warmuplr
99
+ scheduler_conf:
100
+ warmup_steps: 25000
101
+ token_list:
102
+ - <blank>
103
+ - <unk>
104
+ - <space>
105
+ - E
106
+ - T
107
+ - A
108
+ - N
109
+ - I
110
+ - O
111
+ - S
112
+ - R
113
+ - H
114
+ - L
115
+ - D
116
+ - C
117
+ - U
118
+ - M
119
+ - P
120
+ - F
121
+ - G
122
+ - Y
123
+ - W
124
+ - B
125
+ - V
126
+ - K
127
+ - .
128
+ - X
129
+ - ''''
130
+ - J
131
+ - Q
132
+ - Z
133
+ - ','
134
+ - '-'
135
+ - '"'
136
+ - <NOISE>
137
+ - '*'
138
+ - ':'
139
+ - (
140
+ - )
141
+ - '?'
142
+ - '&'
143
+ - ;
144
+ - '!'
145
+ - /
146
+ - '{'
147
+ - '}'
148
+ - '1'
149
+ - '2'
150
+ - '0'
151
+ - $
152
+ - '8'
153
+ - '9'
154
+ - '6'
155
+ - '3'
156
+ - '5'
157
+ - '7'
158
+ - '4'
159
+ - '~'
160
+ - '`'
161
+ - _
162
+ - <*IN*>
163
+ - <*MR.*>
164
+ - \
165
+ - ^
166
+ - <sos/eos>
167
+ init: null
168
+ model_conf:
169
+ ignore_id: 0
170
+ use_preprocessor: true
171
+ token_type: char
172
+ bpemodel: null
173
+ non_linguistic_symbols: data/nlsyms.txt
174
+ cleaner: null
175
+ g2p: null
176
+ lm: transformer
177
+ lm_conf:
178
+ pos_enc: null
179
+ embed_unit: 128
180
+ att_unit: 512
181
+ head: 8
182
+ unit: 2048
183
+ layer: 16
184
+ dropout_rate: 0.1
185
+ required:
186
+ - output_dir
187
+ - token_list
188
+ version: 0.10.7a1
189
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
1
+ espnet: '202207'
2
+ files:
3
+ enh_s2t_model_file: exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/valid.acc.ave_10best.pth
4
+ lm_file: exp/lm_train_lm_transformer_en_char/12epoch.pth
5
+ python: "3.7.4 (default, Aug 13 2019, 20:35:49) \n[GCC 7.3.0]"
6
+ timestamp: 1665566217.541992
7
+ torch: 1.10.1+cu111
8
+ yaml_files:
9
+ enh_s2t_train_config: exp/enh_asr_train_enh_asr_wpd_init_noenhloss_wavlm_conformer_raw_en_char/config.yaml
10
+ lm_train_config: exp/lm_train_lm_transformer_en_char/config.yaml