pyf98 committed on
Commit
6670f92
1 Parent(s): b73bc70

add model files

Browse files
Files changed (32) hide show
  1. README.md +371 -0
  2. data/nlsyms.txt +3 -0
  3. exp/asr_stats_raw_en_char_sp/train/feats_stats.npz +3 -0
  4. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/RESULTS.md +57 -0
  5. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/config.yaml +266 -0
  6. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/acc.png +0 -0
  7. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/backward_time.png +0 -0
  8. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/cer.png +0 -0
  9. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/cer_ctc.png +0 -0
  10. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/forward_time.png +0 -0
  11. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/gpu_max_cached_mem_GB.png +0 -0
  12. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/iter_time.png +0 -0
  13. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/loss.png +0 -0
  14. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/loss_att.png +0 -0
  15. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/loss_ctc.png +0 -0
  16. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/optim0_lr0.png +0 -0
  17. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/optim_step_time.png +0 -0
  18. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/train_time.png +0 -0
  19. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/wer.png +0 -0
  20. exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/valid.acc.ave_10best.pth +3 -0
  21. exp/lm_train_lm_transformer_en_char/12epoch.pth +3 -0
  22. exp/lm_train_lm_transformer_en_char/config.yaml +189 -0
  23. exp/lm_train_lm_transformer_en_char/images/backward_time.png +0 -0
  24. exp/lm_train_lm_transformer_en_char/images/forward_time.png +0 -0
  25. exp/lm_train_lm_transformer_en_char/images/gpu_max_cached_mem_GB.png +0 -0
  26. exp/lm_train_lm_transformer_en_char/images/iter_time.png +0 -0
  27. exp/lm_train_lm_transformer_en_char/images/loss.png +0 -0
  28. exp/lm_train_lm_transformer_en_char/images/optim0_lr0.png +0 -0
  29. exp/lm_train_lm_transformer_en_char/images/optim_step_time.png +0 -0
  30. exp/lm_train_lm_transformer_en_char/images/train_time.png +0 -0
  31. exp/lm_train_lm_transformer_en_char/perplexity_test/ppl +1 -0
  32. meta.yaml +10 -0
README.md ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - automatic-speech-recognition
6
+ language: en
7
+ datasets:
8
+ - chime4
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ASR model
13
+
14
+ ### `pyf98/chime4_conformer_e12_linear1024`
15
+
16
+ This model was trained by Yifan Peng using the chime4 recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout ad91279f0108d54bd22abe29671b376f048822c5
26
+ pip install -e .
27
+ cd egs2/chime4/asr1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model pyf98/chime4_conformer_e12_linear1024
29
+ ```
30
+
31
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
32
+ # RESULTS
33
+ ## Environments
34
+ - date: `Wed Dec 28 15:49:24 EST 2022`
35
+ - python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]`
36
+ - espnet version: `espnet 202211`
37
+ - pytorch version: `pytorch 1.12.1`
38
+ - Git hash: `f9a8009aef6ff9ba192a78c19b619ae4a9f3b9d2`
39
+ - Commit date: `Wed Dec 28 00:30:54 2022 -0500`
40
+
41
+ ## asr_train_asr_conformer_e12_linear1024_raw_en_char_sp
42
+ ### WER
43
+
44
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
+ |---|---|---|---|---|---|---|---|---|
46
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_5mics|1640|27119|92.8|5.8|1.5|0.6|7.8|56.5|
47
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_5mics|1640|27120|91.3|6.7|2.0|0.8|9.5|60.5|
48
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_5mics|1320|21409|88.6|9.2|2.1|1.2|12.5|63.8|
49
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|21416|86.5|10.4|3.1|1.3|14.8|70.9|
50
+
51
+ ### CER
52
+
53
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
54
+ |---|---|---|---|---|---|---|---|---|
55
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_5mics|1640|160390|96.9|1.6|1.5|0.7|3.8|56.5|
56
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_5mics|1640|160400|96.0|2.0|2.0|1.0|4.9|60.5|
57
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_5mics|1320|126796|94.8|2.8|2.3|1.2|6.4|63.9|
58
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|126812|93.1|3.4|3.4|1.5|8.4|70.9|
59
+
60
+ ### TER
61
+
62
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
63
+ |---|---|---|---|---|---|---|---|---|
64
+
65
+ ## ASR config
66
+
67
+ <details><summary>expand</summary>
68
+
69
+ ```
70
+ config: conf/tuning/train_asr_conformer_e12_linear1024.yaml
71
+ print_config: false
72
+ log_level: INFO
73
+ dry_run: false
74
+ iterator_type: sequence
75
+ output_dir: exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp
76
+ ngpu: 1
77
+ seed: 2022
78
+ num_workers: 4
79
+ num_att_plot: 3
80
+ dist_backend: nccl
81
+ dist_init_method: env://
82
+ dist_world_size: null
83
+ dist_rank: null
84
+ local_rank: 0
85
+ dist_master_addr: null
86
+ dist_master_port: null
87
+ dist_launcher: null
88
+ multiprocessing_distributed: false
89
+ unused_parameters: false
90
+ sharded_ddp: false
91
+ cudnn_enabled: true
92
+ cudnn_benchmark: false
93
+ cudnn_deterministic: true
94
+ collect_stats: false
95
+ write_collected_feats: false
96
+ max_epoch: 60
97
+ patience: null
98
+ val_scheduler_criterion:
99
+ - valid
100
+ - loss
101
+ early_stopping_criterion:
102
+ - valid
103
+ - loss
104
+ - min
105
+ best_model_criterion:
106
+ - - valid
107
+ - acc
108
+ - max
109
+ keep_nbest_models: 10
110
+ nbest_averaging_interval: 0
111
+ grad_clip: 5.0
112
+ grad_clip_type: 2.0
113
+ grad_noise: false
114
+ accum_grad: 1
115
+ no_forward_run: false
116
+ resume: true
117
+ train_dtype: float32
118
+ use_amp: true
119
+ log_interval: null
120
+ use_matplotlib: true
121
+ use_tensorboard: true
122
+ create_graph_in_tensorboard: false
123
+ use_wandb: false
124
+ wandb_project: null
125
+ wandb_id: null
126
+ wandb_entity: null
127
+ wandb_name: null
128
+ wandb_model_log_interval: -1
129
+ detect_anomaly: false
130
+ pretrain_path: null
131
+ init_param: []
132
+ ignore_init_mismatch: false
133
+ freeze_param: []
134
+ num_iters_per_epoch: null
135
+ batch_size: 20
136
+ valid_batch_size: null
137
+ batch_bins: 15000000
138
+ valid_batch_bins: null
139
+ train_shape_file:
140
+ - exp/asr_stats_raw_en_char_sp/train/speech_shape
141
+ - exp/asr_stats_raw_en_char_sp/train/text_shape.char
142
+ valid_shape_file:
143
+ - exp/asr_stats_raw_en_char_sp/valid/speech_shape
144
+ - exp/asr_stats_raw_en_char_sp/valid/text_shape.char
145
+ batch_type: numel
146
+ valid_batch_type: null
147
+ fold_length:
148
+ - 80000
149
+ - 150
150
+ sort_in_batch: descending
151
+ sort_batch: descending
152
+ multiple_iterator: false
153
+ chunk_length: 500
154
+ chunk_shift_ratio: 0.5
155
+ num_cache_chunks: 1024
156
+ train_data_path_and_name_and_type:
157
+ - - dump/raw/tr05_multi_noisy_si284_sp/wav.scp
158
+ - speech
159
+ - kaldi_ark
160
+ - - dump/raw/tr05_multi_noisy_si284_sp/text
161
+ - text
162
+ - text
163
+ valid_data_path_and_name_and_type:
164
+ - - dump/raw/dt05_multi_isolated_1ch_track/wav.scp
165
+ - speech
166
+ - kaldi_ark
167
+ - - dump/raw/dt05_multi_isolated_1ch_track/text
168
+ - text
169
+ - text
170
+ allow_variable_data_keys: false
171
+ max_cache_size: 0.0
172
+ max_cache_fd: 32
173
+ valid_max_cache_size: null
174
+ optim: adam
175
+ optim_conf:
176
+ lr: 0.001
177
+ weight_decay: 1.0e-06
178
+ scheduler: warmuplr
179
+ scheduler_conf:
180
+ warmup_steps: 25000
181
+ token_list:
182
+ - <blank>
183
+ - <unk>
184
+ - <space>
185
+ - E
186
+ - T
187
+ - A
188
+ - N
189
+ - I
190
+ - O
191
+ - S
192
+ - R
193
+ - H
194
+ - L
195
+ - D
196
+ - C
197
+ - U
198
+ - M
199
+ - P
200
+ - F
201
+ - G
202
+ - Y
203
+ - W
204
+ - B
205
+ - V
206
+ - K
207
+ - .
208
+ - X
209
+ - ''''
210
+ - J
211
+ - Q
212
+ - Z
213
+ - ','
214
+ - '-'
215
+ - '"'
216
+ - <NOISE>
217
+ - '*'
218
+ - ':'
219
+ - (
220
+ - )
221
+ - '?'
222
+ - '&'
223
+ - ;
224
+ - '!'
225
+ - /
226
+ - '{'
227
+ - '}'
228
+ - '1'
229
+ - '2'
230
+ - '0'
231
+ - $
232
+ - '8'
233
+ - '9'
234
+ - '6'
235
+ - '3'
236
+ - '5'
237
+ - '7'
238
+ - '4'
239
+ - '~'
240
+ - '`'
241
+ - _
242
+ - <*IN*>
243
+ - <*MR.*>
244
+ - \
245
+ - ^
246
+ - <sos/eos>
247
+ init: null
248
+ input_size: null
249
+ ctc_conf:
250
+ dropout_rate: 0.0
251
+ ctc_type: builtin
252
+ reduce: true
253
+ ignore_nan_grad: null
254
+ zero_infinity: true
255
+ joint_net_conf: null
256
+ use_preprocessor: true
257
+ token_type: char
258
+ bpemodel: null
259
+ non_linguistic_symbols: data/nlsyms.txt
260
+ cleaner: null
261
+ g2p: null
262
+ speech_volume_normalize: null
263
+ rir_scp: null
264
+ rir_apply_prob: 1.0
265
+ noise_scp: null
266
+ noise_apply_prob: 1.0
267
+ noise_db_range: '13_15'
268
+ short_noise_thres: 0.5
269
+ frontend: default
270
+ frontend_conf:
271
+ n_fft: 512
272
+ win_length: 400
273
+ hop_length: 160
274
+ fs: 16k
275
+ specaug: specaug
276
+ specaug_conf:
277
+ apply_time_warp: true
278
+ time_warp_window: 5
279
+ time_warp_mode: bicubic
280
+ apply_freq_mask: true
281
+ freq_mask_width_range:
282
+ - 0
283
+ - 27
284
+ num_freq_mask: 2
285
+ apply_time_mask: true
286
+ time_mask_width_ratio_range:
287
+ - 0.0
288
+ - 0.05
289
+ num_time_mask: 2
290
+ normalize: global_mvn
291
+ normalize_conf:
292
+ stats_file: exp/asr_stats_raw_en_char_sp/train/feats_stats.npz
293
+ model: espnet
294
+ model_conf:
295
+ ctc_weight: 0.3
296
+ lsm_weight: 0.1
297
+ length_normalized_loss: false
298
+ preencoder: null
299
+ preencoder_conf: {}
300
+ encoder: conformer
301
+ encoder_conf:
302
+ output_size: 256
303
+ attention_heads: 4
304
+ linear_units: 1024
305
+ num_blocks: 12
306
+ dropout_rate: 0.1
307
+ positional_dropout_rate: 0.1
308
+ attention_dropout_rate: 0.1
309
+ input_layer: conv2d
310
+ normalize_before: true
311
+ macaron_style: true
312
+ rel_pos_type: latest
313
+ pos_enc_layer_type: rel_pos
314
+ selfattention_layer_type: rel_selfattn
315
+ activation_type: swish
316
+ use_cnn_module: true
317
+ cnn_module_kernel: 31
318
+ postencoder: null
319
+ postencoder_conf: {}
320
+ decoder: transformer
321
+ decoder_conf:
322
+ attention_heads: 4
323
+ linear_units: 2048
324
+ num_blocks: 6
325
+ dropout_rate: 0.1
326
+ positional_dropout_rate: 0.1
327
+ self_attention_dropout_rate: 0.1
328
+ src_attention_dropout_rate: 0.1
329
+ preprocessor: default
330
+ preprocessor_conf: {}
331
+ required:
332
+ - output_dir
333
+ - token_list
334
+ version: '202211'
335
+ distributed: false
336
+ ```
337
+
338
+ </details>
339
+
340
+
341
+
342
+ ### Citing ESPnet
343
+
344
+ ```bibtex
345
+ @inproceedings{watanabe2018espnet,
346
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
347
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
348
+ year={2018},
349
+ booktitle={Proceedings of Interspeech},
350
+ pages={2207--2211},
351
+ doi={10.21437/Interspeech.2018-1456},
352
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
353
+ }
354
+
355
+
356
+
357
+
358
+ ```
359
+
360
+ or arXiv:
361
+
362
+ ```bibtex
363
+ @misc{watanabe2018espnet,
364
+ title={ESPnet: End-to-End Speech Processing Toolkit},
365
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
366
+ year={2018},
367
+ eprint={1804.00015},
368
+ archivePrefix={arXiv},
369
+ primaryClass={cs.CL}
370
+ }
371
+ ```
data/nlsyms.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ <*IN*>
2
+ <*MR.*>
3
+ <NOISE>
exp/asr_stats_raw_en_char_sp/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2aca9a9d9bf95ba2222dd91ed8d62655135d2637bf76704cf984b7d0e90756c
3
+ size 1402
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/RESULTS.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Wed Dec 28 15:49:24 EST 2022`
5
+ - python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]`
6
+ - espnet version: `espnet 202211`
7
+ - pytorch version: `pytorch 1.12.1`
8
+ - Git hash: `f9a8009aef6ff9ba192a78c19b619ae4a9f3b9d2`
9
+ - Commit date: `Wed Dec 28 00:30:54 2022 -0500`
10
+
11
+ ## asr_train_asr_conformer_e12_linear1024_raw_en_char_sp
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_2mics|1640|27119|87.9|9.8|2.3|0.9|13.0|77.7|
17
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_5mics|1640|27119|89.5|8.6|1.9|0.8|11.3|74.0|
18
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_isolated_1ch_track|1640|27119|86.6|10.8|2.6|1.0|14.4|79.4|
19
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_2mics|1640|27120|85.6|11.5|3.0|1.2|15.7|78.9|
20
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_5mics|1640|27120|87.6|9.9|2.5|1.0|13.4|76.5|
21
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_isolated_1ch_track|1640|27120|84.2|12.8|3.0|1.3|17.1|80.5|
22
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_2mics|1320|21409|81.8|15.0|3.1|1.6|19.8|82.2|
23
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_5mics|1320|21409|84.6|12.7|2.6|1.4|16.7|78.7|
24
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_isolated_1ch_track|1320|21409|79.7|16.7|3.6|2.0|22.3|84.1|
25
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_2mics|1320|21416|79.6|16.3|4.1|1.8|22.2|84.7|
26
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|21416|82.2|14.3|3.4|1.5|19.3|83.0|
27
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_isolated_1ch_track|1320|21416|78.1|17.9|4.0|2.3|24.2|84.6|
28
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_5mics|1640|27119|92.8|5.8|1.5|0.6|7.8|56.5|
29
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_5mics|1640|27120|91.3|6.7|2.0|0.8|9.5|60.5|
30
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_5mics|1320|21409|88.6|9.2|2.1|1.2|12.5|63.8|
31
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|21416|86.5|10.4|3.1|1.3|14.8|70.9|
32
+
33
+ ### CER
34
+
35
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
36
+ |---|---|---|---|---|---|---|---|---|
37
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_2mics|1640|160390|95.1|2.6|2.3|1.1|6.0|77.7|
38
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_5mics|1640|160390|95.9|2.1|2.0|1.0|5.1|74.0|
39
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_isolated_1ch_track|1640|160390|94.4|2.9|2.6|1.3|6.9|79.4|
40
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_2mics|1640|160400|93.6|3.4|3.1|1.4|7.9|78.9|
41
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_5mics|1640|160400|94.8|2.7|2.5|1.2|6.4|76.5|
42
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_isolated_1ch_track|1640|160400|92.9|3.9|3.2|1.7|8.8|80.5|
43
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_2mics|1320|126796|91.9|4.5|3.6|1.9|10.0|82.2|
44
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_5mics|1320|126796|93.5|3.6|2.9|1.6|8.0|78.7|
45
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_isolated_1ch_track|1320|126796|90.7|5.2|4.1|2.3|11.6|84.1|
46
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_2mics|1320|126812|90.1|5.2|4.7|2.2|12.0|84.7|
47
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|126812|91.8|4.2|4.0|1.9|10.1|83.0|
48
+ |decode_asr_lm_lm_train_lm_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_isolated_1ch_track|1320|126812|89.3|6.0|4.8|2.7|13.5|84.6|
49
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_real_beamformit_5mics|1640|160390|96.9|1.6|1.5|0.7|3.8|56.5|
50
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/dt05_simu_beamformit_5mics|1640|160400|96.0|2.0|2.0|1.0|4.9|60.5|
51
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_real_beamformit_5mics|1320|126796|94.8|2.8|2.3|1.2|6.4|63.9|
52
+ |decode_asr_lm_lm_train_lm_transformer_en_char_valid.loss.ave_asr_model_valid.acc.ave/et05_simu_beamformit_5mics|1320|126812|93.1|3.4|3.4|1.5|8.4|70.9|
53
+
54
+ ### TER
55
+
56
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
57
+ |---|---|---|---|---|---|---|---|---|
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/config.yaml ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_asr_conformer_e12_linear1024.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp
7
+ ngpu: 1
8
+ seed: 2022
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 60
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: true
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 15000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/asr_stats_raw_en_char_sp/train/speech_shape
72
+ - exp/asr_stats_raw_en_char_sp/train/text_shape.char
73
+ valid_shape_file:
74
+ - exp/asr_stats_raw_en_char_sp/valid/speech_shape
75
+ - exp/asr_stats_raw_en_char_sp/valid/text_shape.char
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 80000
80
+ - 150
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr05_multi_noisy_si284_sp/wav.scp
89
+ - speech
90
+ - kaldi_ark
91
+ - - dump/raw/tr05_multi_noisy_si284_sp/text
92
+ - text
93
+ - text
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dt05_multi_isolated_1ch_track/wav.scp
96
+ - speech
97
+ - kaldi_ark
98
+ - - dump/raw/dt05_multi_isolated_1ch_track/text
99
+ - text
100
+ - text
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adam
106
+ optim_conf:
107
+ lr: 0.001
108
+ weight_decay: 1.0e-06
109
+ scheduler: warmuplr
110
+ scheduler_conf:
111
+ warmup_steps: 25000
112
+ token_list:
113
+ - <blank>
114
+ - <unk>
115
+ - <space>
116
+ - E
117
+ - T
118
+ - A
119
+ - N
120
+ - I
121
+ - O
122
+ - S
123
+ - R
124
+ - H
125
+ - L
126
+ - D
127
+ - C
128
+ - U
129
+ - M
130
+ - P
131
+ - F
132
+ - G
133
+ - Y
134
+ - W
135
+ - B
136
+ - V
137
+ - K
138
+ - .
139
+ - X
140
+ - ''''
141
+ - J
142
+ - Q
143
+ - Z
144
+ - ','
145
+ - '-'
146
+ - '"'
147
+ - <NOISE>
148
+ - '*'
149
+ - ':'
150
+ - (
151
+ - )
152
+ - '?'
153
+ - '&'
154
+ - ;
155
+ - '!'
156
+ - /
157
+ - '{'
158
+ - '}'
159
+ - '1'
160
+ - '2'
161
+ - '0'
162
+ - $
163
+ - '8'
164
+ - '9'
165
+ - '6'
166
+ - '3'
167
+ - '5'
168
+ - '7'
169
+ - '4'
170
+ - '~'
171
+ - '`'
172
+ - _
173
+ - <*IN*>
174
+ - <*MR.*>
175
+ - \
176
+ - ^
177
+ - <sos/eos>
178
+ init: null
179
+ input_size: null
180
+ ctc_conf:
181
+ dropout_rate: 0.0
182
+ ctc_type: builtin
183
+ reduce: true
184
+ ignore_nan_grad: null
185
+ zero_infinity: true
186
+ joint_net_conf: null
187
+ use_preprocessor: true
188
+ token_type: char
189
+ bpemodel: null
190
+ non_linguistic_symbols: data/nlsyms.txt
191
+ cleaner: null
192
+ g2p: null
193
+ speech_volume_normalize: null
194
+ rir_scp: null
195
+ rir_apply_prob: 1.0
196
+ noise_scp: null
197
+ noise_apply_prob: 1.0
198
+ noise_db_range: '13_15'
199
+ short_noise_thres: 0.5
200
+ frontend: default
201
+ frontend_conf:
202
+ n_fft: 512
203
+ win_length: 400
204
+ hop_length: 160
205
+ fs: 16k
206
+ specaug: specaug
207
+ specaug_conf:
208
+ apply_time_warp: true
209
+ time_warp_window: 5
210
+ time_warp_mode: bicubic
211
+ apply_freq_mask: true
212
+ freq_mask_width_range:
213
+ - 0
214
+ - 27
215
+ num_freq_mask: 2
216
+ apply_time_mask: true
217
+ time_mask_width_ratio_range:
218
+ - 0.0
219
+ - 0.05
220
+ num_time_mask: 2
221
+ normalize: global_mvn
222
+ normalize_conf:
223
+ stats_file: exp/asr_stats_raw_en_char_sp/train/feats_stats.npz
224
+ model: espnet
225
+ model_conf:
226
+ ctc_weight: 0.3
227
+ lsm_weight: 0.1
228
+ length_normalized_loss: false
229
+ preencoder: null
230
+ preencoder_conf: {}
231
+ encoder: conformer
232
+ encoder_conf:
233
+ output_size: 256
234
+ attention_heads: 4
235
+ linear_units: 1024
236
+ num_blocks: 12
237
+ dropout_rate: 0.1
238
+ positional_dropout_rate: 0.1
239
+ attention_dropout_rate: 0.1
240
+ input_layer: conv2d
241
+ normalize_before: true
242
+ macaron_style: true
243
+ rel_pos_type: latest
244
+ pos_enc_layer_type: rel_pos
245
+ selfattention_layer_type: rel_selfattn
246
+ activation_type: swish
247
+ use_cnn_module: true
248
+ cnn_module_kernel: 31
249
+ postencoder: null
250
+ postencoder_conf: {}
251
+ decoder: transformer
252
+ decoder_conf:
253
+ attention_heads: 4
254
+ linear_units: 2048
255
+ num_blocks: 6
256
+ dropout_rate: 0.1
257
+ positional_dropout_rate: 0.1
258
+ self_attention_dropout_rate: 0.1
259
+ src_attention_dropout_rate: 0.1
260
+ preprocessor: default
261
+ preprocessor_conf: {}
262
+ required:
263
+ - output_dir
264
+ - token_list
265
+ version: '202211'
266
+ distributed: false
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/acc.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/backward_time.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/cer.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/cer_ctc.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/forward_time.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/gpu_max_cached_mem_GB.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/iter_time.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/loss.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/loss_att.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/loss_ctc.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/optim0_lr0.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/optim_step_time.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/train_time.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/images/wer.png ADDED
exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:315c57ff4f30a06e6b1632a994fb1b84820aafcd6a267eb17ef25c90ea24ef32
3
+ size 122063653
exp/lm_train_lm_transformer_en_char/12epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13dc5af025af5be3560922587402820207a78d258eb78fe1d47553a62b4b5895
3
+ size 202293743
exp/lm_train_lm_transformer_en_char/config.yaml ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_grad: 4
2
+ allow_variable_data_keys: false
3
+ batch_bins: 150000
4
+ batch_size: 20
5
+ batch_type: numel
6
+ best_model_criterion:
7
+ - - valid
8
+ - loss
9
+ - min
10
+ bpemodel: null
11
+ chunk_length: 500
12
+ chunk_shift_ratio: 0.5
13
+ cleaner: null
14
+ collect_stats: false
15
+ config: conf/train_lm_transformer.yaml
16
+ cudnn_benchmark: false
17
+ cudnn_deterministic: true
18
+ cudnn_enabled: true
19
+ detect_anomaly: false
20
+ dist_backend: nccl
21
+ dist_init_method: env://
22
+ dist_launcher: null
23
+ dist_master_addr: null
24
+ dist_master_port: null
25
+ dist_rank: null
26
+ dist_world_size: null
27
+ distributed: false
28
+ dry_run: false
29
+ early_stopping_criterion:
30
+ - valid
31
+ - loss
32
+ - min
33
+ fold_length:
34
+ - 150
35
+ freeze_param: []
36
+ g2p: null
37
+ grad_clip: 5.0
38
+ grad_clip_type: 2.0
39
+ grad_noise: false
40
+ ignore_init_mismatch: false
41
+ init: null
42
+ init_param: []
43
+ iterator_type: sequence
44
+ keep_nbest_models: 10
45
+ lm: transformer
46
+ lm_conf:
47
+ att_unit: 512
48
+ dropout_rate: 0.1
49
+ embed_unit: 128
50
+ head: 8
51
+ layer: 16
52
+ pos_enc: null
53
+ unit: 2048
54
+ local_rank: 0
55
+ log_interval: null
56
+ log_level: INFO
57
+ max_cache_fd: 32
58
+ max_cache_size: 0.0
59
+ max_epoch: 30
60
+ model_conf:
61
+ ignore_id: 0
62
+ multiple_iterator: false
63
+ multiprocessing_distributed: false
64
+ nbest_averaging_interval: 0
65
+ ngpu: 1
66
+ no_forward_run: false
67
+ non_linguistic_symbols: data/nlsyms.txt
68
+ num_att_plot: 3
69
+ num_cache_chunks: 1024
70
+ num_iters_per_epoch: null
71
+ num_workers: 1
72
+ optim: adam
73
+ optim_conf:
74
+ lr: 0.001
75
+ output_dir: exp_real2/lm_train_lm_transformer_en_char
76
+ patience: null
77
+ pretrain_path: null
78
+ print_config: false
79
+ required:
80
+ - output_dir
81
+ - token_list
82
+ resume: true
83
+ scheduler: warmuplr
84
+ scheduler_conf:
85
+ warmup_steps: 25000
86
+ seed: 0
87
+ sharded_ddp: false
88
+ sort_batch: descending
89
+ sort_in_batch: descending
90
+ token_list:
91
+ - <blank>
92
+ - <unk>
93
+ - <space>
94
+ - E
95
+ - T
96
+ - A
97
+ - N
98
+ - I
99
+ - O
100
+ - S
101
+ - R
102
+ - H
103
+ - L
104
+ - D
105
+ - C
106
+ - U
107
+ - M
108
+ - P
109
+ - F
110
+ - G
111
+ - Y
112
+ - W
113
+ - B
114
+ - V
115
+ - K
116
+ - .
117
+ - X
118
+ - ''''
119
+ - J
120
+ - Q
121
+ - Z
122
+ - ','
123
+ - '-'
124
+ - '"'
125
+ - <NOISE>
126
+ - '*'
127
+ - ':'
128
+ - (
129
+ - )
130
+ - '?'
131
+ - '&'
132
+ - ;
133
+ - '!'
134
+ - /
135
+ - '{'
136
+ - '}'
137
+ - '1'
138
+ - '2'
139
+ - '0'
140
+ - $
141
+ - '8'
142
+ - '9'
143
+ - '6'
144
+ - '3'
145
+ - '5'
146
+ - '7'
147
+ - '4'
148
+ - '~'
149
+ - '`'
150
+ - _
151
+ - <*IN*>
152
+ - <*MR.*>
153
+ - \
154
+ - ^
155
+ - <sos/eos>
156
+ token_type: char
157
+ train_data_path_and_name_and_type:
158
+ - - dump/raw/lm_train.txt
159
+ - text
160
+ - text
161
+ train_dtype: float32
162
+ train_shape_file:
163
+ - exp_real2/lm_stats_en_char/train/text_shape.char
164
+ unused_parameters: false
165
+ use_amp: false
166
+ use_matplotlib: true
167
+ use_preprocessor: true
168
+ use_tensorboard: true
169
+ use_wandb: false
170
+ val_scheduler_criterion:
171
+ - valid
172
+ - loss
173
+ valid_batch_bins: null
174
+ valid_batch_size: null
175
+ valid_batch_type: null
176
+ valid_data_path_and_name_and_type:
177
+ - - dump/raw/dt05_multi_isolated_1ch_track/text
178
+ - text
179
+ - text
180
+ valid_max_cache_size: null
181
+ valid_shape_file:
182
+ - exp_real2/lm_stats_en_char/valid/text_shape.char
183
+ version: 0.10.7a1
184
+ wandb_entity: null
185
+ wandb_id: null
186
+ wandb_model_log_interval: -1
187
+ wandb_name: null
188
+ wandb_project: null
189
+ write_collected_feats: false
exp/lm_train_lm_transformer_en_char/images/backward_time.png ADDED
exp/lm_train_lm_transformer_en_char/images/forward_time.png ADDED
exp/lm_train_lm_transformer_en_char/images/gpu_max_cached_mem_GB.png ADDED
exp/lm_train_lm_transformer_en_char/images/iter_time.png ADDED
exp/lm_train_lm_transformer_en_char/images/loss.png ADDED
exp/lm_train_lm_transformer_en_char/images/optim0_lr0.png ADDED
exp/lm_train_lm_transformer_en_char/images/optim_step_time.png ADDED
exp/lm_train_lm_transformer_en_char/images/train_time.png ADDED
exp/lm_train_lm_transformer_en_char/perplexity_test/ppl ADDED
@@ -0,0 +1 @@
 
 
1
+ 1.8016810278173603
meta.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ espnet: '202211'
2
+ files:
3
+ asr_model_file: exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/valid.acc.ave_10best.pth
4
+ lm_file: exp/lm_train_lm_transformer_en_char/12epoch.pth
5
+ python: "3.9.15 (main, Nov 24 2022, 14:31:59) \n[GCC 11.2.0]"
6
+ timestamp: 1672282073.431392
7
+ torch: 1.12.1
8
+ yaml_files:
9
+ asr_train_config: exp/asr_train_asr_conformer_e12_linear1024_raw_en_char_sp/config.yaml
10
+ lm_train_config: exp/lm_train_lm_transformer_en_char/config.yaml