siuze committed on
Commit
e91aca8
1 Parent(s): afd261b

Update model

Browse files
README.md CHANGED
@@ -5,7 +5,7 @@ tags:
5
  - automatic-speech-recognition
6
  language: foc
7
  datasets:
8
- - mini_an4
9
  license: cc-by-4.0
10
  ---
11
 
@@ -13,7 +13,7 @@ license: cc-by-4.0
13
 
14
  ### `siuze/FOC-yngping`
15
 
16
- This model was trained by siuze using mini_an4 recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
  ### Demo: How to use in ESPnet2
19
 
@@ -24,273 +24,18 @@ if you haven't done that already.
24
  cd espnet
25
  git checkout 52160d6ed337e9dec74dd59695fec1548042e0b2
26
  pip install -e .
27
- cd egs2/mini_an4/foc
28
  ./run.sh --skip_data_prep false --skip_train true --download_model siuze/FOC-yngping
29
  ```
30
 
31
- <!-- Generated by scripts/utils/show_asr_result.sh -->
32
- # RESULTS
33
- ## Environments
34
- - date: `Wed Apr 19 20:30:35 CST 2023`
35
- - python version: `3.8.16 | packaged by conda-forge | (default, Feb 1 2023, 16:01:55) [GCC 11.3.0]`
36
- - espnet version: `espnet 202301`
37
- - pytorch version: `pytorch 1.10.0`
38
- - Git hash: `52160d6ed337e9dec74dd59695fec1548042e0b2`
39
- - Commit date: `Thu Mar 16 21:37:39 2023 +0000`
40
 
41
- ## exp/asr_train_asr_transformer_raw_foc_char
42
- ### WER
43
-
44
- |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
- |---|---|---|---|---|---|---|---|---|
46
- |inference_asr_model_valid.acc.ave/test|500|1083|74.5|24.9|0.6|0.4|25.9|41.2|
47
-
48
- ### CER
49
-
50
- |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
51
- |---|---|---|---|---|---|---|---|---|
52
- |inference_asr_model_valid.acc.ave/test|500|6377|93.8|4.4|1.8|1.9|8.1|41.2|
53
-
54
- ### TER
55
-
56
- |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
57
- |---|---|---|---|---|---|---|---|---|
58
 
59
  ## ASR config
60
 
61
  <details><summary>expand</summary>
62
 
63
  ```
64
- config: conf/train_asr_transformer.yaml
65
- print_config: false
66
- log_level: INFO
67
- dry_run: false
68
- iterator_type: sequence
69
- output_dir: exp/asr_train_asr_transformer_raw_foc_char
70
- ngpu: 1
71
- seed: 0
72
- num_workers: 1
73
- num_att_plot: 3
74
- dist_backend: nccl
75
- dist_init_method: env://
76
- dist_world_size: null
77
- dist_rank: null
78
- local_rank: 0
79
- dist_master_addr: null
80
- dist_master_port: null
81
- dist_launcher: null
82
- multiprocessing_distributed: false
83
- unused_parameters: false
84
- sharded_ddp: false
85
- cudnn_enabled: true
86
- cudnn_benchmark: false
87
- cudnn_deterministic: true
88
- collect_stats: false
89
- write_collected_feats: false
90
- max_epoch: 60
91
- patience: null
92
- val_scheduler_criterion:
93
- - valid
94
- - loss
95
- early_stopping_criterion:
96
- - valid
97
- - loss
98
- - min
99
- best_model_criterion:
100
- - - valid
101
- - acc
102
- - max
103
- keep_nbest_models: 10
104
- nbest_averaging_interval: 0
105
- grad_clip: 5.0
106
- grad_clip_type: 2.0
107
- grad_noise: false
108
- accum_grad: 8
109
- no_forward_run: false
110
- resume: true
111
- train_dtype: float32
112
- use_amp: false
113
- log_interval: null
114
- use_matplotlib: true
115
- use_tensorboard: true
116
- create_graph_in_tensorboard: false
117
- use_wandb: false
118
- wandb_project: null
119
- wandb_id: null
120
- wandb_entity: null
121
- wandb_name: null
122
- wandb_model_log_interval: -1
123
- detect_anomaly: false
124
- pretrain_path: null
125
- init_param:
126
- - /home/pro-c/yewei/espnet/egs2/mini_an4/asr1/exp/asr_train_asr_transformer_raw_can_char/valid.acc.best.pth
127
- ignore_init_mismatch: true
128
- freeze_param: []
129
- num_iters_per_epoch: null
130
- batch_size: 16
131
- valid_batch_size: null
132
- batch_bins: 1000000
133
- valid_batch_bins: null
134
- train_shape_file:
135
- - exp/asr_stats_raw_foc_char/train/speech_shape
136
- - exp/asr_stats_raw_foc_char/train/text_shape.char
137
- valid_shape_file:
138
- - exp/asr_stats_raw_foc_char/valid/speech_shape
139
- - exp/asr_stats_raw_foc_char/valid/text_shape.char
140
- batch_type: folded
141
- valid_batch_type: null
142
- fold_length:
143
- - 80000
144
- - 150
145
- sort_in_batch: descending
146
- sort_batch: descending
147
- multiple_iterator: false
148
- chunk_length: 500
149
- chunk_shift_ratio: 0.5
150
- num_cache_chunks: 1024
151
- chunk_excluded_key_prefixes: []
152
- train_data_path_and_name_and_type:
153
- - - dump/raw/train/wav.scp
154
- - speech
155
- - sound
156
- - - dump/raw/train/text
157
- - text
158
- - text
159
- valid_data_path_and_name_and_type:
160
- - - dump/raw/dev/wav.scp
161
- - speech
162
- - sound
163
- - - dump/raw/dev/text
164
- - text
165
- - text
166
- allow_variable_data_keys: false
167
- max_cache_size: 0.0
168
- max_cache_fd: 32
169
- valid_max_cache_size: null
170
- exclude_weight_decay: false
171
- exclude_weight_decay_conf: {}
172
- optim: adam
173
- optim_conf:
174
- lr: 0.005
175
- scheduler: warmuplr
176
- scheduler_conf:
177
- warmup_steps: 30000
178
- token_list:
179
- - <blank>
180
- - <unk>
181
- - <space>
182
- - '3'
183
- - '2'
184
- - '5'
185
- - g
186
- - o
187
- - a
188
- - n
189
- - i
190
- - '4'
191
- - u
192
- - e
193
- - k
194
- - '1'
195
- - j
196
- - y
197
- - z
198
- - s
199
- - h
200
- - d
201
- - m
202
- - l
203
- - c
204
- - b
205
- - f
206
- - t
207
- - w
208
- - p
209
- - r
210
- - x
211
- - v
212
- - q
213
- - <sos/eos>
214
- init: xavier_uniform
215
- input_size: null
216
- ctc_conf:
217
- dropout_rate: 0.0
218
- ctc_type: builtin
219
- reduce: true
220
- ignore_nan_grad: null
221
- zero_infinity: true
222
- joint_net_conf: null
223
- use_preprocessor: true
224
- token_type: char
225
- bpemodel: null
226
- non_linguistic_symbols: null
227
- cleaner: null
228
- g2p: null
229
- speech_volume_normalize: null
230
- rir_scp: null
231
- rir_apply_prob: 1.0
232
- noise_scp: null
233
- noise_apply_prob: 1.0
234
- noise_db_range: '13_15'
235
- short_noise_thres: 0.5
236
- aux_ctc_tasks: []
237
- frontend: default
238
- frontend_conf:
239
- fs: 16k
240
- specaug: specaug
241
- specaug_conf:
242
- apply_time_warp: true
243
- time_warp_window: 5
244
- time_warp_mode: bicubic
245
- apply_freq_mask: true
246
- freq_mask_width_range:
247
- - 0
248
- - 27
249
- num_freq_mask: 2
250
- apply_time_mask: true
251
- time_mask_width_ratio_range:
252
- - 0.0
253
- - 0.05
254
- num_time_mask: 10
255
- normalize: global_mvn
256
- normalize_conf:
257
- stats_file: exp/asr_stats_raw_foc_char/train/feats_stats.npz
258
- model: espnet
259
- model_conf:
260
- ctc_weight: 0.3
261
- lsm_weight: 0.1
262
- length_normalized_loss: false
263
- preencoder: null
264
- preencoder_conf: {}
265
- encoder: transformer
266
- encoder_conf:
267
- output_size: 256
268
- attention_heads: 4
269
- linear_units: 2048
270
- num_blocks: 12
271
- dropout_rate: 0.1
272
- positional_dropout_rate: 0.1
273
- attention_dropout_rate: 0.0
274
- input_layer: conv2d
275
- normalize_before: true
276
- postencoder: null
277
- postencoder_conf: {}
278
- decoder: transformer
279
- decoder_conf:
280
- attention_heads: 4
281
- linear_units: 2048
282
- num_blocks: 6
283
- dropout_rate: 0.1
284
- positional_dropout_rate: 0.1
285
- self_attention_dropout_rate: 0.0
286
- src_attention_dropout_rate: 0.0
287
- preprocessor: default
288
- preprocessor_conf: {}
289
- required:
290
- - output_dir
291
- - token_list
292
- version: '202301'
293
- distributed: false
294
  ```
295
 
296
  </details>
 
5
  - automatic-speech-recognition
6
  language: foc
7
  datasets:
8
+ - foc-can
9
  license: cc-by-4.0
10
  ---
11
 
 
13
 
14
  ### `siuze/FOC-yngping`
15
 
16
+ This model was trained by siuze using the foc-can recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
  ### Demo: How to use in ESPnet2
19
 
 
24
  cd espnet
25
  git checkout 52160d6ed337e9dec74dd59695fec1548042e0b2
26
  pip install -e .
27
+ cd egs2/foc-can/foc
28
  ./run.sh --skip_data_prep false --skip_train true --download_model siuze/FOC-yngping
29
  ```
30
 
 
 
 
 
 
 
 
 
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  ## ASR config
34
 
35
  <details><summary>expand</summary>
36
 
37
  ```
38
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  ```
40
 
41
  </details>
exp/asr_train/RESULTS.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Sun Apr 23 18:36:51 CST 2023`
5
+ - python version: `3.8.16 | packaged by conda-forge | (default, Feb 1 2023, 16:01:55) [GCC 11.3.0]`
6
+ - espnet version: `espnet 202301`
7
+ - pytorch version: `pytorch 1.10.0`
8
+ - Git hash: `52160d6ed337e9dec74dd59695fec1548042e0b2`
9
+ - Commit date: `Thu Mar 16 21:37:39 2023 +0000`
10
+
11
+ ## exp/asr_train_asr_transformer_raw_foc_char
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |inference_asr_model_valid.acc.ave/test|51|91|51.6|47.3|1.1|1.1|49.5|68.6|
17
+ |inference_asr_model_valid.acc.ave标准测试/test|500|1083|72.7|26.9|0.5|0.6|27.9|45.2|
18
+
19
+ ### CER
20
+
21
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
22
+ |---|---|---|---|---|---|---|---|---|
23
+ |inference_asr_model_valid.acc.ave/test|51|549|86.2|9.3|4.6|2.7|16.6|68.6|
24
+ |inference_asr_model_valid.acc.ave标准测试/test|500|6377|93.4|4.7|1.9|2.2|8.8|45.2|
25
+
26
+ ### TER
27
+
28
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
29
+ |---|---|---|---|---|---|---|---|---|
exp/asr_train/config.yaml ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_asr_transformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/asr_train_asr_transformer_raw_foc_char
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 60
28
+ patience: 5
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 8
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - /home/pro-c/yewei/espnet/egs2/mini_an4/asr1/exp/asr_train_asr_transformer_raw_can_char/valid.acc.ave_10best.pth
64
+ ignore_init_mismatch: true
65
+ freeze_param: []
66
+ num_iters_per_epoch: null
67
+ batch_size: 16
68
+ att_r2l_infer_weight: 0.5
69
+ rescore_r2l_max: 5
70
+ valid_batch_size: null
71
+ batch_bins: 1000000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/asr_stats_raw_foc_char/train/speech_shape
75
+ - exp/asr_stats_raw_foc_char/train/text_shape.char
76
+ valid_shape_file:
77
+ - exp/asr_stats_raw_foc_char/valid/speech_shape
78
+ - exp/asr_stats_raw_foc_char/valid/text_shape.char
79
+ batch_type: folded
80
+ valid_batch_type: null
81
+ fold_length:
82
+ - 80000
83
+ - 150
84
+ sort_in_batch: descending
85
+ sort_batch: descending
86
+ multiple_iterator: false
87
+ chunk_length: 500
88
+ chunk_shift_ratio: 0.5
89
+ num_cache_chunks: 1024
90
+ chunk_excluded_key_prefixes: []
91
+ train_data_path_and_name_and_type:
92
+ - - dump/raw/train/wav.scp
93
+ - speech
94
+ - sound
95
+ - - dump/raw/train/text
96
+ - text
97
+ - text
98
+ valid_data_path_and_name_and_type:
99
+ - - dump/raw/dev/wav.scp
100
+ - speech
101
+ - sound
102
+ - - dump/raw/dev/text
103
+ - text
104
+ - text
105
+ allow_variable_data_keys: false
106
+ max_cache_size: 0.0
107
+ max_cache_fd: 32
108
+ valid_max_cache_size: null
109
+ exclude_weight_decay: false
110
+ exclude_weight_decay_conf: {}
111
+ optim: adam
112
+ optim_conf:
113
+ lr: 0.005
114
+ scheduler: warmuplr
115
+ scheduler_conf:
116
+ warmup_steps: 30000
117
+ token_list:
118
+ - <blank>
119
+ - <unk>
120
+ - <space>
121
+ - '3'
122
+ - '2'
123
+ - '5'
124
+ - g
125
+ - o
126
+ - a
127
+ - n
128
+ - i
129
+ - '4'
130
+ - u
131
+ - e
132
+ - k
133
+ - '1'
134
+ - j
135
+ - y
136
+ - z
137
+ - s
138
+ - h
139
+ - d
140
+ - m
141
+ - l
142
+ - c
143
+ - b
144
+ - f
145
+ - t
146
+ - w
147
+ - p
148
+ - r
149
+ - x
150
+ - v
151
+ - q
152
+ - <sos/eos>
153
+ init: xavier_uniform
154
+ input_size: null
155
+ ctc_conf:
156
+ dropout_rate: 0.0
157
+ ctc_type: builtin
158
+ reduce: true
159
+ ignore_nan_grad: null
160
+ zero_infinity: true
161
+ joint_net_conf: null
162
+ use_preprocessor: true
163
+ token_type: char
164
+ bpemodel: null
165
+ non_linguistic_symbols: null
166
+ cleaner: null
167
+ g2p: null
168
+ speech_volume_normalize: null
169
+ rir_scp: null
170
+ rir_apply_prob: 1.0
171
+ noise_scp: null
172
+ noise_apply_prob: 1.0
173
+ noise_db_range: '13_15'
174
+ short_noise_thres: 0.5
175
+ aux_ctc_tasks: []
176
+ frontend: default
177
+ frontend_conf:
178
+ fs: 16k
179
+ specaug: null
180
+ specaug_conf: {}
181
+ normalize: global_mvn
182
+ normalize_conf:
183
+ stats_file: exp/asr_stats_raw_foc_char/train/feats_stats.npz
184
+ model: espnet
185
+ model_conf:
186
+ ctc_weight: 0.3
187
+ lsm_weight: 0.1
188
+ att_r2l_weight: 0.5
189
+ length_normalized_loss: false
190
+ preencoder: null
191
+ preencoder_conf: {}
192
+ encoder: transformer
193
+ encoder_conf:
194
+ output_size: 256
195
+ attention_heads: 4
196
+ linear_units: 2048
197
+ num_blocks: 12
198
+ dropout_rate: 0.1
199
+ positional_dropout_rate: 0.1
200
+ attention_dropout_rate: 0.0
201
+ input_layer: conv2d
202
+ normalize_before: true
203
+ postencoder: null
204
+ postencoder_conf: {}
205
+ decoder: transformer
206
+ decoder_conf:
207
+ attention_heads: 4
208
+ linear_units: 2048
209
+ num_blocks: 6
210
+ dropout_rate: 0.1
211
+ positional_dropout_rate: 0.1
212
+ self_attention_dropout_rate: 0.0
213
+ src_attention_dropout_rate: 0.0
214
+ preprocessor: default
215
+ preprocessor_conf: {}
216
+ required:
217
+ - output_dir
218
+ - token_list
219
+ version: '202301'
220
+ distributed: false
exp/asr_train/images/acc.png ADDED
exp/asr_train/images/acc_R2L.png ADDED
exp/asr_train/images/backward_time.png ADDED
exp/asr_train/images/cer.png ADDED
exp/asr_train/images/cer_R2L.png ADDED
exp/asr_train/images/cer_ctc.png ADDED
exp/asr_train/images/forward_time.png ADDED
exp/asr_train/images/gpu_max_cached_mem_GB.png ADDED
exp/asr_train/images/iter_time.png ADDED
exp/asr_train/images/loss.png ADDED
exp/asr_train/images/loss_att.png ADDED
exp/asr_train/images/loss_att_R2L.png ADDED
exp/asr_train/images/loss_ctc.png ADDED
exp/asr_train/images/optim0_lr0.png ADDED
exp/asr_train/images/optim_step_time.png ADDED
exp/asr_train/images/train_time.png ADDED
exp/asr_train/images/wer.png ADDED
exp/asr_train/images/wer_R2L.png ADDED
exp/{asr_train_asr_transformer_raw_foc_char → asr_train}/valid.acc.ave_10best.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:592b170fe66e194555eeefeee626f81a24611a0b58abf87fbcddb06ae68c467a
3
- size 108694309
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7dd8f8f0449dda7ff4c2d26b0db8d2d712e0390dd9cdc47087597d14682e68
3
+ size 146717987
meta.yaml CHANGED
@@ -1,9 +1,9 @@
1
  espnet: '202301'
2
  files:
3
- asr_model_file: exp/asr_train_asr_transformer_raw_foc_char/valid.acc.ave_10best.pth
4
  python: "3.8.16 | packaged by conda-forge | (default, Feb 1 2023, 16:01:55) \n[GCC\
5
  \ 11.3.0]"
6
- timestamp: 1681908736.234121
7
  torch: 1.10.0
8
  yaml_files:
9
- asr_train_config: exp/asr_train_asr_transformer_raw_foc_char/config.yaml
 
1
  espnet: '202301'
2
  files:
3
+ asr_model_file: exp/asr_train/valid.acc.ave_10best.pth
4
  python: "3.8.16 | packaged by conda-forge | (default, Feb 1 2023, 16:01:55) \n[GCC\
5
  \ 11.3.0]"
6
+ timestamp: 1682343910.954183
7
  torch: 1.10.0
8
  yaml_files:
9
+ asr_train_config: exp/asr_train/config.yaml