Dallyana committed on
Commit
3c576c1
1 Parent(s): 7200459

Upload 9 files

Browse files
README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - es
5
+ library_name: espnet
6
+ tags:
7
+ - automatic-speech-recognition
8
+ - speech
9
+ - espnet
10
+ - spanish
11
+ ---
12
+
13
+ # reazonspeech-espnet-v1
14
+
15
+ `reazonspeech-espnet-v1` es un modelo de reconocimiento automático del habla (ASR) para el español ecuatoriano, entrenado con ESPnet2. Su objetivo es reconocer el habla de las distintas regiones y acentos del Ecuador; para ello se entrenó con un corpus propio y con el corpus de Common Voice. El modelo usa una arquitectura Transformer con tokenización por subpalabras (BPE). El modelo alcanza un WER de X% y un MOS de Y en el conjunto de datos de prueba. Para más detalles sobre el modelo, puedes consultar este artículo.
16
+
17
+
18
+
data/es_token_list/bpe_unigram64/bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ebfb4bda4d1ca304ed4cb00bb4fc00c861634ea02f912bb79f8431443c33e4f
3
+ size 238394
data/es_token_list/bpe_unigram64/tokens.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <blank>
2
+ <unk>
3
+ ▁
4
+ a
5
+ o
6
+ i
7
+ s
8
+ r
9
+ e
10
+ n
11
+ c
12
+ u
13
+ l
14
+ m
15
+ b
16
+ g
17
+ t
18
+ ▁de
19
+ ▁a
20
+ en
21
+ do
22
+ er
23
+ ▁p
24
+ ra
25
+ ta
26
+ te
27
+ h
28
+ ▁que
29
+ p
30
+ ▁la
31
+ ▁el
32
+ ▁es
33
+ to
34
+ d
35
+ da
36
+ es
37
+ ▁no
38
+ os
39
+ y
40
+ ▁y
41
+ ▁ma
42
+ ▁un
43
+ ▁se
44
+ ▁en
45
+ la
46
+ f
47
+ z
48
+ ñ
49
+ 0
50
+ 4
51
+ 3
52
+ w
53
+ 6
54
+ 8
55
+ 9
56
+ 2
57
+ 1
58
+ x
59
+ j
60
+ q
61
+ 5
62
+ 7
63
+ v
64
+ <sos/eos>
exp/asr_stats_raw_es_bpe64_sp/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92894c3672055bd42b228eafdcca0cc989649de0d4fdbe3f4e163d82c22b70f1
3
+ size 1402
exp/asr_train_asr_transformer_raw_es_bpe64_sp/config.yaml ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_asr_transformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/asr_train_asr_transformer_raw_es_bpe64_sp
9
+ ngpu: 0
10
+ seed: 0
11
+ num_workers: 1
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: null
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 20
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - acc
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: 5.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: null
71
+ batch_size: 16
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/asr_stats_raw_es_bpe64_sp/train/speech_shape
77
+ - exp/asr_stats_raw_es_bpe64_sp/train/text_shape.bpe
78
+ valid_shape_file:
79
+ - exp/asr_stats_raw_es_bpe64_sp/valid/speech_shape
80
+ - exp/asr_stats_raw_es_bpe64_sp/valid/text_shape.bpe
81
+ batch_type: folded
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 80000
85
+ - 150
86
+ sort_in_batch: descending
87
+ shuffle_within_batch: false
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ chunk_default_fs: null
95
+ train_data_path_and_name_and_type:
96
+ - - dump/raw/train_nodev_sp/wav.scp
97
+ - speech
98
+ - sound
99
+ - - dump/raw/train_nodev_sp/text
100
+ - text
101
+ - text
102
+ valid_data_path_and_name_and_type:
103
+ - - dump/raw/train_dev/wav.scp
104
+ - speech
105
+ - sound
106
+ - - dump/raw/train_dev/text
107
+ - text
108
+ - text
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ allow_multi_rates: false
113
+ valid_max_cache_size: null
114
+ exclude_weight_decay: false
115
+ exclude_weight_decay_conf: {}
116
+ optim: adam
117
+ optim_conf:
118
+ lr: 0.001
119
+ scheduler: warmuplr
120
+ scheduler_conf:
121
+ warmup_steps: 2500
122
+ token_list:
123
+ - <blank>
124
+ - <unk>
125
+ - ▁
126
+ - a
127
+ - o
128
+ - i
129
+ - s
130
+ - r
131
+ - e
132
+ - n
133
+ - c
134
+ - u
135
+ - l
136
+ - m
137
+ - b
138
+ - g
139
+ - t
140
+ - ▁de
141
+ - ▁a
142
+ - en
143
+ - do
144
+ - er
145
+ - ▁p
146
+ - ra
147
+ - ta
148
+ - te
149
+ - h
150
+ - ▁que
151
+ - p
152
+ - ▁la
153
+ - ▁el
154
+ - ▁es
155
+ - to
156
+ - d
157
+ - da
158
+ - es
159
+ - ▁no
160
+ - os
161
+ - y
162
+ - ▁y
163
+ - ▁ma
164
+ - ▁un
165
+ - ▁se
166
+ - ▁en
167
+ - la
168
+ - f
169
+ - z
170
+ - ñ
171
+ - '0'
172
+ - '4'
173
+ - '3'
174
+ - w
175
+ - '6'
176
+ - '8'
177
+ - '9'
178
+ - '2'
179
+ - '1'
180
+ - x
181
+ - j
182
+ - q
183
+ - '5'
184
+ - '7'
185
+ - v
186
+ - <sos/eos>
187
+ init: xavier_uniform
188
+ input_size: null
189
+ ctc_conf:
190
+ dropout_rate: 0.0
191
+ ctc_type: builtin
192
+ reduce: true
193
+ ignore_nan_grad: null
194
+ zero_infinity: true
195
+ brctc_risk_strategy: exp
196
+ brctc_group_strategy: end
197
+ brctc_risk_factor: 0.0
198
+ joint_net_conf: null
199
+ use_preprocessor: true
200
+ use_lang_prompt: false
201
+ use_nlp_prompt: false
202
+ token_type: bpe
203
+ bpemodel: data/es_token_list/bpe_unigram64/bpe.model
204
+ non_linguistic_symbols: null
205
+ cleaner: null
206
+ g2p: null
207
+ speech_volume_normalize: null
208
+ rir_scp: null
209
+ rir_apply_prob: 1.0
210
+ noise_scp: null
211
+ noise_apply_prob: 1.0
212
+ noise_db_range: '13_15'
213
+ short_noise_thres: 0.5
214
+ aux_ctc_tasks: []
215
+ frontend: default
216
+ frontend_conf:
217
+ fs: 16k
218
+ specaug: null
219
+ specaug_conf: {}
220
+ normalize: global_mvn
221
+ normalize_conf:
222
+ stats_file: exp/asr_stats_raw_es_bpe64_sp/train/feats_stats.npz
223
+ model: espnet
224
+ model_conf:
225
+ ctc_weight: 0.3
226
+ lsm_weight: 0.1
227
+ length_normalized_loss: false
228
+ preencoder: null
229
+ preencoder_conf: {}
230
+ encoder: transformer
231
+ encoder_conf:
232
+ output_size: 256
233
+ attention_heads: 4
234
+ linear_units: 2048
235
+ num_blocks: 12
236
+ dropout_rate: 0.1
237
+ positional_dropout_rate: 0.1
238
+ attention_dropout_rate: 0.0
239
+ input_layer: conv2d
240
+ normalize_before: true
241
+ postencoder: null
242
+ postencoder_conf: {}
243
+ decoder: transformer
244
+ decoder_conf:
245
+ attention_heads: 4
246
+ linear_units: 2048
247
+ num_blocks: 6
248
+ dropout_rate: 0.1
249
+ positional_dropout_rate: 0.1
250
+ self_attention_dropout_rate: 0.0
251
+ src_attention_dropout_rate: 0.0
252
+ preprocessor: default
253
+ preprocessor_conf: {}
254
+ required:
255
+ - output_dir
256
+ - token_list
257
+ version: '202402'
258
+ distributed: false
exp/asr_train_asr_transformer_raw_es_bpe64_sp/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ea407c6f54f0f1b6e22906dd350afb4f6197e525033a33b35045595dcb2288
3
+ size 108783461
exp/lm_train_lm_es_bpe64/20epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ed654cc6a9a7a28311d10605d10e3575f6b1a4928ccdc598e958d915241e89e
3
+ size 27417796
exp/lm_train_lm_es_bpe64/config.yaml ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_lm.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/lm_train_lm_es_bpe64
9
+ ngpu: 0
10
+ seed: 0
11
+ num_workers: 1
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: null
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 20
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - loss
41
+ - min
42
+ keep_nbest_models: 1
43
+ nbest_averaging_interval: 0
44
+ grad_clip: 5.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: null
71
+ batch_size: 64
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/lm_stats_es_bpe64/train/text_shape.bpe
77
+ valid_shape_file:
78
+ - exp/lm_stats_es_bpe64/valid/text_shape.bpe
79
+ batch_type: folded
80
+ valid_batch_type: null
81
+ fold_length:
82
+ - 150
83
+ sort_in_batch: descending
84
+ shuffle_within_batch: false
85
+ sort_batch: descending
86
+ multiple_iterator: false
87
+ chunk_length: 500
88
+ chunk_shift_ratio: 0.5
89
+ num_cache_chunks: 1024
90
+ chunk_excluded_key_prefixes: []
91
+ chunk_default_fs: null
92
+ train_data_path_and_name_and_type:
93
+ - - dump/raw/lm_train.txt
94
+ - text
95
+ - text
96
+ valid_data_path_and_name_and_type:
97
+ - - dump/raw/org/train_dev/text
98
+ - text
99
+ - text
100
+ allow_variable_data_keys: false
101
+ max_cache_size: 0.0
102
+ max_cache_fd: 32
103
+ allow_multi_rates: false
104
+ valid_max_cache_size: null
105
+ exclude_weight_decay: false
106
+ exclude_weight_decay_conf: {}
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.1
110
+ scheduler: null
111
+ scheduler_conf: {}
112
+ token_list:
113
+ - <blank>
114
+ - <unk>
115
+ - ▁
116
+ - a
117
+ - o
118
+ - i
119
+ - s
120
+ - r
121
+ - e
122
+ - n
123
+ - c
124
+ - u
125
+ - l
126
+ - m
127
+ - b
128
+ - g
129
+ - t
130
+ - ▁de
131
+ - ▁a
132
+ - en
133
+ - do
134
+ - er
135
+ - ▁p
136
+ - ra
137
+ - ta
138
+ - te
139
+ - h
140
+ - ▁que
141
+ - p
142
+ - ▁la
143
+ - ▁el
144
+ - ▁es
145
+ - to
146
+ - d
147
+ - da
148
+ - es
149
+ - ▁no
150
+ - os
151
+ - y
152
+ - ▁y
153
+ - ▁ma
154
+ - ▁un
155
+ - ▁se
156
+ - ▁en
157
+ - la
158
+ - f
159
+ - z
160
+ - ñ
161
+ - '0'
162
+ - '4'
163
+ - '3'
164
+ - w
165
+ - '6'
166
+ - '8'
167
+ - '9'
168
+ - '2'
169
+ - '1'
170
+ - x
171
+ - j
172
+ - q
173
+ - '5'
174
+ - '7'
175
+ - v
176
+ - <sos/eos>
177
+ init: null
178
+ use_preprocessor: true
179
+ token_type: bpe
180
+ bpemodel: data/es_token_list/bpe_unigram64/bpe.model
181
+ non_linguistic_symbols: null
182
+ cleaner: null
183
+ g2p: null
184
+ lm: seq_rnn
185
+ lm_conf:
186
+ unit: 650
187
+ nlayers: 2
188
+ model: lm
189
+ model_conf: {}
190
+ required:
191
+ - output_dir
192
+ - token_list
193
+ version: '202402'
194
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ asr_model_file: exp/asr_train_asr_transformer_raw_es_bpe64_sp/valid.acc.ave_10best.pth
4
+ lm_file: exp/lm_train_lm_es_bpe64/20epoch.pth
5
+ python: "3.8"
6
+ pytorch: 1.12.1
7
+ yaml_files:
8
+ asr_train_config: exp/asr_train_asr_transformer_raw_es_bpe64_sp/config.yaml
9
+ lm_train_config: exp/lm_train_lm_es_bpe64/config.yaml