Initial Upload
Browse files- README.md +24 -0
- config.json +4 -0
- hyperparams.yaml +166 -0
- model.ckpt +3 -0
- normalizer.ckpt +3 -0
- tokenizer.ckpt +3 -0
README.md
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: "fi"
|
3 |
+
thumbnail:
|
4 |
+
tags:
|
5 |
+
- automatic-speech-recognition
|
6 |
+
- Attention
|
7 |
+
- pytorch
|
8 |
+
- speechbrain
|
9 |
+
|
10 |
+
metrics:
|
11 |
+
- wer
|
12 |
+
- cer
|
13 |
+
---
|
14 |
+
|
15 |
+
# Description
|
16 |
+
Attention-based encoder-decoder model trained on Puhelahjat (1500 h of colloquial Finnish donated by a large number of volunteers) and the Finnish Parliament ASR Corpus (3000 h of speech from the sessions of the Finnish Parliament).
|
17 |
+
The encoder is a CRDNN (Conv+LSTM+DNN); the decoder is a GRU.
|
18 |
+
|
19 |
+
# Performance expectations
|
20 |
+
This is a relatively fast and compact model (~40M parameters); its performance is not state-of-the-art.
|
21 |
+
This model does not include a language model; it is fully end-to-end.
|
22 |
+
|
23 |
+
This model should generalize to many types of speech. However, the model will also try to match colloquial speech (unlike some models which have learned to follow the written forms of Finnish).
|
24 |
+
|
config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"speechbrain_interface": "EncoderDecoderASR"
|
3 |
+
}
|
4 |
+
|
hyperparams.yaml
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ############################################################################
|
2 |
+
# Model: E2E ASR with an attention-based encoder-decoder
|
3 |
+
# Training data: All Finnish
|
4 |
+
# Encoder: CRDNN
|
5 |
+
# Decoder: GRU + beam search
|
6 |
+
# Authors: Aku Rouhe 2022
|
7 |
+
# ############################################################################
|
8 |
+
|
9 |
+
num_units: 5000
|
10 |
+
tokenizer: !new:sentencepiece.SentencePieceProcessor
|
11 |
+
|
12 |
+
# Feature parameters
|
13 |
+
sample_rate: 16000
|
14 |
+
n_fft: 400
|
15 |
+
n_mels: 40
|
16 |
+
|
17 |
+
# Model parameters
|
18 |
+
activation: !name:torch.nn.LeakyReLU
|
19 |
+
dropout: 0.15
|
20 |
+
cnn_blocks: 2
|
21 |
+
cnn_channels: (64, 128)
|
22 |
+
inter_layer_pooling_size: (2, 2)
|
23 |
+
cnn_kernelsize: (3, 3)
|
24 |
+
time_pooling_size: 4
|
25 |
+
rnn_class: !name:speechbrain.nnet.RNN.LSTM
|
26 |
+
rnn_layers: 3
|
27 |
+
rnn_neurons: 512
|
28 |
+
rnn_bidirectional: True
|
29 |
+
dnn_blocks: 1
|
30 |
+
dnn_neurons: 512
|
31 |
+
emb_size: 128
|
32 |
+
dec_neurons: 1024
|
33 |
+
dec_layers: 1
|
34 |
+
output_neurons: !ref <num_units>
|
35 |
+
|
36 |
+
unk_index: 1
|
37 |
+
blank_index: 0
|
38 |
+
pad_index: 0
|
39 |
+
bos_index: 1
|
40 |
+
eos_index: 2
|
41 |
+
|
42 |
+
min_decode_ratio: 0.0
|
43 |
+
max_decode_ratio: 1.0
|
44 |
+
valid_beam_size: 4
|
45 |
+
test_beam_size: 8
|
46 |
+
eos_threshold: 1.2
|
47 |
+
using_max_attn_shift: False
|
48 |
+
max_attn_shift: 240
|
49 |
+
ctc_weight_decode: 0.0
|
50 |
+
coverage_penalty: 3.0
|
51 |
+
temperature: 1.5
|
52 |
+
|
53 |
+
# Feature extraction
|
54 |
+
compute_features: !new:speechbrain.lobes.features.Fbank
|
55 |
+
sample_rate: !ref <sample_rate>
|
56 |
+
n_fft: !ref <n_fft>
|
57 |
+
n_mels: !ref <n_mels>
|
58 |
+
|
59 |
+
# Feature normalization (mean and std)
|
60 |
+
normalize: !new:speechbrain.processing.features.InputNormalization
|
61 |
+
norm_type: global
|
62 |
+
update_until_epoch: -1
|
63 |
+
|
64 |
+
# The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs.
|
65 |
+
encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
|
66 |
+
input_shape: [null, null, !ref <n_mels>]
|
67 |
+
activation: !ref <activation>
|
68 |
+
dropout: !ref <dropout>
|
69 |
+
cnn_blocks: !ref <cnn_blocks>
|
70 |
+
cnn_channels: !ref <cnn_channels>
|
71 |
+
cnn_kernelsize: !ref <cnn_kernelsize>
|
72 |
+
inter_layer_pooling_size: !ref <inter_layer_pooling_size>
|
73 |
+
time_pooling: True
|
74 |
+
using_2d_pooling: False
|
75 |
+
time_pooling_size: !ref <time_pooling_size>
|
76 |
+
rnn_class: !ref <rnn_class>
|
77 |
+
rnn_layers: !ref <rnn_layers>
|
78 |
+
rnn_neurons: !ref <rnn_neurons>
|
79 |
+
rnn_bidirectional: !ref <rnn_bidirectional>
|
80 |
+
rnn_re_init: True
|
81 |
+
dnn_blocks: !ref <dnn_blocks>
|
82 |
+
dnn_neurons: !ref <dnn_neurons>
|
83 |
+
use_rnnp: False
|
84 |
+
|
85 |
+
# Embedding (from indexes to an embedding space of dimension emb_size).
|
86 |
+
embedding: !new:speechbrain.nnet.embedding.Embedding
|
87 |
+
num_embeddings: !ref <output_neurons>
|
88 |
+
embedding_dim: !ref <emb_size>
|
89 |
+
|
90 |
+
# Attention-based RNN decoder.
|
91 |
+
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
|
92 |
+
enc_dim: !ref <dnn_neurons>
|
93 |
+
input_size: !ref <emb_size>
|
94 |
+
rnn_type: gru
|
95 |
+
attn_type: location
|
96 |
+
hidden_size: !ref <dec_neurons>
|
97 |
+
attn_dim: 2048
|
98 |
+
num_layers: !ref <dec_layers>
|
99 |
+
scaling: 1.0
|
100 |
+
channels: 10
|
101 |
+
kernel_size: 100
|
102 |
+
re_init: True
|
103 |
+
dropout: !ref <dropout>
|
104 |
+
|
105 |
+
# Linear transformation on top of the encoder.
|
106 |
+
ctc_lin: !new:speechbrain.nnet.linear.Linear
|
107 |
+
input_size: !ref <dnn_neurons>
|
108 |
+
n_neurons: !ref <output_neurons>
|
109 |
+
|
110 |
+
# Linear transformation on top of the decoder.
|
111 |
+
seq_lin: !new:speechbrain.nnet.linear.Linear
|
112 |
+
input_size: !ref <dec_neurons>
|
113 |
+
n_neurons: !ref <output_neurons>
|
114 |
+
|
115 |
+
# Final softmax (for log posteriors computation).
|
116 |
+
log_softmax: !new:speechbrain.nnet.activations.Softmax
|
117 |
+
apply_log: True
|
118 |
+
|
119 |
+
# Cost definition for the CTC part.
|
120 |
+
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
|
121 |
+
blank_index: !ref <blank_index>
|
122 |
+
|
123 |
+
full_encode_step: !new:speechbrain.nnet.containers.LengthsCapableSequential
|
124 |
+
input_shape: [null, null, !ref <n_mels>]
|
125 |
+
compute_features: !ref <compute_features>
|
126 |
+
normalize: !ref <normalize>
|
127 |
+
model: !ref <encoder>
|
128 |
+
|
129 |
+
# Gathering all the submodels in a single model object.
|
130 |
+
model: !new:torch.nn.ModuleList
|
131 |
+
- - !ref <encoder>
|
132 |
+
- !ref <embedding>
|
133 |
+
- !ref <decoder>
|
134 |
+
- !ref <ctc_lin>
|
135 |
+
- !ref <seq_lin>
|
136 |
+
|
137 |
+
test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
|
138 |
+
embedding: !ref <embedding>
|
139 |
+
decoder: !ref <decoder>
|
140 |
+
linear: !ref <seq_lin>
|
141 |
+
ctc_linear: !ref <ctc_lin>
|
142 |
+
bos_index: !ref <bos_index>
|
143 |
+
eos_index: !ref <eos_index>
|
144 |
+
blank_index: !ref <blank_index>
|
145 |
+
min_decode_ratio: !ref <min_decode_ratio>
|
146 |
+
max_decode_ratio: !ref <max_decode_ratio>
|
147 |
+
beam_size: !ref <test_beam_size>
|
148 |
+
eos_threshold: !ref <eos_threshold>
|
149 |
+
using_max_attn_shift: !ref <using_max_attn_shift>
|
150 |
+
max_attn_shift: !ref <max_attn_shift>
|
151 |
+
coverage_penalty: !ref <coverage_penalty>
|
152 |
+
ctc_weight: !ref <ctc_weight_decode>
|
153 |
+
temperature: !ref <temperature>
|
154 |
+
|
155 |
+
# Objects in "modules" dict will have their parameters moved to the correct
|
156 |
+
# device, as well as having train()/eval() called on them by the Brain class
|
157 |
+
modules:
|
158 |
+
encoder: !ref <full_encode_step>
|
159 |
+
decoder: !ref <test_search>
|
160 |
+
|
161 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
162 |
+
loadables:
|
163 |
+
model: !ref <model>
|
164 |
+
normalizer: !ref <normalize>
|
165 |
+
tokenizer: !ref <tokenizer>
|
166 |
+
|
model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d364b7506f4e74899a620caa2dbff8152d52b7ffcd1c20f99a29cde95b61637
|
3 |
+
size 185195486
|
normalizer.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db3da32bfaf668885250821f17c09091c05593ad87087ddfb88d63dc74abd735
|
3 |
+
size 1383
|
tokenizer.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd3d0616da87d358b9e2b3c17e4e4067ceaa8f11c1d29d499ba69f1f517fc06d
|
3 |
+
size 319229
|