File size: 1,738 Bytes
5def8da 56efb41 ce7818c 5def8da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# Text featurizer. `!new:` instantiates uetasr.featurizers.text.Subword with
# the nested mapping as its arguments, so every option must be indented under
# this key; `!ref <text_encoder.vocab_size>` elsewhere in this file relies on
# that nesting.
# NOTE(review): the option names mirror SentencePiece trainer flags and are
# presumably forwarded to it — confirm against the Subword implementation.
text_encoder: !new:uetasr.featurizers.text.Subword
  model_prefix: vocabs/subword_vietnamese_500
  data_path: transcript_v3.1.txt
  character_coverage: 1.0
  model_type: bpe  # one of: word, bpe, unigram, char
  num_threads: 16
  # Special-token ids and their surface forms.
  unk_id: 1
  pad_id: 0
  eos_id: -1  # NOTE(review): -1 presumably disables the EOS piece — confirm
  unk_piece: <unk>
  pad_piece: <blank>
  eos_piece: </s>
  # Read by decoder_model.vocab_size and jointer_model.output_dim via !ref.
  vocab_size: 500
# Audio featurizer. The mapping nested under the `!new:` tag supplies the
# arguments for uetasr.featurizers.audio.LogMelSpectrogram, so the options
# must be indented beneath this key rather than left at the top level.
audio_encoder: !new:uetasr.featurizers.audio.LogMelSpectrogram
  fs: 16000
  n_fft: 512
  # NOTE(review): if these are in samples, 400 / 160 at fs=16000 correspond to
  # a 25 ms window and a 10 ms hop — confirm the unit.
  win_length: 400
  hop_length: 160
  n_mels: 80
  fmin: 0
  fmax: 8000  # equals fs / 2
  htk: False
# Shared top-level value. It is read through `!ref <d_model>` by
# encoder_model (d_model), decoder_model (hidden_dim) and jointer_model
# (encoder_dim, decoder_dim), so changing it here changes all of them.
d_model: 256
# Acoustic encoder. The options are nested under the `!new:` tag so they are
# passed to uetasr.models.encoders.Conformer; in particular the inner
# `d_model` must live inside this mapping so that `!ref <d_model>` resolves to
# the top-level `d_model` key instead of shadowing it.
encoder_model: !new:uetasr.models.encoders.Conformer
  # NOTE(review): presumably has to equal audio_encoder.n_mels (80) — confirm.
  num_features: 80
  window_size: 1
  d_model: !ref <d_model>
  input_layer: vgg2l
  # Positional encoding and self-attention.
  pos_enc_layer_type: rel_pos
  dropout_rate_pos_enc: 0.2
  selfattention_layer_type: rel_selfattn
  attention_heads: 4
  dropout_rate_att: 0.1
  # Position-wise feed-forward part.
  dropout_rate_pos_wise: 0.1
  dropout_rate: 0.1
  positionwise_layer_type: linear
  linear_units: 1024
  # Convolution module and block stack.
  conv_mod_kernel: 31
  num_blocks: 18
  use_macaron: True
  use_cnn_module: True
  eps_layer_norm: 0.000000000001  # i.e. 1.0e-12
# Label decoder. The options are nested under the `!new:` tag so they are
# passed to uetasr.models.decoders.RNNDecoder and do not collide with the
# same-named keys (`vocab_size`, `hidden_dim`) used by other sections.
decoder_model: !new:uetasr.models.decoders.RNNDecoder
  # Kept in sync with the tokenizer through the reference.
  vocab_size: !ref <text_encoder.vocab_size>
  embedding_dim: 256
  num_layers: 1
  # Takes the shared top-level d_model value.
  hidden_dim: !ref <d_model>
  dropout_embed: 0.2
  dropout_rnn: 0.1
  rnn_type: LSTM
# Joint network. The options are nested under the `!new:` tag so they are
# passed to uetasr.layers.jointer.RNNTJointer; its `hidden_dim` is a separate
# setting from decoder_model.hidden_dim and must not share a scope with it.
jointer_model: !new:uetasr.layers.jointer.RNNTJointer
  # Both input widths take the shared top-level d_model value; decoder_model
  # sets its hidden_dim from the same reference.
  encoder_dim: !ref <d_model>
  decoder_dim: !ref <d_model>
  hidden_dim: 512
  # Kept in sync with the tokenizer through the reference.
  output_dim: !ref <text_encoder.vocab_size>
# Top-level value handed to the `model` entry through `!ref <ctc_lin>`.
# NOTE(review): null presumably means no CTC projection layer is attached —
# confirm against uetasr.models.rnnt.RNNT.
ctc_lin: null
# Full transducer model, assembled from the components defined in this file.
# The options are nested under the `!new:` tag so they are passed to
# uetasr.models.rnnt.RNNT; the inner `ctc_lin` must live inside this mapping
# so that `!ref <ctc_lin>` resolves to the top-level `ctc_lin` key.
# NOTE(review): indentation was lost in the copy this was restored from;
# ctc_dropout and use_cmvn are assumed to be RNNT arguments because they
# follow the other arguments directly — confirm against the RNNT signature.
model: !new:uetasr.models.rnnt.RNNT
  encoder: !ref <encoder_model>
  decoder: !ref <decoder_model>
  jointer: !ref <jointer_model>
  ctc_lin: !ref <ctc_lin>
  ctc_dropout: 0.1
  use_cmvn: True
|