# Training configuration: model architecture (encoder, joint network,
# predictor, decoder), loss weighting, data pipeline, and optimization.
#
# NOTE(review): the nesting below is reconstructed. Each `<name>_conf` key is
# assumed to own the run of keys that follows it, up to the next section
# header. Keys and values are unchanged. Confirm the grouping (in particular
# inside `dataset_conf`) against the code that consumes this file.

# Encoder type and its settings.
encoder: conformer
encoder_conf:
    output_size: 256
    attention_heads: 4
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: true
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# Joint network settings.
joint_conf:
    join_dim: 512
    prejoin_linear: True
    postjoin_linear: false
    joint_mode: 'add'
    activation: 'tanh'

# Predictor type and its settings.
predictor: rnn
predictor_conf:
    embed_size: 256
    output_size: 256
    embed_dropout: 0.1
    hidden_size: 256
    num_layers: 2
    bias: true
    rnn_type: 'lstm'
    dropout: 0.1

# Decoder type and its settings.
decoder: bitransformer
decoder_conf:
    attention_heads: 4
    dropout_rate: 0.1
    linear_units: 2048
    num_blocks: 3
    positional_dropout_rate: 0.1
    r_num_blocks: 3
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

# Loss weighting. transducer_weight + ctc_weight + attention_weight = 1.0.
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
    attention_weight: 0.15
    lsm_weight: 0.1
    length_normalized_loss: false
    reverse_weight: 0.3

# Data pipeline: filtering, resampling, feature extraction, augmentation,
# shuffling, sorting, and batching.
# NOTE(review): units of the length/frame values are not stated in this file;
# verify against the dataset code before changing them.
dataset_conf:
    filter_conf:
        max_length: 1650
        min_length: 10
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500
    batch_conf:
        batch_type: 'dynamic'
        max_frames_in_batch: 4000

# Training loop settings.
grad_clip: 4
accum_grad: 1
max_epoch: 140
log_interval: 100

# Optimizer and learning-rate scheduler.
optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000