# wat_owsm_v1/conf/train_cgn.yaml
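# S2T preprocessor: 30 s speech segments at 16 kHz; previous-text conditioning
# and timestamp tokens are disabled here (both apply probabilities are 0.0).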
preprocessor: s2t
preprocessor_conf:
    text_prev_name: text_prev
    text_ctc_name: text_ctc
    fs: 16000
    na_symbol: "<na>"
    speech_length: 30
    speech_resolution: 0.02
    speech_init_silence: 30
    text_prev_apply_prob: 0.0
    time_apply_prob: 0.0
    notime_symbol: "<notimestamps>"
    first_time_symbol: "<0.00>"
    last_time_symbol: "<30.00>"
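# STFT frontend: 512-point FFT with a 400-sample (25 ms) window and a
# 160-sample (10 ms) hop at 16 kHz.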
frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160
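# SpecAugment: frequency and time masking only; time warping is disabled.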
specaug: specaug
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 5
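# Global mean-variance normalization using precomputed feature statistics.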
normalize: global_mvn
normalize_conf:
    stats_file: /espnet/egs2/owsm_v1/s2t1/exp/s2t_stats_raw_bpe20000/train/feats_stats.npz
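# Transformer encoder: 12 blocks, 768-dim attention with 12 heads.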
encoder: transformer
encoder_conf:
    output_size: 768 # attention dimension
    attention_heads: 12
    linear_units: 3072 # number of units in the position-wise feed-forward layer
    num_blocks: 12 # number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d2 # input subsampling layer (Conv2d, 2x downsampling)
    normalize_before: true
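# Decoder: "adptransformer" is not a stock ESPnet2 decoder name; presumably a
# project-specific (e.g. adapter-based) Transformer decoder registered in this repo.
# Its dimensions mirror the encoder (12 blocks, 12 heads, 3072 feed-forward units).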
decoder: adptransformer
decoder_conf:
    attention_heads: 12
    linear_units: 3072
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
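# Hybrid CTC/attention training: CTC weight 0.3, label smoothing 0.1.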
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
    sym_na: "<na>"
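# AdamW optimizer with a warmup learning-rate schedule (peak lr 5.5e-4, 10k warmup steps).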
optim: adamw
optim_conf:
    lr: 0.00055
    betas:
    - 0.9
    - 0.98
    eps: 1.0e-06
    weight_decay: 0.0
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000
# 4 GPUs/node x 8 nodes = 32 A100 GPUs
batch_type: unsorted
batch_size: 5
accum_grad: 4
num_iters_per_epoch: 40000
max_epoch: 10
patience: none
init: none
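# Keep only the single best checkpoint, selected by validation accuracy.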
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 1
use_amp: true
num_workers: 4
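# Initialize weights from a pretrained OWSM v1 S2T checkpoint (model-averaged by validation accuracy).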
init_param:
- /espnet/egs2/owsm_v1/s2t1/exp/s2t_train_raw_bpe20000/valid.acc.ave.pth
ignore_init_mismatch: false