File size: 2,154 Bytes
f30f1c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Input preprocessing: selects the "s2t" preprocessor and passes it the
# options below.
preprocessor: s2t
preprocessor_conf:
    # Names of the auxiliary text fields.
    text_prev_name: text_prev
    text_ctc_name: text_ctc

    # Audio settings.
    # NOTE(review): units are not stated in this file; fs is presumably in Hz
    # and the speech_* values presumably in seconds -- confirm against the
    # preprocessor implementation.
    fs: 16000
    speech_length: 30
    speech_resolution: 0.02
    speech_init_silence: 30

    # Both "apply" probabilities are 0.0 in this configuration.
    text_prev_apply_prob: 0.0
    time_apply_prob: 0.0

    # Special symbols. na_symbol is the same string as model_conf.sym_na, and
    # last_time_symbol ("<30.00>") lines up with speech_length (30); keep
    # these pairs in sync when editing either side.
    na_symbol: "<na>"
    notime_symbol: "<notimestamps>"
    first_time_symbol: "<0.00>"
    last_time_symbol: "<30.00>"

# Options passed to the frontend.
# NOTE(review): presumably STFT sizes in samples; with fs: 16000 that would be
# a 25 ms window (400) and a 10 ms hop (160) -- confirm against the frontend
# implementation.
frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160

# Spectrogram augmentation: time warping is disabled, frequency masking and
# time masking are enabled.
specaug: specaug
specaug_conf:
    # Time warp (off). The window/mode values are kept as-is; presumably they
    # are ignored while apply_time_warp is false -- confirm.
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic

    # Frequency masking: 2 masks, width range [0, 27].
    apply_freq_mask: true
    num_freq_mask: 2
    freq_mask_width_range: [0, 27]

    # Time masking: 5 masks; the range is expressed as a ratio (per the key
    # name), here [0.0, 0.05].
    apply_time_mask: true
    num_time_mask: 5
    time_mask_width_ratio_range: [0.0, 0.05]

# Feature normalization: "global_mvn" configured with a statistics file.
# NOTE(review): stats_file is an absolute path under /espnet, so it must exist
# at exactly this location on the machine that runs this config -- adjust if
# the environment differs.
normalize: global_mvn
normalize_conf:
    stats_file: /espnet/egs2/owsm_v1/s2t1/exp/s2t_stats_raw_bpe20000/train/feats_stats.npz

# Encoder: "transformer" with the options below.
encoder: transformer
encoder_conf:
    # Architecture
    input_layer: conv2d2        # input layer type
    num_blocks: 12              # encoder block count
    output_size: 768            # attention dimension
    attention_heads: 12
    linear_units: 3072          # position-wise feed-forward units
    normalize_before: true

    # Regularization: every dropout rate is 0.1.
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1

# Decoder. Head count, feed-forward width and block count are the same values
# used in encoder_conf.
# NOTE(review): "adptransformer" is presumably a project-registered decoder
# variant -- confirm it is available in the codebase running this config.
decoder: adptransformer
decoder_conf:
    # Architecture
    num_blocks: 12
    attention_heads: 12
    linear_units: 3072

    # Regularization: every dropout rate is 0.1.
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

# Model-level options.
# NOTE(review): ctc_weight / lsm_weight are presumably the CTC loss weight and
# the label-smoothing weight -- confirm against the model implementation.
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
    # Same string as preprocessor_conf.na_symbol; keep the two in sync.
    sym_na: "<na>"

# Optimizer: "adamw" with weight decay set to 0.0.
optim: adamw
optim_conf:
    lr: 0.00055
    betas: [0.9, 0.98]
    eps: 1.0e-06
    weight_decay: 0.0

# Learning-rate scheduler: "warmuplr" with 10000 warmup steps.
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000

# Training-loop settings.
# Hardware note kept from the original config: 4 GPU/node x 8 nodes = 32 A100.

# Batching
batch_type: unsorted
batch_size: 5
accum_grad: 4
num_workers: 4
use_amp: true

# Schedule: max_epoch x num_iters_per_epoch = 400000 iterations in total
# (presumably an upper bound -- confirm how the trainer counts iterations).
max_epoch: 10
num_iters_per_epoch: 40000

# NOTE(review): plain `none` is a YAML string, not a null; presumably the
# consuming tool converts it to "no value" -- confirm before changing it.
patience: none
init: none

# Checkpoint selection: one criterion, the (valid, acc, max) triple; one best
# model is kept.
best_model_criterion:
- [valid, acc, max]
keep_nbest_models: 1

# Parameter initialization: a single checkpoint path is listed.
# NOTE(review): the path is absolute under /espnet and must exist on the
# machine that runs this config. With ignore_init_mismatch set to false,
# mismatches between checkpoint and model are presumably treated as errors
# -- confirm against the loader.
init_param:
- /espnet/egs2/owsm_v1/s2t1/exp/s2t_train_raw_bpe20000/valid.acc.ave.pth
ignore_init_mismatch: false