# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units in the position-wise feed-forward layer
    num_blocks: 18      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d6 # encoder input type; choose from conv2d, conv2d6, or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    macaron_style: True
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'abs_selfattn'
    nonorm: False
    cnn_prev: True
    cnn_after: False
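# Note (assumption, not verified against this codebase): conv2d typically
# subsamples the input by 4x, conv2d6 by 6x and conv2d8 by 8x, so with
# conv2d6 the encoder operates at 1/6 of the input frame rate.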

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 1
    dropout_rate: 0.0
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 1.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
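# The hybrid objective is typically combined as
#   loss = ctc_weight * ctc_loss + (1 - ctc_weight) * attention_loss
# so with ctc_weight: 1.0 the attention branch receives no loss weight and
# training is effectively pure CTC (assumption based on the standard
# hybrid CTC/attention formulation).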

raw_wav: False
data_save: True
use_gc: True

w2v_encoder: True
pretrain: True
random_pretrain: False
wav2vec: True
w2v_coef: 1.0

mpc_didi_ver: False
wav2mpc: False
wav2mpc_reduction: False
mpc_mask_loss: False
mpc_coef: 0.0

mask: True
quantize_targets: True
project_targets: True
latent_vars: 320
w2v_reduct: True
w2v_ext_loss: True
w2v_loss_weights: [0.1,0]
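# Assumption: as in wav2vec 2.0, w2v_loss_weights lists the weights of the
# extra losses added to the contrastive objective (e.g. codebook diversity
# and feature penalty), so [0.1,0] keeps the diversity term at 0.1 and
# disables the second term. Check the loss implementation to confirm.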

w2v_mask_prob: 0.65
mpc_prob: 0.5

remove_valbest: False

model:
  method: 'npc'                                         # Accepts npc/apc/vqapc
  paras:
    kernel_size: 15     # Receptive field size (R) = kernel_size + 2*(n_blocks)
    mask_size: 5     # Desired input mask size (M_in) as described in the NPC paper
    n_blocks: 4                     # Number of ConvBlocks stacked in NPC model
    hidden_size: 512                       # Dimension of feature of all layers
    dropout: 0.1                                         # Dropout in ConvBlock
    residual: True                           # Residual connection in ConvBlock
    batch_norm: True                             # Apply BatchNorm in ConvBlock
    activate: 'relu'                         # Activation function of ConvBlock
    disable_cross_layer: False      # Apply Masked ConvBlock at last layer only
    vq: 
      codebook_size: [64,64,64,64]    # Codebook size of each group in VQ-layer
      code_dim: [128,128,128,128] # Dim of each group summing up to hidden_size
      gumbel_temperature: 1.0       # Temperature of Gumbel Softmax in VQ-layer
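# Worked example from the settings above: receptive field
# R = kernel_size + 2*(n_blocks) = 15 + 2*4 = 23 frames, and the four VQ
# groups with code_dim 128 concatenate to 4*128 = 512 = hidden_size.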

collate_conf:
    spec_aug: false

# specaugmentation related
spec_aug_conf:
    num_time_mask: 2
    num_freq_mask: 2
    max_time_mask: 50
    max_freq_mask: 10
    max_time_warp: 80
    gauss_mask_for_time: False
    warp_for_time: False
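# Note: spec_aug is set to false in collate_conf above, so these parameters
# only take effect if SpecAugment is enabled there.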

# dataset related
dataset_conf:
    max_length: 4500
    min_length: 80
    max_frames_in_batch: 16000
    batch_type: 'dynamic' # static or dynamic
    batch_size: 20
    sort: true
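# Assumption (typical dynamic batching): with batch_type 'dynamic', utterances
# are packed into a batch until the total length reaches max_frames_in_batch
# (16000 frames); batch_size is only used when batch_type is 'static'.
# max_length / min_length filter utterances by frame count before batching.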

grad_clip: 10
accum_grad: 2
max_epoch: 180
log_interval: 100
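# With accum_grad: 2, gradients are accumulated over two batches before each
# optimizer step (roughly doubling the effective batch size); grad_clip caps
# the global gradient norm at 10.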

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 10000
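# Assumption (common WarmupLR implementation): the warmuplr scheduler scales
# the learning rate as
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# i.e. a linear ramp that peaks at the configured lr (0.001) at
# step = warmup_steps (10000), followed by inverse square-root decay.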