# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # number of units in the position-wise feed-forward layer
    num_blocks: 18      # number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d6 # encoder input type; choose from conv2d, conv2d6, or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    macaron_style: True
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'abs_selfattn'
    nonorm: False
    cnn_prev: True
    cnn_after: False

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 1
    dropout_rate: 0.0
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 1.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

raw_wav: False
data_save: True
use_gc: True
w2v_encoder: True
pretrain: True
random_pretrain: False
wav2vec: True
w2v_coef: 1.0
mpc_didi_ver: False
wav2mpc: False
wav2mpc_reduction: False
mpc_mask_loss: False
mpc_coef: 0.0
mask: True
quantize_targets: True
project_targets: True
latent_vars: 320
w2v_reduct: True
w2v_ext_loss: True
w2v_loss_weights: [0.1, 0]
w2v_mask_prob: 0.65
mpc_prob: 0.5
remove_valbest: False

model:
    method: 'npc'                  # Accepts npc/apc/vqapc
    paras:
        kernel_size: 15            # Receptive field size (R) = kernel_size + 2*(n_blocks)
        mask_size: 5               # Desired input mask size (M_in) as described in the NPC paper
        n_blocks: 4                # Number of ConvBlocks stacked in the NPC model
        hidden_size: 512           # Feature dimension of all layers
        dropout: 0.1               # Dropout in ConvBlock
        residual: True             # Residual connection in ConvBlock
        batch_norm: True           # Apply BatchNorm in ConvBlock
        activate: 'relu'           # Activation function of ConvBlock
        disable_cross_layer: False # Apply Masked ConvBlock at the last layer only
        vq:
            codebook_size: [64, 64, 64, 64] # Codebook size of each group in the VQ-layer
            code_dim: [128, 128, 128, 128]  # Dim of each group; sums to hidden_size (4 x 128 = 512)
            gumbel_temperature: 1.0         # Temperature of the Gumbel Softmax in the VQ-layer

collate_conf:
    spec_aug: false
    # spec augmentation related
    spec_aug_conf:
        num_time_mask: 2
        num_freq_mask: 2
        max_time_mask: 50
        max_freq_mask: 10
        max_time_warp: 80
        gauss_mask_for_time: False
        warp_for_time: False

# dataset related
dataset_conf:
    max_length: 4500
    min_length: 80
    max_frames_in_batch: 16000
    batch_type: 'dynamic' # static or dynamic
    batch_size: 20
    sort: true

grad_clip: 10
accum_grad: 2
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001

scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 10000
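
# A sketch of what the warmuplr schedule above does, assuming the
# ESPnet/WeNet-style WarmupLR implementation (verify against your toolkit):
# the rate rises linearly for the first warmup_steps updates, then decays
# with the inverse square root of the step count:
#
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)
#
# With lr = 0.001 and warmup_steps = 10000, the peak is reached at step
# 10000: 0.001 * 10000^0.5 * 10000^-0.5 = 0.001.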