---
# network architecture
model: Emotion2vec

# NOTE(review): indentation below is reconstructed — the source paste had its
# structure flattened. Nesting follows the data2vec/emotion2vec config schema
# (modalities inside model_conf, decoder inside modalities.audio); confirm
# against the consuming loader.
model_conf:
  # loss / target construction
  loss_beta: 0.0
  loss_scale: null
  depth: 8
  start_drop_path_rate: 0.0
  end_drop_path_rate: 0.0
  num_heads: 12
  # NOTE(review): written as 1.0e-5 (not the original bare `1e-05`) because
  # YAML 1.1 loaders such as PyYAML resolve `1e-05` as the STRING "1e-05";
  # the 1.1 float form requires a dot before the exponent.
  norm_eps: 1.0e-5
  norm_affine: true
  encoder_dropout: 0.1
  post_mlp_drop: 0.1
  attention_dropout: 0.1
  activation_dropout: 0.0
  dropout_input: 0.0
  layerdrop: 0.05
  embed_dim: 768
  mlp_ratio: 4.0
  layer_norm_first: false
  average_top_k_layers: 8
  end_of_block_targets: false
  clone_batch: 8
  # target normalization options (instance norm on target layers enabled)
  layer_norm_target_layer: false
  batch_norm_target_layer: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_targets: false
  # EMA teacher settings — presumably decay anneals from ema_decay to
  # ema_end_decay over ema_anneal_end_step updates; verify in the model code.
  ema_decay: 0.999
  ema_same_dtype: true
  log_norms: true
  ema_end_decay: 0.99999
  ema_anneal_end_step: 20000
  ema_encoder_only: false
  max_update: 100000
  extractor_mode: layer_norm
  shared_decoder: null
  min_target_var: 0.1
  min_pred_var: 0.01
  supported_modality: AUDIO
  mae_init: false
  seed: 1
  skip_ema: false
  # loss weights
  cls_loss: 1.0
  recon_loss: 0.0
  d2v_loss: 1.0
  decoder_group: false
  # adversarial branch is disabled; hidden_dim/weight only apply when enabled
  adversarial_training: false
  adversarial_hidden_dim: 128
  adversarial_weight: 0.1
  cls_type: chunk
  normalize: true
  modalities:
    audio:
      type: AUDIO
      prenet_depth: 4
      prenet_layerdrop: 0.05
      prenet_dropout: 0.1
      start_drop_path_rate: 0.0
      end_drop_path_rate: 0.0
      num_extra_tokens: 10
      init_extra_token_zero: true
      # masking configuration
      mask_noise_std: 0.01
      mask_prob_min: null
      mask_prob: 0.5
      inverse_mask: false
      mask_prob_adjust: 0.05
      keep_masked_pct: 0.0
      mask_length: 5
      add_masks: false
      remove_masks: false
      mask_dropout: 0.0
      encoder_zero_mask: true
      mask_channel_prob: 0.0
      mask_channel_length: 64
      ema_local_encoder: false
      local_grad_mult: 1.0
      # ALiBi positional bias settings
      use_alibi_encoder: true
      alibi_scale: 1.0
      learned_alibi: false
      alibi_max_pos: null
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      num_alibi_heads: 12
      model_depth: 8
      decoder:
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        input_dropout: 0.1
        add_positions_masked: false
        add_positions_all: false
        decoder_residual: true
        projection_layers: 1
        projection_ratio: 2.0
      extractor_mode: layer_norm
      # conv feature extractor spec — kept quoted: it is a Python expression
      # string evaluated by the model code, not YAML data
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_width: 95
      conv_pos_groups: 16
      conv_pos_depth: 5
      conv_pos_pre_ln: false