# network architecture
model: Emotion2vec
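# Backbone: fairseq's data2vec_multi (data2vec 2.0), configured here for audio:
# embed_dim 1024, 16 attention heads, 8 shared Transformer blocks (depth) plus
# a 4-block audio-specific prenet (prenet_depth; a gloss of the fairseq config).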
model_conf:
  _name: data2vec_multi
  activation_dropout: 0.0
  adversarial_hidden_dim: 128
  adversarial_training: false
  adversarial_weight: 0.1
  attention_dropout: 0.1
  average_top_k_layers: 16
  batch_norm_target_layer: false
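  # Multi-mask training and loss weights (a gloss of data2vec 2.0 and the
  # emotion2vec recipe, not authoritative): clone_batch repeats each utterance
  # with 12 distinct masks per step; d2v_loss weights the frame-level
  # prediction loss and cls_loss the added utterance-level loss (cls_type
  # selects how its targets are pooled).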
  clone_batch: 12
  cls_loss: 1.0
  cls_type: chunk
  d2v_loss: 1.0
  decoder_group: false
  depth: 8
  dropout_input: 0.0
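  # EMA teacher schedule (a gloss, per data2vec's usual setup): the teacher's
  # decay anneals from ema_decay to ema_end_decay over ema_anneal_end_step
  # updates, and the student regresses the teacher's averaged top-k layer
  # outputs (average_top_k_layers above).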
  ema_anneal_end_step: 20000
  ema_decay: 0.9997
  ema_encoder_only: false
  ema_end_decay: 1.0
  ema_same_dtype: true
  embed_dim: 1024
  encoder_dropout: 0.1
  end_drop_path_rate: 0.0
  end_of_block_targets: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_first: false
  layer_norm_target_layer: false
  layer_norm_targets: false
  layerdrop: 0.0
  log_norms: true
  loss_beta: 0.0
  loss_scale: null
  mae_init: false
  max_update: 100000
  min_pred_var: 0.01
  min_target_var: 0.1
  mlp_ratio: 4.0
  normalize: true
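  # Per-modality encoder/decoder settings. Only the audio branch is active in
  # this model (supported_modality: AUDIO below).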
  modalities:
    _name: null
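    # Audio branch: a conv feature extractor (feature_encoder_spec below) feeds
    # Transformer blocks with convolutional relative position encoding
    # (conv_pos_*) and ALiBi attention biases (use_alibi_encoder: true).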
    audio:
      add_masks: false
      alibi_max_pos: null
      alibi_scale: 1.0
      conv_pos_depth: 5
      conv_pos_groups: 16
      conv_pos_pre_ln: false
      conv_pos_width: 95
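      # Shallow 1-D conv decoder that predicts the teacher targets for masked
      # frames (data2vec 2.0-style; a gloss).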
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 768
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      ema_local_encoder: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      extractor_mode: layer_norm
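      # (dim, kernel, stride) conv layers; the cumulative stride is
      # 5 * 2**6 = 320 samples, i.e. one frame per 20 ms at 16 kHz (the
      # standard wav2vec 2.0 feature-extractor stack).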
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] + [(512, 2, 2)]'
      init_extra_token_zero: true
      inverse_mask: false
      keep_masked_pct: 0.0
      learned_alibi: false
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      local_grad_mult: 1.0
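      # Time masking: spans of mask_length = 5 frames are drawn until roughly
      # mask_prob = 0.55 of frames are masked; mask_prob_adjust compensates for
      # overlapping spans (our reading of the data2vec 2.0 masking code).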
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.55
      mask_prob_adjust: 0.1
      mask_prob_min: null
      model_depth: 8
      num_alibi_heads: 16
      num_extra_tokens: 10
      prenet_depth: 4
      prenet_dropout: 0.1
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      type: AUDIO
      use_alibi_encoder: true
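    # The image and text branches below are data2vec 2.0 defaults, presumably
    # retained for checkpoint compatibility; they are unused in this audio-only
    # model (an assumption based on supported_modality: AUDIO).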
    image:
      add_masks: false
      alibi_dims: 2
      alibi_distance: manhattan
      alibi_max_pos: null
      alibi_scale: 1.0
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 5
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      ema_local_encoder: false
      embed_dim: 768
      enc_dec_transformer: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      fixed_positions: true
      in_chans: 3
      init_extra_token_zero: true
      input_size: 224
      inverse_mask: false
      keep_masked_pct: 0.0
      learned_alibi: false
      learned_alibi_scale: false
      learned_alibi_scale_per_head: false
      learned_alibi_scale_per_layer: false
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.7
      mask_prob_adjust: 0.0
      mask_prob_min: null
      model_depth: 8
      num_alibi_heads: 16
      num_extra_tokens: 0
      patch_size: 16
      prenet_depth: 4
      prenet_dropout: 0.0
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      transformer_decoder: false
      type: IMAGE
      use_alibi_encoder: false
    text:
      add_masks: false
      alibi_max_pos: null
      alibi_scale: 1.0
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 5
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      dropout: 0.1
      ema_local_encoder: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      init_extra_token_zero: true
      inverse_mask: false
      keep_masked_pct: 0.0
      layernorm_embedding: true
      learned_alibi: false
      learned_alibi_scale: false
      learned_alibi_scale_per_head: false
      learned_alibi_scale_per_layer: false
      learned_pos: true
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.7
      mask_prob_adjust: 0.0
      mask_prob_min: null
      max_source_positions: 512
      model_depth: 8
      no_scale_embedding: true
      no_token_positional_embeddings: false
      num_alibi_heads: 16
      num_extra_tokens: 0
      prenet_depth: 4
      prenet_dropout: 0.0
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      type: TEXT
      use_alibi_encoder: false
  norm_affine: true
  norm_eps: 1.0e-05
  num_heads: 16
  post_mlp_drop: 0.1
  recon_loss: 0.0
  seed: 1
  shared_decoder: null
  skip_ema: false
  start_drop_path_rate: 0.0
  supported_modality: AUDIO
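# FunASR text front-end (a gloss): CharTokenizer splits labels into
# characters, treating whitespace as a separator (split_with_space) and
# mapping unknown symbols to <unk>.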
tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: <unk>
  split_with_space: true
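# Checkpoint key mapping: parameters stored under the 'd2v_model.' prefix in
# the pretrained checkpoint are loaded into the model root ('none'); this is
# our reading of FunASR's scope_map handling.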
scope_map:
  - 'd2v_model.'
  - none
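# Usage sketch (a minimal example, assuming FunASR's AutoModel API; the model
# id below is illustrative, substitute the repo this config ships with):
#
#   from funasr import AutoModel
#
#   model = AutoModel(model="iic/emotion2vec_base")
#   res = model.generate("input.wav", granularity="utterance",
#                        extract_embedding=True)
#   print(res)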