# emotion2vec_plus_large / config.yaml
# network architecture
model: Emotion2vec
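# model_conf holds the hyperparameters of the data2vec_multi backbone (fairseq's
# data2vec 2.0 multi-modal model) that emotion2vec is built on: a 1024-dimensional
# Transformer encoder (depth 8 plus a 4-block audio prenet, 16 attention heads)
# trained against an EMA teacher whose decay is annealed from ema_decay to
# ema_end_decay over ema_anneal_end_step updates.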
model_conf:
  _name: data2vec_multi
  activation_dropout: 0.0
  adversarial_hidden_dim: 128
  adversarial_training: false
  adversarial_weight: 0.1
  attention_dropout: 0.1
  average_top_k_layers: 16
  batch_norm_target_layer: false
  clone_batch: 12
  cls_loss: 1.0
  cls_type: chunk
  d2v_loss: 1.0
  decoder_group: false
  depth: 8
  dropout_input: 0.0
  ema_anneal_end_step: 20000
  ema_decay: 0.9997
  ema_encoder_only: false
  ema_end_decay: 1.0
  ema_same_dtype: true
  embed_dim: 1024
  encoder_dropout: 0.1
  end_drop_path_rate: 0.0
  end_of_block_targets: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_first: false
  layer_norm_target_layer: false
  layer_norm_targets: false
  layerdrop: 0.0
  log_norms: true
  loss_beta: 0.0
  loss_scale: null
  mae_init: false
  max_update: 100000
  min_pred_var: 0.01
  min_target_var: 0.1
  mlp_ratio: 4.0
  normalize: true
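  # Per-modality encoder settings. In the audio branch, feature_encoder_spec defines a
  # wav2vec2-style convolutional frontend of 7 layers with 512 channels and
  # kernel/stride pairs (10,5), 4x(3,2), (2,2), (2,2), i.e. an overall stride of
  # 320 samples (20 ms frames, assuming 16 kHz input). Spans of mask_length frames are
  # masked with probability mask_prob for the student, and a small convolutional
  # decoder predicts the teacher's layer-averaged targets at the masked positions.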
  modalities:
    _name: null
    audio:
      add_masks: false
      alibi_max_pos: null
      alibi_scale: 1.0
      conv_pos_depth: 5
      conv_pos_groups: 16
      conv_pos_pre_ln: false
      conv_pos_width: 95
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 768
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      ema_local_encoder: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      extractor_mode: layer_norm
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      init_extra_token_zero: true
      inverse_mask: false
      keep_masked_pct: 0.0
      learned_alibi: false
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.55
      mask_prob_adjust: 0.1
      mask_prob_min: null
      model_depth: 8
      num_alibi_heads: 16
      num_extra_tokens: 10
      prenet_depth: 4
      prenet_dropout: 0.1
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      type: AUDIO
      use_alibi_encoder: true
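    # The image and text blocks below carry the data2vec_multi defaults for those
    # modalities; emotion2vec only uses the audio branch (supported_modality: AUDIO),
    # so they are presumably retained for config compatibility rather than used at runtime.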
    image:
      add_masks: false
      alibi_dims: 2
      alibi_distance: manhattan
      alibi_max_pos: null
      alibi_scale: 1.0
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 5
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      ema_local_encoder: false
      embed_dim: 768
      enc_dec_transformer: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      fixed_positions: true
      in_chans: 3
      init_extra_token_zero: true
      input_size: 224
      inverse_mask: false
      keep_masked_pct: 0.0
      learned_alibi: false
      learned_alibi_scale: false
      learned_alibi_scale_per_head: false
      learned_alibi_scale_per_layer: false
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.7
      mask_prob_adjust: 0.0
      mask_prob_min: null
      model_depth: 8
      num_alibi_heads: 16
      num_extra_tokens: 0
      patch_size: 16
      prenet_depth: 4
      prenet_dropout: 0.0
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      transformer_decoder: false
      type: IMAGE
      use_alibi_encoder: false
    text:
      add_masks: false
      alibi_max_pos: null
      alibi_scale: 1.0
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 5
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      dropout: 0.1
      ema_local_encoder: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      init_extra_token_zero: true
      inverse_mask: false
      keep_masked_pct: 0.0
      layernorm_embedding: true
      learned_alibi: false
      learned_alibi_scale: false
      learned_alibi_scale_per_head: false
      learned_alibi_scale_per_layer: false
      learned_pos: true
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.7
      mask_prob_adjust: 0.0
      mask_prob_min: null
      max_source_positions: 512
      model_depth: 8
      no_scale_embedding: true
      no_token_positional_embeddings: false
      num_alibi_heads: 16
      num_extra_tokens: 0
      prenet_depth: 4
      prenet_dropout: 0.0
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      type: TEXT
      use_alibi_encoder: false
  norm_affine: true
  norm_eps: 1.0e-05
  num_heads: 16
  post_mlp_drop: 0.1
  recon_loss: 0.0
  seed: 1
  shared_decoder: null
  skip_ema: false
  start_drop_path_rate: 0.0
  supported_modality: AUDIO
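# Tokenizer settings read by the FunASR framework that loads this config; the
# acoustic model itself operates on raw waveforms.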
tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: <unk>
  split_with_space: true
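# scope_map appears to remap state-dict key prefixes when loading pretrained weights:
# parameters under 'd2v_model.' in this model correspond to the unprefixed ('none')
# names in the upstream emotion2vec checkpoint.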
scope_map:
- 'd2v_model.'
- none
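
# Example of how this config is typically consumed (illustrative only, not part of the
# config; the model ID and keyword arguments follow FunASR's AutoModel interface):
#   from funasr import AutoModel
#   model = AutoModel(model="iic/emotion2vec_plus_large")
#   res = model.generate("speech.wav", granularity="utterance", extract_embedding=False)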