# network architecture
model: Emotion2vec
model_conf:
    _name: data2vec_multi
    activation_dropout: 0.0
    adversarial_hidden_dim: 128
    adversarial_training: false
    adversarial_weight: 0.1
    attention_dropout: 0.1
    average_top_k_layers: 16
    batch_norm_target_layer: false
    clone_batch: 12
    cls_loss: 1.0
    cls_type: chunk
    d2v_loss: 1.0
    decoder_group: false
    depth: 8
    dropout_input: 0.0
    ema_anneal_end_step: 20000
    ema_decay: 0.9997
    ema_encoder_only: false
    ema_end_decay: 1.0
    ema_same_dtype: true
    embed_dim: 1024
    encoder_dropout: 0.1
    end_drop_path_rate: 0.0
    end_of_block_targets: false
    instance_norm_target_layer: true
    instance_norm_targets: false
    layer_norm_first: false
    layer_norm_target_layer: false
    layer_norm_targets: false
    layerdrop: 0.0
    log_norms: true
    loss_beta: 0.0
    loss_scale: null
    mae_init: false
    max_update: 100000
    min_pred_var: 0.01
    min_target_var: 0.1
    mlp_ratio: 4.0
    normalize: true
    modalities:
        _name: null
        audio:
            add_masks: false
            alibi_max_pos: null
            alibi_scale: 1.0
            conv_pos_depth: 5
            conv_pos_groups: 16
            conv_pos_pre_ln: false
            conv_pos_width: 95
            decoder:
                add_positions_all: false
                add_positions_masked: false
                decoder_dim: 768
                decoder_groups: 16
                decoder_kernel: 7
                decoder_layers: 4
                decoder_residual: true
                input_dropout: 0.1
                projection_layers: 1
                projection_ratio: 2.0
            ema_local_encoder: false
            encoder_zero_mask: true
            end_drop_path_rate: 0.0
            extractor_mode: layer_norm
            feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
            init_extra_token_zero: true
            inverse_mask: false
            keep_masked_pct: 0.0
            learned_alibi: false
            learned_alibi_scale: true
            learned_alibi_scale_per_head: true
            learned_alibi_scale_per_layer: false
            local_grad_mult: 1.0
            mask_channel_length: 64
            mask_channel_prob: 0.0
            mask_dropout: 0.0
            mask_length: 5
            mask_noise_std: 0.01
            mask_prob: 0.55
            mask_prob_adjust: 0.1
            mask_prob_min: null
            model_depth: 8
            num_alibi_heads: 16
            num_extra_tokens: 10
            prenet_depth: 4
            prenet_dropout: 0.1
            prenet_layerdrop: 0.0
            remove_masks: false
            start_drop_path_rate: 0.0
            type: AUDIO
            use_alibi_encoder: true
        image:
            add_masks: false
            alibi_dims: 2
            alibi_distance: manhattan
            alibi_max_pos: null
            alibi_scale: 1.0
            decoder:
                add_positions_all: false
                add_positions_masked: false
                decoder_dim: 384
                decoder_groups: 16
                decoder_kernel: 5
                decoder_layers: 5
                decoder_residual: true
                input_dropout: 0.1
                projection_layers: 1
                projection_ratio: 2.0
            ema_local_encoder: false
            embed_dim: 768
            enc_dec_transformer: false
            encoder_zero_mask: true
            end_drop_path_rate: 0.0
            fixed_positions: true
            in_chans: 3
            init_extra_token_zero: true
            input_size: 224
            inverse_mask: false
            keep_masked_pct: 0.0
            learned_alibi: false
            learned_alibi_scale: false
            learned_alibi_scale_per_head: false
            learned_alibi_scale_per_layer: false
            local_grad_mult: 1.0
            mask_channel_length: 64
            mask_channel_prob: 0.0
            mask_dropout: 0.0
            mask_length: 5
            mask_noise_std: 0.01
            mask_prob: 0.7
            mask_prob_adjust: 0.0
            mask_prob_min: null
            model_depth: 8
            num_alibi_heads: 16
            num_extra_tokens: 0
            patch_size: 16
            prenet_depth: 4
            prenet_dropout: 0.0
            prenet_layerdrop: 0.0
            remove_masks: false
            start_drop_path_rate: 0.0
            transformer_decoder: false
            type: IMAGE
            use_alibi_encoder: false
        text:
            add_masks: false
            alibi_max_pos: null
            alibi_scale: 1.0
            decoder:
                add_positions_all: false
                add_positions_masked: false
                decoder_dim: 384
                decoder_groups: 16
                decoder_kernel: 5
                decoder_layers: 5
                decoder_residual: true
                input_dropout: 0.1
                projection_layers: 1
                projection_ratio: 2.0
            dropout: 0.1
            ema_local_encoder: false
            encoder_zero_mask: true
            end_drop_path_rate: 0.0
            init_extra_token_zero: true
            inverse_mask: false
            keep_masked_pct: 0.0
            layernorm_embedding: true
            learned_alibi: false
            learned_alibi_scale: false
            learned_alibi_scale_per_head: false
            learned_alibi_scale_per_layer: false
            learned_pos: true
            local_grad_mult: 1.0
            mask_channel_length: 64
            mask_channel_prob: 0.0
            mask_dropout: 0.0
            mask_length: 5
            mask_noise_std: 0.01
            mask_prob: 0.7
            mask_prob_adjust: 0.0
            mask_prob_min: null
            max_source_positions: 512
            model_depth: 8
            no_scale_embedding: true
            no_token_positional_embeddings: false
            num_alibi_heads: 16
            num_extra_tokens: 0
            prenet_depth: 4
            prenet_dropout: 0.0
            prenet_layerdrop: 0.0
            remove_masks: false
            start_drop_path_rate: 0.0
            type: TEXT
            use_alibi_encoder: false
    norm_affine: true
    norm_eps: 1.0e-05
    num_heads: 16
    post_mlp_drop: 0.1
    recon_loss: 0.0
    seed: 1
    shared_decoder: null
    skip_ema: false
    start_drop_path_rate: 0.0
    supported_modality: AUDIO

tokenizer: CharTokenizer
tokenizer_conf:
    unk_symbol: <unk>
    split_with_space: true

scope_map:
- 'd2v_model.'
- none
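
# Usage sketch (an assumption, not part of the upstream config): a config.yaml
# like this one is read automatically when the model directory is loaded through
# FunASR's AutoModel interface, so it is normally not edited by hand. The model
# id below is a placeholder; point it at the directory or hub id that actually
# contains this file.
#
#     from funasr import AutoModel
#
#     model = AutoModel(model="iic/emotion2vec_base")  # placeholder model id
#     res = model.generate("example.wav",
#                          granularity="utterance",
#                          extract_embedding=True)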