# network architecture
model: Emotion2vec
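# model_conf carries the data2vec 2.0 ("data2vec_multi") backbone settings that
# Emotion2vec builds on; only the audio branch is active here (see
# supported_modality: AUDIO at the end of this block).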
model_conf:
  _name: data2vec_multi
  activation_dropout: 0.0
  adversarial_hidden_dim: 128
  adversarial_training: false
  adversarial_weight: 0.1
  attention_dropout: 0.1
  average_top_k_layers: 16
  batch_norm_target_layer: false
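  # data2vec 2.0 multi-mask trick: each sample is repeated clone_batch times
  # with different masks, so one teacher forward pass serves several student views.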
  clone_batch: 12
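  # Loss weights: cls_loss is the utterance-level objective (cls_type: chunk
  # presumably pools frame features chunk-wise) and d2v_loss is the frame-level
  # data2vec objective; both are weighted 1.0 here.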
  cls_loss: 1.0
  cls_type: chunk
  d2v_loss: 1.0
  decoder_group: false
  depth: 8
  dropout_input: 0.0
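  # EMA teacher schedule: decay is annealed from ema_decay to ema_end_decay
  # over ema_anneal_end_step updates, as in standard data2vec training.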
  ema_anneal_end_step: 20000
  ema_decay: 0.9997
  ema_encoder_only: false
  ema_end_decay: 1.0
  ema_same_dtype: true
  embed_dim: 1024
  encoder_dropout: 0.1
  end_drop_path_rate: 0.0
  end_of_block_targets: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_first: false
  layer_norm_target_layer: false
  layer_norm_targets: false
  layerdrop: 0.0
  log_norms: true
  loss_beta: 0.0
  loss_scale: null
  mae_init: false
  max_update: 100000
  min_pred_var: 0.01
  min_target_var: 0.1
  mlp_ratio: 4.0
  normalize: true
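  # Per-modality settings; data2vec_multi defines audio, image and text
  # branches, but only the audio branch is used by Emotion2vec.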
  modalities:
    _name: null
    audio:
      add_masks: false
      alibi_max_pos: null
      alibi_scale: 1.0
      conv_pos_depth: 5
      conv_pos_groups: 16
      conv_pos_pre_ln: false
      conv_pos_width: 95
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 768
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      ema_local_encoder: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      extractor_mode: layer_norm
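      # wav2vec2-style conv frontend, given as a Python expression of
      # (dim, kernel, stride) tuples; total stride is 320 samples, i.e. ~50 Hz
      # frames from 16 kHz audio.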
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      init_extra_token_zero: true
      inverse_mask: false
      keep_masked_pct: 0.0
      learned_alibi: false
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.55
      mask_prob_adjust: 0.1
      mask_prob_min: null
      model_depth: 8
      num_alibi_heads: 16
      num_extra_tokens: 10
      prenet_depth: 4
      prenet_dropout: 0.1
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      type: AUDIO
      use_alibi_encoder: true
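    # Image branch of the data2vec_multi schema; presumably unused here since
    # supported_modality is AUDIO.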
    image:
      add_masks: false
      alibi_dims: 2
      alibi_distance: manhattan
      alibi_max_pos: null
      alibi_scale: 1.0
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 5
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      ema_local_encoder: false
      embed_dim: 768
      enc_dec_transformer: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      fixed_positions: true
      in_chans: 3
      init_extra_token_zero: true
      input_size: 224
      inverse_mask: false
      keep_masked_pct: 0.0
      learned_alibi: false
      learned_alibi_scale: false
      learned_alibi_scale_per_head: false
      learned_alibi_scale_per_layer: false
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.7
      mask_prob_adjust: 0.0
      mask_prob_min: null
      model_depth: 8
      num_alibi_heads: 16
      num_extra_tokens: 0
      patch_size: 16
      prenet_depth: 4
      prenet_dropout: 0.0
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      transformer_decoder: false
      type: IMAGE
      use_alibi_encoder: false
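    # Text branch of the data2vec_multi schema; likewise presumably unused for
    # audio-only emotion representation learning.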
    text:
      add_masks: false
      alibi_max_pos: null
      alibi_scale: 1.0
      decoder:
        add_positions_all: false
        add_positions_masked: false
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 5
        decoder_layers: 5
        decoder_residual: true
        input_dropout: 0.1
        projection_layers: 1
        projection_ratio: 2.0
      dropout: 0.1
      ema_local_encoder: false
      encoder_zero_mask: true
      end_drop_path_rate: 0.0
      init_extra_token_zero: true
      inverse_mask: false
      keep_masked_pct: 0.0
      layernorm_embedding: true
      learned_alibi: false
      learned_alibi_scale: false
      learned_alibi_scale_per_head: false
      learned_alibi_scale_per_layer: false
      learned_pos: true
      local_grad_mult: 1.0
      mask_channel_length: 64
      mask_channel_prob: 0.0
      mask_dropout: 0.0
      mask_length: 5
      mask_noise_std: 0.01
      mask_prob: 0.7
      mask_prob_adjust: 0.0
      mask_prob_min: null
      max_source_positions: 512
      model_depth: 8
      no_scale_embedding: true
      no_token_positional_embeddings: false
      num_alibi_heads: 16
      num_extra_tokens: 0
      prenet_depth: 4
      prenet_dropout: 0.0
      prenet_layerdrop: 0.0
      remove_masks: false
      start_drop_path_rate: 0.0
      type: TEXT
      use_alibi_encoder: false
  norm_affine: true
  norm_eps: 1.0e-05
  num_heads: 16
  post_mlp_drop: 0.1
  recon_loss: 0.0
  seed: 1
  shared_decoder: null
  skip_ema: false
  start_drop_path_rate: 0.0
  supported_modality: AUDIO

tokenizer: CharTokenizer
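# CharTokenizer settings: unk_symbol marks out-of-vocabulary characters;
# split_with_space presumably splits input on whitespace rather than per character.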
tokenizer_conf:
  unk_symbol: <unk>
  split_with_space: true

scope_map:
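  # Appears to remap parameter-name prefixes when loading the pretrained
  # checkpoint: weights stored under 'd2v_model.' map to the top level
  # ('none' meaning no prefix on the model side).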
  - 'd2v_model.'
  - none