{
  "architectures": [
    "SMAForSSL"
  ],
  "attention_dropout_prob": 0.0,
  "cross_attention_widening_factor": 1,
  "cross_eval_noising_args": null,
  "cross_train_noising_args": [
    [
      "RandomlySelectedCrossAttentionMasking",
      {
        "exclude_seen_reconstruction": true,
        "head_aggregation": "random_mix",
        "mask_self": true,
        "masking_ratio": 0.2,
        "num_per_query": 3,
        "select_initial_ratio": 1.0,
        "varying_length": true
      }
    ]
  ],
  "decoder_attention_channels": 128,
  "decoder_heads": 1,
  "decoder_latent_channels": 128,
  "decoder_type": "cross_attention",
  "dense_use_bias": true,
  "drop_path_rate": 0.0,
  "embedded_channels": 128,
  "encoder_cross_attention_channels": 128,
  "encoder_type": "self_attention",
  "final_project": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "input_channels": 1,
  "input_type": "continuous",
  "latent_channels": 128,
  "layer_norm_eps": 1e-12,
  "layernorm_eps": 1e-12,
  "loss_fn": "mse",
  "max_position_embeddings": 28,
  "model_type": "sma",
  "num_blocks": 1,
  "num_cross_attention_heads": 8,
  "num_discrete_tokens": 262,
  "num_latents": 128,
  "num_outputs": 2048,
  "num_self_attends_per_block": 4,
  "num_self_attention_heads": 8,
  "output_channels": 262,
  "pe_initializer_range": 0.02,
  "post_decoder_layers": null,
  "project_after_concat": true,
  "qk_channels": 128,
  "self_attention_widening_factor": 1,
  "share_decoder_queries": true,
  "share_embedding_weights": true,
  "teacher_args": {
    "auxiliary_loss_fn": "mse",
    "auxiliary_loss_weight": 1.0,
    "ema_args": {
      "ema_decay_end": 0.0,
      "ema_decay_start": 0.0
    },
    "eval_transform_args": [
      [
        "RandomlySelectedCrossAttentionMasking",
        {
          "exclude_seen_reconstruction": true,
          "head_aggregation": "random_mix",
          "mask_self": true,
          "masking_ratio": 0.2,
          "num_per_query": 3,
          "select_initial_ratio": 1.0,
          "varying_length": true
        }
      ]
    ],
    "mask_replace": 3,
    "num_layer_target_avg": null,
    "reconstruction_decoder_args": {
      "num_heads": 8,
      "num_outputs": 28,
      "output_channels": 1,
      "qk_channels": 128,
      "query_num_channels": 128,
      "share_decoder_queries": true,
      "share_embedding_weights": true,
      "use_query_residual": true,
      "v_channels": 128
    },
    "reconstruction_loss_fn": "mse",
    "reconstruction_loss_weight": 1.0,
    "reconstruction_weighted_loss": false,
    "target_normalization_fn": "layernorm",
    "train_transform_args": null
  },
  "teacher_name": "ReconstructionTeacher",
  "torch_dtype": "float32",
  "transformers_version": "4.26.0.dev0",
  "use_decoder": false,
  "use_position_embeddings": true,
  "use_query_residual": true,
  "v_channels": 128
}
|