jxie's picture
Upload SMAForSSL
4c90f36 verified
{
"_name_or_path": null,
"architectures": [
"SMAForSSL"
],
"attention_dropout_prob": 0.0,
"cross_attention_widening_factor": 1,
"cross_eval_noising_args": null,
"cross_train_noising_args": [
[
"RandomlySelectedCrossAttentionMasking",
{
"exclude_seen_reconstruction": true,
"head_aggregation": "random_mix",
"mask_self": true,
"masking_ratio": 0.3,
"num_per_query": 3,
"select_initial_ratio": 1.0,
"varying_length": true
}
]
],
"decoder_attention_channels": 768,
"decoder_heads": 1,
"decoder_latent_channels": 768,
"decoder_type": "cross_attention",
"dense_use_bias": true,
"drop_path_rate": 0.0,
"embedded_channels": 768,
"encoder_cross_attention_channels": 768,
"encoder_type": "self_attention",
"final_project": true,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"initializer_range": 0.02,
"input_channels": 3,
"input_type": "discrete",
"latent_channels": 768,
"layer_norm_eps": 1e-12,
"layernorm_eps": 1e-12,
"loss_fn": "mse",
"max_position_embeddings": 512,
"model_type": "sma",
"num_blocks": 1,
"num_cross_attention_heads": 12,
"num_discrete_tokens": 262,
"num_latents": 256,
"num_outputs": 2048,
"num_self_attends_per_block": 12,
"num_self_attention_heads": 12,
"output_channels": 262,
"pe_initializer_range": 0.02,
"post_decoder_layers": null,
"project_after_concat": true,
"qk_channels": 768,
"self_attention_widening_factor": 1,
"share_decoder_queries": true,
"share_embedding_weights": true,
"teacher_args": {
"auxiliary_loss_fn": "mse",
"auxiliary_loss_weight": 1.0,
"ema_args": {
"ema_decay_end": 0.0,
"ema_decay_start": 0.0
},
"eval_transform_args": [
[
"RandomlySelectedCrossAttentionMasking",
{
"exclude_seen_reconstruction": true,
"head_aggregation": "random_mix",
"mask_self": true,
"masking_ratio": 0.3,
"num_per_query": 3,
"select_initial_ratio": 1.0,
"varying_length": true
}
]
],
"mask_replace": 3,
"num_layer_target_avg": null,
"reconstruction_decoder_args": {
"num_heads": 12,
"num_outputs": 512,
"output_channels": 262,
"qk_channels": 768,
"query_num_channels": 768,
"share_decoder_queries": true,
"share_embedding_weights": true,
"use_query_residual": true,
"v_channels": 768
},
"reconstruction_loss_fn": "crossentropy",
"reconstruction_loss_weight": 1.0,
"reconstruction_weighted_loss": false,
"target_normalization_fn": "layernorm",
"train_transform_args": null
},
"teacher_name": "ReconstructionTeacher",
"torch_dtype": "float32",
"transformers_version": "4.26.0.dev0",
"use_decoder": false,
"use_position_embeddings": true,
"use_query_residual": true,
"v_channels": 768
}