{ "_name_or_path": null, "architectures": [ "SMAForSSL" ], "attention_dropout_prob": 0.0, "cross_attention_widening_factor": 1, "cross_eval_noising_args": null, "cross_train_noising_args": [ [ "RandomlySelectedCrossAttentionMasking", { "exclude_seen_reconstruction": true, "head_aggregation": "random_mix", "mask_self": true, "masking_ratio": 0.3, "num_per_query": 3, "select_initial_ratio": 1.0, "varying_length": true } ] ], "decoder_attention_channels": 768, "decoder_heads": 1, "decoder_latent_channels": 768, "decoder_type": "cross_attention", "dense_use_bias": true, "drop_path_rate": 0.0, "embedded_channels": 768, "encoder_cross_attention_channels": 768, "encoder_type": "self_attention", "final_project": true, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "initializer_range": 0.02, "input_channels": 3, "input_type": "discrete", "latent_channels": 768, "layer_norm_eps": 1e-12, "layernorm_eps": 1e-12, "loss_fn": "mse", "max_position_embeddings": 512, "model_type": "sma", "num_blocks": 1, "num_cross_attention_heads": 12, "num_discrete_tokens": 262, "num_latents": 256, "num_outputs": 2048, "num_self_attends_per_block": 12, "num_self_attention_heads": 12, "output_channels": 262, "pe_initializer_range": 0.02, "post_decoder_layers": null, "project_after_concat": true, "qk_channels": 768, "self_attention_widening_factor": 1, "share_decoder_queries": true, "share_embedding_weights": true, "teacher_args": { "auxiliary_loss_fn": "mse", "auxiliary_loss_weight": 1.0, "ema_args": { "ema_decay_end": 0.0, "ema_decay_start": 0.0 }, "eval_transform_args": [ [ "RandomlySelectedCrossAttentionMasking", { "exclude_seen_reconstruction": true, "head_aggregation": "random_mix", "mask_self": true, "masking_ratio": 0.3, "num_per_query": 3, "select_initial_ratio": 1.0, "varying_length": true } ] ], "mask_replace": 3, "num_layer_target_avg": null, "reconstruction_decoder_args": { "num_heads": 12, "num_outputs": 512, "output_channels": 262, "qk_channels": 768, "query_num_channels": 768, "share_decoder_queries": true, "share_embedding_weights": true, "use_query_residual": true, "v_channels": 768 }, "reconstruction_loss_fn": "crossentropy", "reconstruction_loss_weight": 1.0, "reconstruction_weighted_loss": false, "target_normalization_fn": "layernorm", "train_transform_args": null }, "teacher_name": "ReconstructionTeacher", "torch_dtype": "float32", "transformers_version": "4.26.0.dev0", "use_decoder": false, "use_position_embeddings": true, "use_query_residual": true, "v_channels": 768 }