from transformers import PretrainedConfig


class F2PDecoderConfig(PretrainedConfig):
    """Configuration for a feature-to-pixel reconstruction decoder.

    Holds the encoder/decoder hyperparameters and preprocessing statistics
    needed to reconstruct RGB pixels from frozen SigLIP2 patch features.
    """

    model_type = "f2p_decoder"
    def __init__(
        self,
        pretrained_encoder_name: str = "google/siglip2-so400m-patch14-224",
        source_decoder_repo: str = "nyu-visionx/siglip2_decoder",
        image_size: int = 224,
        patch_size: int = 14,
        num_channels: int = 3,
        hidden_size: int = 1152,  # encoder feature width (SigLIP2 so400m)
        decoder_hidden_size: int = 1152,
        decoder_num_hidden_layers: int = 28,
        decoder_num_attention_heads: int = 16,
        decoder_intermediate_size: int = 4096,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        qkv_bias: bool = True,
        num_patches: int = 256,  # (224 / 14) ** 2 for the default geometry
        drop_cls_token: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Preserve an auto_map passed through kwargs; otherwise point
        # AutoConfig/AutoModel at the custom classes so the repo loads
        # with trust_remote_code=True.
        if getattr(self, "auto_map", None) is None:
            self.auto_map = {
                "AutoConfig": "configuration_f2p_decoder.F2PDecoderConfig",
                "AutoModel": "modeling_f2p_decoder.F2PDecoderModel",
            }
        # Default to the [-1, 1] normalization used by SigLIP-style encoders.
        if image_mean is None:
            image_mean = [0.5, 0.5, 0.5]
        if image_std is None:
            image_std = [0.5, 0.5, 0.5]
        if len(image_mean) != num_channels or len(image_std) != num_channels:
            raise ValueError("image_mean and image_std must match num_channels.")
        if not drop_cls_token:
            raise ValueError("Only drop_cls_token=True is supported by this decoder.")
        self.pretrained_encoder_name = pretrained_encoder_name
        self.source_decoder_repo = source_decoder_repo
        self.image_size = int(image_size)
        self.patch_size = int(patch_size)
        self.num_channels = int(num_channels)
        self.hidden_size = int(hidden_size)
        self.decoder_hidden_size = int(decoder_hidden_size)
        self.decoder_num_hidden_layers = int(decoder_num_hidden_layers)
        self.decoder_num_attention_heads = int(decoder_num_attention_heads)
        self.decoder_intermediate_size = int(decoder_intermediate_size)
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = float(hidden_dropout_prob)
        self.attention_probs_dropout_prob = float(attention_probs_dropout_prob)
        self.initializer_range = float(initializer_range)
        self.layer_norm_eps = float(layer_norm_eps)
        self.qkv_bias = bool(qkv_bias)
        self.num_patches = int(num_patches)
        self.drop_cls_token = bool(drop_cls_token)
        self.image_mean = [float(value) for value in image_mean]
        self.image_std = [float(value) for value in image_std]
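
# Usage sketch (illustrative, not part of the shipped module). Assumes this
# file is saved as configuration_f2p_decoder.py; the round trip below only
# exercises the config itself and needs no model weights.
if __name__ == "__main__":
    config = F2PDecoderConfig()
    # 224 / 14 = 16 patches per side, so 16 * 16 = 256 patch tokens.
    assert config.num_patches == (config.image_size // config.patch_size) ** 2

    config.save_pretrained("./f2p_decoder")  # writes config.json with auto_map
    reloaded = F2PDecoderConfig.from_pretrained("./f2p_decoder")
    assert reloaded.image_mean == [0.5, 0.5, 0.5]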