# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" VLE model configuration"""

import copy
from typing import Dict, Union

from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.models.clip.configuration_clip import CLIPVisionConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class VLEConfig(PretrainedConfig):
    r"""
    [`VLEConfig`] is the configuration class to store the configuration of a [`VLEModel`]. It is used to instantiate
    a [`VLEModel`] according to the specified arguments, defining the text model, vision model, and cross-modal
    (co-attention) configs.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[PretrainedConfig, Dict]`):
            Configuration options that define the text model config.
        vision_config (`Union[PretrainedConfig, Dict]`):
            Configuration options that define the vision model config.
        num_token_types (`int`, *optional*, defaults to 2):
            The number of token types (e.g. text and image) distinguished by the co-attention encoder.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the co-attention encoder layers.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the co-attention encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the co-attention encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (feed-forward) layer in the co-attention encoder.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the co-attention encoder.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the co-attention encoder.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated-normal initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        classifier_dropout (`float`, *optional*):
            The dropout ratio for classification heads.
        kwargs (*optional*):
            Dictionary of keyword arguments.
    Examples:

    ```python
    >>> from transformers import BertConfig, ViTConfig
    >>> from configuration_vle import VLEConfig
    >>> from modeling_vle import VLEModel

    >>> # Initializing BERT and ViT configurations
    >>> config_text = BertConfig()
    >>> config_vision = ViTConfig()

    >>> config = VLEConfig(text_config=config_text, vision_config=config_vision)

    >>> # Initializing a VLEModel (with random weights) from the BERT and ViT configurations
    >>> model = VLEModel(config=config)

    >>> # Accessing the model configuration
    >>> config_vision = model.config.vision_config
    >>> config_text = model.config.text_config

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("vit-bert")

    >>> # Loading model and config from the pretrained folder
    >>> vision_text_config = VLEConfig.from_pretrained("vit-bert")
    >>> model = VLEModel.from_pretrained("vit-bert", config=vision_text_config)
    ```"""

    model_type = "vle"
    is_composition = True

    def __init__(
        self,
        text_config: Union[PretrainedConfig, Dict],
        vision_config: Union[PretrainedConfig, Dict],
        num_token_types=2,
        hidden_size=768,
        num_hidden_layers=6,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        classifier_dropout=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # The text tower may be given either as an instantiated config or as a
        # plain dict carrying a `model_type` key.
        if not isinstance(text_config, PretrainedConfig):
            text_model_type = text_config.pop("model_type")
            text_config = AutoConfig.for_model(text_model_type, **text_config)
        self.text_config = text_config

        if not isinstance(vision_config, PretrainedConfig):
            vision_model_type = vision_config.pop("model_type")
            if vision_model_type == "clip":
                # A full CLIP config: keep only its vision tower.
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config).vision_config
            elif vision_model_type == "clip_vision_model":
                vision_config = CLIPVisionConfig(**vision_config)
            else:
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config)
            self.vision_config = vision_config
        else:
            vision_model_type = vision_config.model_type
            if vision_model_type == "clip":
                vision_config = vision_config.vision_config
            self.vision_config = vision_config

        # Co-attention (cross-modal fusion) encoder hyperparameters
        self.num_token_types = num_token_types
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout = classifier_dropout

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`] so that
        the nested text and vision configs are serialized as well.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
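
# ---------------------------------------------------------------------------
# Minimal usage sketch (an illustrative addition, not part of the original
# module). It shows that `VLEConfig` accepts either `PretrainedConfig`
# instances or plain dicts (carrying a `model_type` key) for
# `text_config`/`vision_config`, and that the nested configs survive a round
# trip through `to_dict`. It assumes only that `transformers` with BERT/ViT
# support is installed.
if __name__ == "__main__":
    from transformers import BertConfig, ViTConfig

    # Build a composite config from in-memory config objects.
    config = VLEConfig(text_config=BertConfig(), vision_config=ViTConfig())

    # Round-trip the nested configs through the dict form produced by
    # `to_dict`; the constructor re-hydrates them via `AutoConfig.for_model`.
    as_dict = config.to_dict()
    restored = VLEConfig(text_config=as_dict["text_config"], vision_config=as_dict["vision_config"])
    assert restored.text_config.model_type == "bert"
    assert restored.vision_config.model_type == "vit"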