# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" VLE model configuration""" | |
import copy | |
from transformers.configuration_utils import PretrainedConfig | |
from transformers.utils import logging | |
from transformers.models.auto.configuration_auto import AutoConfig | |
from transformers.models.clip.configuration_clip import CLIPVisionConfig | |
from typing import Union, Dict | |
logger = logging.get_logger(__name__) | |

class VLEConfig(PretrainedConfig):
    r"""
    [`VLEConfig`] is the configuration class to store the configuration of a [`VLEModel`]. It is used to instantiate
    a [`VLEModel`] according to the specified arguments, defining the text model and vision model configs.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict` or `PretrainedConfig`):
            Dictionary of configuration options that defines the text model config.
        vision_config (`dict` or `PretrainedConfig`):
            Dictionary of configuration options that defines the vision model config.
        num_token_types (`int`, *optional*, defaults to 2):
            Number of token types used to distinguish text and image embeddings in the cross-modal module.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the co-attention layers.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of co-attention layers.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each co-attention layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the feed-forward layer in the co-attention module.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function used in the co-attention module.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the co-attention module.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        classifier_dropout (`float`, *optional*):
            The dropout ratio for the classification head.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Examples:

    ```python
    >>> from transformers import ViTConfig, BertConfig
    >>> from configuration_vle import VLEConfig
    >>> from modeling_vle import VLEModel

    >>> # Initializing BERT and ViT configurations
    >>> config_vision = ViTConfig()
    >>> config_text = BertConfig()

    >>> config = VLEConfig(text_config=config_text, vision_config=config_vision)

    >>> # Initializing a BERT and ViT model (with random weights)
    >>> model = VLEModel(config=config)

    >>> # Accessing the model configuration
    >>> config_vision = model.config.vision_config
    >>> config_text = model.config.text_config

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("vit-bert")

    >>> # Loading model and config from pretrained folder
    >>> vision_text_config = VLEConfig.from_pretrained("vit-bert")
    >>> model = VLEModel.from_pretrained("vit-bert", config=vision_text_config)
    ```"""

    model_type = "vle"
    is_composition = True

    def __init__(
        self,
        text_config: Union[PretrainedConfig, Dict],
        vision_config: Union[PretrainedConfig, Dict],
        num_token_types=2,
        hidden_size=768,
        num_hidden_layers=6,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        classifier_dropout=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Text model: a plain dict must carry a `model_type` key so the matching
        # config class can be resolved through `AutoConfig`.
        if not isinstance(text_config, PretrainedConfig):
            text_model_type = text_config.pop("model_type")
            text_config = AutoConfig.for_model(text_model_type, **text_config)
        self.text_config = text_config

        # Vision model: a full CLIP config is narrowed down to its vision part;
        # a standalone `clip_vision_model` dict maps directly to `CLIPVisionConfig`.
        if not isinstance(vision_config, PretrainedConfig):
            vision_model_type = vision_config.pop("model_type")
            if vision_model_type == "clip":
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config).vision_config
            elif vision_model_type == "clip_vision_model":
                vision_config = CLIPVisionConfig(**vision_config)
            else:
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config)
            self.vision_config = vision_config
        else:
            vision_model_type = vision_config.model_type
            if vision_model_type == "clip":
                vision_config = vision_config.vision_config
            self.vision_config = vision_config
        # co-attention module hyper-parameters
        self.num_token_types = num_token_types
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout = classifier_dropout
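
    # NOTE: a convenience constructor in the spirit of transformers'
    # `VisionTextDualEncoderConfig.from_vision_text_configs`. This is a minimal
    # sketch assuming both sub-configs serialize cleanly via `to_dict()`; it is
    # not required by `__init__`, which also accepts config objects or dicts.
    @classmethod
    def from_vision_text_configs(cls, vision_config: PretrainedConfig, text_config: PretrainedConfig, **kwargs):
        r"""
        Instantiate a [`VLEConfig`] from a vision model configuration and a text model configuration.
        """
        return cls(vision_config=vision_config.to_dict(), text_config=text_config.to_dict(), **kwargs)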

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
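

# A minimal usage sketch of the two construction paths handled by `__init__`
# (plain dicts vs. `PretrainedConfig` objects) and of the `to_dict()` round
# trip. It assumes `transformers` is installed; the sub-config values below are
# illustrative defaults only.
if __name__ == "__main__":
    from transformers import BertConfig
    from transformers.models.clip.configuration_clip import CLIPConfig

    # Path 1: config objects. A full CLIPConfig is narrowed to its vision part.
    config = VLEConfig(text_config=BertConfig(), vision_config=CLIPConfig())
    print(type(config.vision_config).__name__)  # -> CLIPVisionConfig

    # Path 2: plain dicts carrying a `model_type` key.
    config = VLEConfig(
        text_config={"model_type": "bert"},
        vision_config={"model_type": "clip_vision_model"},
    )
    serialized = config.to_dict()
    assert serialized["model_type"] == "vle"
    assert serialized["vision_config"]["model_type"] == "clip_vision_model"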