from transformers import PretrainedConfig, PreTrainedModel
import json
class Idefics2ConnectorConfig(PretrainedConfig):
    r"""
    Configuration for the Idefics2 perceiver-resampler connector.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vision_hidden_size (`int`, *optional*, defaults to 1152):
            Hidden size of the vision-encoder features fed into the connector.
        hidden_size (`int`, *optional*, defaults to 4096):
            Hidden size of the language model the connector projects into.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the perceiver block.
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 3):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the RMS-normalization layers.
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key-value heads in the perceiver attention block.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimensionality of the MLP intermediate layer.
        integrate_sub_images (`bool`, *optional*):
            Stored as-is on the config; presumably controls whether sub-image
            features are merged before resampling — confirm against the model code.
        num_sub_images (`int`, *optional*):
            Stored as-is on the config; presumably the number of sub-image crops
            per image — confirm against the model code.
    """
    _auto_class = "AutoConfig"
    model_type = "Idefics2ConnectorConfig"

    # Keys from_pretrained() is allowed to forward to __init__; anything else
    # found in the JSON file is ignored, anything missing falls back to the
    # constructor defaults.
    _INIT_KEYS = (
        "vision_hidden_size",
        "hidden_size",
        "hidden_act",
        "resampler_n_latents",
        "resampler_depth",
        "rms_norm_eps",
        "resampler_n_heads",
        "resampler_head_dim",
        "num_key_value_heads",
        "attention_dropout",
        "intermediate_size",
        "integrate_sub_images",
        "num_sub_images",
    )

    def __init__(
        self,
        vision_hidden_size=1152,
        hidden_size=4096,
        hidden_act="silu",
        resampler_n_latents=64,
        resampler_depth=3,
        rms_norm_eps=1e-05,
        resampler_n_heads=16,
        resampler_head_dim=96,
        num_key_value_heads=4,
        attention_dropout=0.0,
        intermediate_size=14336,
        integrate_sub_images=None,
        num_sub_images=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vision_hidden_size = vision_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.rms_norm_eps = rms_norm_eps
        self.resampler_n_heads = resampler_n_heads
        self.num_key_value_heads = num_key_value_heads
        self.resampler_head_dim = resampler_head_dim
        self.attention_dropout = attention_dropout
        self.intermediate_size = intermediate_size
        self.integrate_sub_images = integrate_sub_images
        self.num_sub_images = num_sub_images
        # Grouped-query attention requires n_kv_heads <= n_heads.
        if self.num_key_value_heads > self.resampler_n_heads:
            raise ValueError(
                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
                f" resampler_n_heads={self.resampler_n_heads}"
            )

    @classmethod
    def from_pretrained(cls, config_path, **kwargs) -> "PretrainedConfig":
        """Load a connector config from a local JSON file.

        NOTE: unlike the base ``PretrainedConfig.from_pretrained``, ``config_path``
        here must be a path to a JSON file, not a model repo or directory.

        Args:
            config_path: Path to a JSON file containing the config fields.
            **kwargs: Overrides applied on top of the values read from the file.

        Returns:
            An instance of ``cls`` (supports subclassing, unlike hard-coding
            the class name).

        Raises:
            OSError: If ``config_path`` cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        # Forward every recognized key present in the file (the original
        # implementation silently dropped resampler_n_heads, resampler_head_dim,
        # num_key_value_heads and attention_dropout, and crashed with KeyError
        # on files missing any key). Missing keys use the __init__ defaults.
        init_kwargs = {k: config_dict[k] for k in cls._INIT_KEYS if k in config_dict}
        # Explicit caller overrides win over file contents.
        init_kwargs.update(kwargs)
        return cls(**init_kwargs)