|
from transformers import PretrainedConfig, PreTrainedModel |
|
import json |
|
|
|
class Idefics2ConnectorConfig(PretrainedConfig):
    r"""
    Configuration for the Idefics2 connector (Perceiver Resampler) that maps
    vision-encoder features into the language model's hidden space.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the vision-encoder hidden states fed into the connector.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the language-model hidden states the connector projects to.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the perceiver block.
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 3):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the RMS-normalization layers.
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key-value heads in the perceiver attention block.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimensionality of the MLP intermediate layer.
        integrate_sub_images (`bool`, *optional*):
            Whether sub-image features are integrated into the main image
            representation. NOTE(review): semantics inferred from the name —
            confirm against the modeling code.
        num_sub_images (`int`, *optional*):
            Number of sub-images per input image. NOTE(review): confirm
            against the modeling code.

    Raises:
        ValueError: If ``num_key_value_heads`` exceeds ``resampler_n_heads``.
    """

    _auto_class = 'AutoConfig'
    model_type = "Idefics2ConnectorConfig"

    # Constructor keyword names recognized when loading from a JSON file.
    _JSON_KEYS = (
        "vision_hidden_size",
        "hidden_size",
        "hidden_act",
        "resampler_n_latents",
        "resampler_depth",
        "rms_norm_eps",
        "resampler_n_heads",
        "resampler_head_dim",
        "num_key_value_heads",
        "attention_dropout",
        "intermediate_size",
        "integrate_sub_images",
        "num_sub_images",
    )

    def __init__(
        self,
        vision_hidden_size=1152,
        hidden_size=4096,
        hidden_act="silu",
        resampler_n_latents=64,
        resampler_depth=3,
        rms_norm_eps=1e-05,
        resampler_n_heads=16,
        resampler_head_dim=96,
        num_key_value_heads=4,
        attention_dropout=0.0,
        intermediate_size=14336,
        integrate_sub_images=None,
        num_sub_images=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vision_hidden_size = vision_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.rms_norm_eps = rms_norm_eps
        self.resampler_n_heads = resampler_n_heads
        self.num_key_value_heads = num_key_value_heads
        self.resampler_head_dim = resampler_head_dim
        self.attention_dropout = attention_dropout
        self.intermediate_size = intermediate_size
        self.integrate_sub_images = integrate_sub_images
        self.num_sub_images = num_sub_images

        # Grouped-query attention requires at most as many KV heads as query heads.
        if self.num_key_value_heads > self.resampler_n_heads:
            raise ValueError(
                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
                f" resampler_n_heads={self.resampler_n_heads}"
            )

    @classmethod
    def from_pretrained(cls, config_path, **kwargs) -> "PretrainedConfig":
        """Load a connector config from a local JSON file.

        Every recognized key present in the JSON is forwarded to the
        constructor (the previous implementation silently discarded
        ``hidden_act``, ``resampler_n_heads``, ``resampler_head_dim``,
        ``num_key_value_heads`` and ``attention_dropout`` even when they were
        present in the file). Keys absent from the JSON fall back to the
        constructor defaults instead of raising ``KeyError``. Explicit
        ``**kwargs`` take precedence over values read from the file.

        Args:
            config_path: Path to a JSON file holding the configuration.

        Returns:
            An instance of ``cls`` (supports subclassing, unlike the previous
            hard-coded ``Idefics2ConnectorConfig(...)`` construction).
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)

        init_kwargs = {k: config_dict[k] for k in cls._JSON_KEYS if k in config_dict}
        # Caller-supplied overrides win over the file contents.
        init_kwargs.update(kwargs)
        return cls(**init_kwargs)
|
|