from transformers import PretrainedConfig, PreTrainedModel
import json

class Idefics2ConnectorConfig(PretrainedConfig):
    r"""
    Configuration for the Idefics2 connector (perceiver resampler).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_hidden_size (`int`, *optional*, defaults to 1152):
            Hidden size of the vision encoder features fed into the connector.
        hidden_size (`int`, *optional*, defaults to 4096):
            Hidden size the connector projects to (the language model's hidden size).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the perceiver block.
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 3):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the RMS normalization layers.
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key-value heads in the perceiver attention block.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimensionality of the MLP intermediate (feed-forward) layer.
        integrate_sub_images (`bool`, *optional*):
            Whether sub-image features are integrated by the connector. TODO confirm semantics with caller.
        num_sub_images (`int`, *optional*):
            Number of sub-images per sample. TODO confirm semantics with caller.

    Raises:
        ValueError: If ``num_key_value_heads`` exceeds ``resampler_n_heads``.
    """
    _auto_class = 'AutoConfig'
    model_type = "Idefics2ConnectorConfig"

    def __init__(
        self,
        vision_hidden_size=1152,
        hidden_size=4096,
        hidden_act="silu",
        resampler_n_latents=64,
        resampler_depth=3,
        rms_norm_eps=1e-05,
        resampler_n_heads=16,
        resampler_head_dim=96,
        num_key_value_heads=4,
        attention_dropout=0.0,
        intermediate_size=14336,
        integrate_sub_images=None,
        num_sub_images=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vision_hidden_size = vision_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.rms_norm_eps = rms_norm_eps
        self.resampler_n_heads = resampler_n_heads
        self.num_key_value_heads = num_key_value_heads
        self.resampler_head_dim = resampler_head_dim
        self.attention_dropout = attention_dropout
        self.intermediate_size = intermediate_size
        self.integrate_sub_images = integrate_sub_images
        self.num_sub_images = num_sub_images

        # Grouped-query attention requires at most one KV head per query head.
        if self.num_key_value_heads > self.resampler_n_heads:
            raise ValueError(
                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
                f" resampler_n_heads={self.resampler_n_heads}"
            )

    @classmethod
    def from_pretrained(cls, config_path, **kwargs) -> "PretrainedConfig":
        """Build a connector config from a JSON file on disk.

        Args:
            config_path: Path to a JSON file containing configuration keys.
                Any key missing from the file falls back to the `__init__` default.
            **kwargs: Extra keyword arguments forwarded to the constructor
                (and ultimately to `PretrainedConfig.__init__`).

        Returns:
            An instance of `cls` populated from the JSON file.
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)

        # Use .get() with the __init__ defaults so a partial JSON file does not
        # raise KeyError, and read *every* supported key (the previous version
        # hard-coded hidden_act and silently dropped the resampler head/KV/
        # dropout settings present in the file).
        return cls(
            vision_hidden_size=config_dict.get("vision_hidden_size", 1152),
            hidden_size=config_dict.get("hidden_size", 4096),
            hidden_act=config_dict.get("hidden_act", "silu"),
            resampler_n_latents=config_dict.get("resampler_n_latents", 64),
            resampler_depth=config_dict.get("resampler_depth", 3),
            rms_norm_eps=config_dict.get("rms_norm_eps", 1e-05),
            resampler_n_heads=config_dict.get("resampler_n_heads", 16),
            resampler_head_dim=config_dict.get("resampler_head_dim", 96),
            num_key_value_heads=config_dict.get("num_key_value_heads", 4),
            attention_dropout=config_dict.get("attention_dropout", 0.0),
            intermediate_size=config_dict.get("intermediate_size", 14336),
            integrate_sub_images=config_dict.get("integrate_sub_images"),
            num_sub_images=config_dict.get("num_sub_images"),
            **kwargs,
        )