Abhaykoul committed
Commit 31a2295
1 Parent(s): ee391eb

Update configuration_llava.py

Files changed (1):
  configuration_llava.py +104 -14
configuration_llava.py CHANGED
@@ -1,18 +1,107 @@
-# coding=utf-8
-
-from transformers.configuration_utils import PretrainedConfig
-from open_clip import get_model_config
-from configuration_phi import PhiConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers import SiglipVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PhiConfig(PretrainedConfig):
+    model_type = "phi"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=51200,
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=24,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attention_dropout=0.0,
+        hidden_act="gelu_new",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=0.5,
+        qk_layernorm=False,
+        bos_token_id=1,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self.qk_layernorm = qk_layernorm
+        self._rope_scaling_validation()
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, float)
+            or rope_scaling_factor <= 1.0
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}"
+            )
 
 
 class LlavaConfig(PretrainedConfig):
-    model_type = "llava"
+    model_type = "HelpingAI-V"
     is_composition = False
 
     def __init__(
         self,
         text_config=None,
-        vision_tower_name="ViT-SO400M-14-SigLIP-384",
+        vision_config=None,
         ignore_index=-100,
         image_token_index=50297,
         projector_hidden_act="gelu",
@@ -26,16 +115,17 @@ class LlavaConfig(PretrainedConfig):
         self.projector_tokens_num = projector_tokens_num
         self.vocab_size = vocab_size
 
-        self.vision_tower_name = vision_tower_name
-        vision_config = get_model_config(vision_tower_name)
-        self.vision_embed_dim = vision_config["embed_dim"]
-
-        self.vocab_size = self.vocab_size
-
         self.text_config = text_config
         if isinstance(self.text_config, dict):
-            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            text_config["model_type"] = (
+                text_config["model_type"] if "model_type" in text_config else "phi"
+            )
             self.text_config = PhiConfig(**text_config)
             self.vocab_size = self.text_config.vocab_size
 
-        super().__init__(**kwargs)
+        self.vision_config = vision_config
+        if isinstance(self.vision_config, dict):
+            self.vision_config = SiglipVisionConfig(**vision_config)
+        self.vision_embed_dim = self.vision_config.hidden_size
+
+        super().__init__(**kwargs)
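
For reference, a minimal usage sketch of the updated class. It assumes configuration_llava.py is importable as a module and that the parameters elided between the two hunks (e.g. projector_tokens_num, vocab_size) keep working defaults; the field values below are illustrative, not taken from the repository. Note that the new __init__ reads self.vision_config.hidden_size unconditionally, so vision_config must be supplied as a dict or SiglipVisionConfig; leaving it None would raise an AttributeError.

# Usage sketch for the updated LlavaConfig (illustrative values only).
from configuration_llava import LlavaConfig

config = LlavaConfig(
    # A dict here is coerced to PhiConfig; "model_type" now defaults to
    # "phi" instead of "llama".
    text_config={
        "vocab_size": 51200,
        "hidden_size": 2048,
        # Satisfies _rope_scaling_validation(): type must be "linear" or
        # "dynamic", and factor a float strictly greater than 1.0.
        "rope_scaling": {"type": "linear", "factor": 2.0},
    },
    # A dict here is coerced to SiglipVisionConfig; vision_embed_dim is now
    # read from its hidden_size rather than open_clip's get_model_config().
    vision_config={"hidden_size": 1152, "image_size": 384, "patch_size": 14},
)

print(config.text_config.model_type)  # phi
print(config.vision_embed_dim)        # 1152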