File size: 6,266 Bytes
4974490 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
from transformers import PretrainedConfig
from transformers import logging
from transformers import CONFIG_MAPPING
logger = logging.get_logger(__name__)
class XGenMMVisionEncoderConfig(PretrainedConfig):
    """Configuration that records which vision encoder to use.

    Args:
        model_name: Value stored on the instance as ``model_name``; defaults to
            ``'google/siglip-so400m-patch14-384'`` (presumably a model hub
            identifier -- confirm against the code that consumes it).
        **kwargs: Passed through unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "xgenmm_vision_encoder"

    def __init__(
        self,
        model_name: str = 'google/siglip-so400m-patch14-384',
        **kwargs,
    ):
        # The field is assigned first; the base initializer then consumes the
        # remaining keyword arguments.
        self.model_name = model_name
        super().__init__(**kwargs)
class XGenMMVisionTokenizerConfig(PretrainedConfig):
    """Configuration values for the vision tokenizer.

    Every argument is stored verbatim on the instance under the same name.

    Args:
        vis_feature_dim: Defaults to 1152 (presumably the width of the vision
            features -- confirm against the model code).
        lang_embedding_dim: Defaults to 3072 (presumably the language-model
            embedding width -- confirm against the model code).
        num_vis_tokens: Defaults to 128.
        image_aspect_ratio: Defaults to ``'none'``.
        **kwargs: Passed through unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "xgenmm_vision_tokenizer"

    def __init__(
        self,
        vis_feature_dim: int = 1152,
        lang_embedding_dim: int = 3072,
        num_vis_tokens: int = 128,
        image_aspect_ratio: str = 'none',
        **kwargs,
    ):
        # Own fields are assigned first; the base initializer then consumes
        # the remaining keyword arguments.
        self.vis_feature_dim = vis_feature_dim
        self.lang_embedding_dim = lang_embedding_dim
        self.num_vis_tokens = num_vis_tokens
        self.image_aspect_ratio = image_aspect_ratio
        super().__init__(**kwargs)
def _default_text_config() -> dict:
    """Return a fresh dict with the default text-config values (``model_type`` ``'phi3'``)."""
    return {
        'initial_tokenizer_len': 32012,
        # NOTE(review): this literal used to list `pad_token_id` twice (32011
        # here, 32000 further down). Python keeps the last duplicate, so 32000
        # was the value actually in effect; it is kept to preserve behavior.
        # Confirm which value is intended.
        'pad_token_id': 32000,
        'bos_token_id': 1,
        'eos_token_id': 32000,
        'vocab_size': 32064,
        'hidden_size': 3072,
        'intermediate_size': 8192,
        'num_hidden_layers': 32,
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'resid_pdrop': 0.0,
        'embd_pdrop': 0.0,
        'attention_dropout': 0.0,
        'hidden_act': 'silu',
        'max_position_embeddings': 4096,
        'original_max_position_embeddings': 4096,
        'initializer_range': 0.02,
        'rms_norm_eps': 1e-05,
        'use_cache': True,
        'rope_theta': 10000.0,
        'rope_scaling': None,
        'sliding_window': 2047,
        'return_dict': True,
        'output_hidden_states': False,
        'output_attentions': False,
        'torchscript': False,
        'torch_dtype': 'bfloat16',
        'use_bfloat16': False,
        'tf_legacy_loss': False,
        'pruned_heads': {},
        'tie_word_embeddings': False,
        'chunk_size_feed_forward': 0,
        'is_encoder_decoder': False,
        'is_decoder': False,
        'cross_attention_hidden_size': None,
        'add_cross_attention': False,
        'tie_encoder_decoder': False,
        'max_length': 20,
        'min_length': 0,
        'do_sample': False,
        'early_stopping': False,
        'num_beams': 1,
        'num_beam_groups': 1,
        'diversity_penalty': 0.0,
        'temperature': 1.0,
        'top_k': 50,
        'top_p': 1.0,
        'typical_p': 1.0,
        'repetition_penalty': 1.0,
        'length_penalty': 1.0,
        'no_repeat_ngram_size': 0,
        'encoder_no_repeat_ngram_size': 0,
        'bad_words_ids': None,
        'num_return_sequences': 1,
        'output_scores': False,
        'return_dict_in_generate': False,
        'forced_bos_token_id': None,
        'forced_eos_token_id': None,
        'remove_invalid_values': False,
        'exponential_decay_length_penalty': None,
        'suppress_tokens': None,
        'begin_suppress_tokens': None,
        'finetuning_task': None,
        'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},
        'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
        'tokenizer_class': None,
        'prefix': None,
        'sep_token_id': None,
        'decoder_start_token_id': None,
        'task_specific_params': None,
        'problem_type': None,
        'model_type': 'phi3',
    }


def _as_config_kwargs(config):
    """Return *config* as keyword arguments, converting config objects via ``to_dict()``."""
    if isinstance(config, PretrainedConfig):
        return config.to_dict()
    return config


class XGenMMConfig(PretrainedConfig):
    """Composite configuration bundling vision-encoder, vision-tokenizer and text configs.

    Args:
        vision_encoder_config: Keyword arguments for ``XGenMMVisionEncoderConfig``,
            given as a dict or as a ``PretrainedConfig`` instance. ``None`` selects
            ``{'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}``.
        vision_tokenizer_config: Keyword arguments for
            ``XGenMMVisionTokenizerConfig`` (dict or ``PretrainedConfig``).
            ``None`` selects an empty dict, i.e. that class's own defaults.
        text_config: Keyword arguments (dict or ``PretrainedConfig``) for the
            config class found in ``CONFIG_MAPPING`` under the ``"model_type"``
            entry, falling back to ``"phi3"`` when that entry is absent.
            ``None`` selects the built-in default values.
        **kwargs: Passed through unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If the constructed text config's ``to_dict()`` lacks
            ``initial_tokenizer_len`` or ``pad_token_id``.
    """

    model_type = "xgenmm"

    def __init__(self,
                 vision_encoder_config: dict = None,
                 vision_tokenizer_config: dict = None,
                 text_config: dict = None,
                 **kwargs):
        if vision_encoder_config is None:
            vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
            logger.info("vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values.")

        if vision_tokenizer_config is None:
            vision_tokenizer_config = {}
            logger.info("vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values.")

        if text_config is None:
            text_config = _default_text_config()
            logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")

        # Accept already-built config objects as well as plain dicts, matching
        # what `from_vision_encoder_vision_tokenizer_text_configs` does by hand.
        vision_encoder_config = _as_config_kwargs(vision_encoder_config)
        vision_tokenizer_config = _as_config_kwargs(vision_tokenizer_config)
        text_config = _as_config_kwargs(text_config)

        self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config)
        self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(**vision_tokenizer_config)

        text_model_type = text_config.get("model_type", "phi3")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # Serialize once; the membership checks below do not need a fresh copy
        # per key.
        text_config_dict = self.text_config.to_dict()
        for key in ('initial_tokenizer_len', 'pad_token_id'):
            if key not in text_config_dict:
                raise ValueError(f"The key `{key}` is missing in the text_config.")

        super().__init__(**kwargs)

    @classmethod
    def from_vision_encoder_vision_tokenizer_text_configs(
            cls,
            vision_encoder_config: XGenMMVisionEncoderConfig,
            vision_tokenizer_config: XGenMMVisionTokenizerConfig,
            text_config: PretrainedConfig,
            **kwargs):
        """Build an instance from three config objects.

        Each config is serialized with ``to_dict()`` and passed to the regular
        constructor; ``**kwargs`` are forwarded as well.

        Returns:
            A new instance of ``cls``.
        """
        return cls(
            vision_encoder_config=vision_encoder_config.to_dict(),
            vision_tokenizer_config=vision_tokenizer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )
|