Spaces:
Sleeping
Sleeping
| import logging | |
| from transformers.configuration_utils import PretrainedConfig | |
| from transformers.models.llama.configuration_llama import LlamaConfig | |
| from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD | |
| logger = logging.getLogger("kanana-1.5-v") | |
class KananaVVisionConfig(PretrainedConfig):
    """Configuration holder for the Kanana-1.5-V visual encoder.

    Every keyword listed in ``__init__`` is stored on the instance under the
    same name. How each value is interpreted is decided by the model code that
    consumes this config, which is not part of this module.

    Args:
        depth, embed_dim, mlp_ratio, num_heads, in_chans, hidden_size:
            Encoder hyperparameters, stored unchanged.
        patch_size, spatial_merge_size, spatial_patch_size, temporal_patch_size:
            Patch/merge hyperparameters, stored unchanged.
        initializer_range: Stored unchanged.
        image_size: Stored unchanged; defaults to the string ``"dynamic"``.
        image_mean: Defaults to ``OPENAI_CLIP_MEAN``. A list value is
            shallow-copied before being stored.
        image_std: Defaults to ``OPENAI_CLIP_STD``. A list value is
            shallow-copied before being stored.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "kanana-1.5-v-visual-encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        mlp_ratio=4,
        num_heads=16,
        in_chans=3,
        hidden_size=1280,
        patch_size=14,
        spatial_merge_size=2,
        spatial_patch_size=14,
        temporal_patch_size=2,
        initializer_range=0.02,
        image_size="dynamic",
        image_mean=OPENAI_CLIP_MEAN,
        image_std=OPENAI_CLIP_STD,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_chans = in_chans
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size
        self.initializer_range = initializer_range
        self.image_size = image_size

        # Copy list inputs so the instance never aliases the caller's list or
        # the shared module-level OPENAI_CLIP_* defaults: editing one config's
        # statistics in place must not leak into other configs. Non-list
        # values (tuples, scalars, None) are stored exactly as given.
        self.image_mean = list(image_mean) if isinstance(image_mean, list) else image_mean
        self.image_std = list(image_std) if isinstance(image_std, list) else image_std
class KananaVVisualProjectorConfig(PretrainedConfig):
    """Configuration holder for the Kanana-1.5-V visual projector.

    Each constructor keyword is stored unchanged on the instance under the
    same name; remaining keywords are forwarded to ``PretrainedConfig``.
    The defaults select the ``"dynamic-c-abs"`` projector type.
    """

    model_type = "kanana-1.5-v-visual_projector"
    base_config_key = "projector_config"

    def __init__(
        self,
        depth=2,
        encoder_hidden_size=1280,
        feature_layer_index=-1,
        hidden_size=1024,
        merge_size=2,
        mlp_depth=2,
        num_eos_tokens=0,
        output_hidden_size=2048,
        pos_emb=True,
        pos_emb_size=576,
        prenorm=False,
        projector_type="dynamic-c-abs",
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        # Sizes and depths.
        self.depth = depth
        self.encoder_hidden_size = encoder_hidden_size
        self.feature_layer_index = feature_layer_index
        self.hidden_size = hidden_size
        self.merge_size = merge_size
        self.mlp_depth = mlp_depth
        self.num_eos_tokens = num_eos_tokens
        self.output_hidden_size = output_hidden_size

        # Switches and the projector variant.
        self.pos_emb = pos_emb
        self.pos_emb_size = pos_emb_size
        self.prenorm = prenorm
        self.projector_type = projector_type
class KananaLanguageConfig(LlamaConfig):
    """Language-model configuration: a ``LlamaConfig`` with Kanana identifiers.

    Adds no fields of its own. It sets ``model_type`` and ``base_config_key``
    and accepts keyword arguments only, all of which go to ``LlamaConfig``.
    """

    model_type = "kanana-1.5-3b-instruct"
    base_config_key = "text_config"

    def __init__(self, **kwargs):
        # Pure pass-through: every option is handled by the parent class.
        super().__init__(**kwargs)
class KananaVConfig(PretrainedConfig):
    """Top-level Kanana-1.5-V configuration composed of three sub-configs.

    Args:
        vision_config: Keyword arguments (as a mapping) for
            ``KananaVVisionConfig``, an already-built ``KananaVVisionConfig``,
            or ``None``/omitted for that class's defaults.
        projector_config: Same, for ``KananaVVisualProjectorConfig``.
        text_config: Same, for ``KananaLanguageConfig``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Attributes:
        vision_config: The ``KananaVVisionConfig`` instance.
        projector_config: The ``KananaVVisualProjectorConfig`` instance.
        text_config: The ``KananaLanguageConfig`` instance.
    """

    model_type = "kanana-1.5-v"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        projector_config=None,
        text_config=None,
        **kwargs,
    ):
        # ``None`` replaces the former ``{}`` defaults: default values are
        # evaluated once at definition time, so a dict default would be one
        # object shared by every call.
        super().__init__(**kwargs)

        # Vision config
        self.vision_config = self._build_sub_config(KananaVVisionConfig, vision_config)

        # Visual projector config
        self.projector_config = self._build_sub_config(
            KananaVVisualProjectorConfig, projector_config
        )

        # Language model config
        self.text_config = self._build_sub_config(KananaLanguageConfig, text_config)

    @staticmethod
    def _build_sub_config(config_cls, value):
        """Return ``value`` as an instance of ``config_cls``.

        ``None`` yields a default-constructed config, an existing instance is
        used as-is, and anything else is unpacked as keyword arguments.
        """
        if value is None:
            return config_cls()
        if isinstance(value, config_cls):
            return value
        return config_cls(**value)

    # NOTE(review): the two accessors below are plain methods in the text
    # under review, so they must be *called* (``cfg.hidden_size()``). If callers
    # expect attribute access, a ``@property`` decorator may have been lost in
    # extraction; confirm against the upstream file.
    def num_visual_tokens(self):
        """Return the literal string ``"dynamic"``."""
        return "dynamic"

    def hidden_size(self):
        """Return ``hidden_size`` of the language-model sub-config."""
        return self.text_config.hidden_size