from typing import Optional

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch

from .phEYE import phEYE
from .wrapper_lm import phEYELMMixin
from .utils import extend_instance
from .encoder import Encoder


def create_model_and_transforms(
    clip_vision_encoder_path: str,
    lang_decoder_path: str,
    tokenizer_path: str,
    dtype,
    cross_attn_every_n_layers: int = 1,
    use_local_files: bool = False,
    decoder_layers_attr_name: str = None,
    freeze_lm_embeddings: bool = True,
    cache_dir: Optional[str] = None,
    level: int = 2,
    encoder_dtype: torch.dtype = None,
    decoder_dtype: torch.dtype = None,
    use_dropout: bool = False,
    **pheye_kwargs,
):
    """
    Initialize a phEYE model from a pretrained vision encoder and language decoder.
    Sets the tokenizer's pad token if missing and freezes the language model backbone.

    Args:
        clip_vision_encoder_path (str): path to pretrained CLIP vision model (e.g. "ViT-B-32")
        lang_decoder_path (str): path to pretrained language decoder
        tokenizer_path (str): path to pretrained tokenizer
        dtype: dtype passed to the phEYE model
        cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
        use_local_files (bool, optional): whether to use local files. Defaults to False.
        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Inferred from the model class if None.
        freeze_lm_embeddings (bool, optional): whether to keep the LM input embeddings frozen. If False, they are unfrozen for training.
        cache_dir (str, optional): path to cache directory for downloading HF weights.
        level (int, optional): resolution level passed to the vision Encoder. Defaults to 2.
        encoder_dtype (torch.dtype, optional): dtype passed to the vision Encoder.
        decoder_dtype (torch.dtype, optional): dtype used when instantiating the language decoder.
        use_dropout (bool, optional): whether the vision Encoder uses dropout. Defaults to False.
    Returns:
        phEYE: phEYE model built from the pretrained vision and language backbones
        Tokenizer: a tokenizer for the language model
    """
    vision_encoder = Encoder(
        clip_vision_encoder_path, level=level, dtype=encoder_dtype, use_dropout=use_dropout
    )

    text_tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        local_files_only=use_local_files,
        trust_remote_code=True,
        cache_dir=cache_dir,
    )
    if text_tokenizer.pad_token is None:
        text_tokenizer.pad_token = text_tokenizer.eos_token

    lang_config = AutoConfig.from_pretrained(lang_decoder_path)
    lang_encoder = AutoModelForCausalLM.from_config(
        lang_config,
        torch_dtype=decoder_dtype,
    )

    lang_encoder.config.decoder_start_token_id = None
    lang_encoder.config.pad_token_id = text_tokenizer.pad_token_id

    # convert LM to phEYELM
    extend_instance(lang_encoder, phEYELMMixin)

    if decoder_layers_attr_name is None:
        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)

    model = phEYE(
        vision_encoder,
        lang_encoder,
        vis_dim=vision_encoder.vision_model.config.hidden_size,
        cross_attn_every_n_layers=cross_attn_every_n_layers,
        dtype=dtype,
        **pheye_kwargs,
    )

    # Freeze all language model parameters
    model.lang_encoder.requires_grad_(False)
    assert sum(p.numel() for p in model.lang_encoder.parameters() if p.requires_grad) == 0

    # Unfreeze the cross-attention layers and, optionally, the LM input embeddings
    model.lang_encoder.cross_attn_layers.requires_grad_(True)
    if not freeze_lm_embeddings:
        model.lang_encoder.get_input_embeddings().requires_grad_(True)

    print(
        f"phEYE model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
    )

    return model, text_tokenizer


def _infer_decoder_layers_attr_name(model):
    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
        if k.lower() in model.__class__.__name__.lower():
            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]

    raise ValueError(
        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
    )


__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
    "opt": "model.decoder.layers",
    "gpt": "transformer.h",
    "gpt-j": "transformer.h",
    "pythia": "gpt_neox.layers",
    "llama": "model.layers",
    "gptneoxforcausallm": "gpt_neox.layers",
    "mpt": "transformer.blocks",
    "mosaicgpt": "transformer.blocks",
    "phi": "model.layers",
}
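

# Minimal usage sketch (not part of the library API): shows how the factory above is
# typically called. The checkpoint identifiers below are illustrative assumptions, not
# weights shipped with this repo; substitute the vision encoder and decoder you train with.
if __name__ == "__main__":
    model, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="openai/clip-vit-base-patch32",  # assumed HF CLIP checkpoint
        lang_decoder_path="microsoft/phi-1_5",                    # assumed decoder checkpoint
        tokenizer_path="microsoft/phi-1_5",                       # assumed tokenizer checkpoint
        dtype=torch.float16,
        cross_attn_every_n_layers=2,
        freeze_lm_embeddings=True,
    )
    # Quick sanity check: report model class and tokenizer vocabulary size.
    print(type(model).__name__, "initialized; vocab size:", len(tokenizer))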