from typing import Dict, List, Optional, Tuple, Union import torch from transformers import ( AutoTokenizer, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel ) from transformers.utils import logging from transformers.modeling_outputs import CausalLMOutputWithPast from .build_vit import build_projector, create_clip_vit from .rantselector import build_adapter logging.set_verbosity_info() # Turn on this for debug mode logger = logging.get_logger(__name__) DTYPE_MAPPING = { "float16": torch.float16, "fp16": torch.float16, "bf16": torch.bfloat16, "bfloat16": torch.bfloat16, "float": torch.float32, "fp32": torch.float32, } # image level IMAGE_TOKEN_INDEX = -200 DEFAULT_IMAGE_TOKEN = "" # video level VIDEO_TOKEN_INDEX = -201 DEFAULT_VIDEO_TOKEN = "