WEBing committed on
Commit 4cbb2ad · 1 Parent(s): 388bd8b

merge eva_clip to vision_tower_builder

Files changed (45)
  1. eva_clip/model_configs/EVA02-CLIP-L-14-448.json → EVA02-CLIP-L-14-448.json +0 -0
  2. eva_clip/__init__.py +0 -11
  3. eva_clip/__pycache__/__init__.cpython-39.pyc +0 -0
  4. eva_clip/__pycache__/constants.cpython-39.pyc +0 -0
  5. eva_clip/__pycache__/eva_vit_model.cpython-39.pyc +0 -0
  6. eva_clip/__pycache__/factory.cpython-39.pyc +0 -0
  7. eva_clip/__pycache__/hf_configs.cpython-39.pyc +0 -0
  8. eva_clip/__pycache__/hf_model.cpython-39.pyc +0 -0
  9. eva_clip/__pycache__/loss.cpython-39.pyc +0 -0
  10. eva_clip/__pycache__/model.cpython-39.pyc +0 -0
  11. eva_clip/__pycache__/modified_resnet.cpython-39.pyc +0 -0
  12. eva_clip/__pycache__/openai.cpython-39.pyc +0 -0
  13. eva_clip/__pycache__/pretrained.cpython-39.pyc +0 -0
  14. eva_clip/__pycache__/rope.cpython-39.pyc +0 -0
  15. eva_clip/__pycache__/timm_model.cpython-39.pyc +0 -0
  16. eva_clip/__pycache__/tokenizer.cpython-39.pyc +0 -0
  17. eva_clip/__pycache__/transform.cpython-39.pyc +0 -0
  18. eva_clip/__pycache__/transformer.cpython-39.pyc +0 -0
  19. eva_clip/__pycache__/utils.cpython-39.pyc +0 -0
  20. eva_clip/bpe_simple_vocab_16e6.txt.gz +0 -3
  21. eva_clip/constants.py +0 -2
  22. eva_clip/factory.py +0 -459
  23. eva_clip/hf_configs.py +0 -57
  24. eva_clip/hf_model.py +0 -248
  25. eva_clip/loss.py +0 -138
  26. eva_clip/model.py +0 -439
  27. eva_clip/model_configs/EVA01-CLIP-B-16.json +0 -19
  28. eva_clip/model_configs/EVA01-CLIP-g-14-plus.json +0 -24
  29. eva_clip/model_configs/EVA01-CLIP-g-14.json +0 -24
  30. eva_clip/model_configs/EVA02-CLIP-B-16.json +0 -29
  31. eva_clip/model_configs/EVA02-CLIP-L-14-336.json +0 -29
  32. eva_clip/model_configs/EVA02-CLIP-L-14.json +0 -29
  33. eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json +0 -25
  34. eva_clip/model_configs/EVA02-CLIP-bigE-14.json +0 -25
  35. eva_clip/modified_resnet.py +0 -181
  36. eva_clip/openai.py +0 -144
  37. eva_clip/pretrained.py +0 -332
  38. eva_clip/rope.py +0 -137
  39. eva_clip/timm_model.py +0 -122
  40. eva_clip/tokenizer.py +0 -201
  41. eva_clip/transform.py +0 -103
  42. eva_clip/transformer.py +0 -737
  43. eva_clip/utils.py +0 -326
  44. modeling_kangaroo.py +10 -61
  45. eva_clip/eva_vit_model.py → vision_tower_builder.py +242 -6
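The commit collapses the eva_clip package into a single vision_tower_builder.py module, moves EVA02-CLIP-L-14-448.json to the repository root, and trims modeling_kangaroo.py accordingly. Below is a minimal sketch of how the relocated pieces would presumably fit together. It assumes vision_tower_builder.py still exports the EVAVisionTransformer class that eva_clip/eva_vit_model.py defined; the constructor arguments mirror the deleted _build_vision_tower() in eva_clip/model.py, but the exact public entry point added by this commit is not shown in the diff, so treat the import, file paths, and argument defaults as assumptions.

import json
from functools import partial

import torch
from torch import nn

# Assumed export: eva_clip/eva_vit_model.py defined EVAVisionTransformer and was
# renamed to vision_tower_builder.py, so the class should be importable from there.
from vision_tower_builder import EVAVisionTransformer

# The model config was moved from eva_clip/model_configs/ to the repository root.
with open("EVA02-CLIP-L-14-448.json", "r", encoding="utf8") as f:
    cfg = json.load(f)
v = cfg["vision_cfg"]

# Construction mirrors the deleted _build_vision_tower() in eva_clip/model.py;
# FusedLayerNorm is swapped for plain nn.LayerNorm here for simplicity.
vision_tower = EVAVisionTransformer(
    img_size=v["image_size"],
    patch_size=v["patch_size"],
    num_classes=cfg["embed_dim"],
    embed_dim=v["width"],
    depth=v["layers"],
    num_heads=v["width"] // v.get("head_width", 64),
    mlp_ratio=v.get("mlp_ratio", 4.0),
    qkv_bias=v.get("qkv_bias", True),
    drop_path_rate=v.get("drop_path_rate", 0.0),
    norm_layer=partial(nn.LayerNorm, eps=1e-6),
    xattn=v.get("xattn", False),
    rope=v.get("rope", False),
    postnorm=v.get("postnorm", False),
    pt_hw_seq_len=v.get("pt_hw_seq_len", 16),
    intp_freq=v.get("intp_freq", False),
    naiveswiglu=v.get("naiveswiglu", False),
    subln=v.get("subln", False),
)

# Dummy forward pass to sanity-check the wiring.
images = torch.randn(1, 3, v["image_size"], v["image_size"])
with torch.no_grad():
    features = vision_tower(images)
print(features.shape)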
eva_clip/model_configs/EVA02-CLIP-L-14-448.json → EVA02-CLIP-L-14-448.json RENAMED
File without changes
eva_clip/__init__.py DELETED
@@ -1,11 +0,0 @@
- from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
- from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer
- from .factory import list_models, add_model_config, get_model_config, load_checkpoint
- from .loss import ClipLoss
- from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\
-     convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
- from .openai import load_openai_model, list_openai_models
- from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\
-     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
- from .tokenizer import SimpleTokenizer, tokenize
- from .transform import image_transform
 
eva_clip/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (1.28 kB)
 
eva_clip/__pycache__/constants.cpython-39.pyc DELETED
Binary file (313 Bytes)
 
eva_clip/__pycache__/eva_vit_model.cpython-39.pyc DELETED
Binary file (15.8 kB)
 
eva_clip/__pycache__/factory.cpython-39.pyc DELETED
Binary file (11.2 kB)
 
eva_clip/__pycache__/hf_configs.cpython-39.pyc DELETED
Binary file (714 Bytes)
 
eva_clip/__pycache__/hf_model.cpython-39.pyc DELETED
Binary file (7.38 kB)
 
eva_clip/__pycache__/loss.cpython-39.pyc DELETED
Binary file (3.32 kB)
 
eva_clip/__pycache__/model.cpython-39.pyc DELETED
Binary file (13.2 kB)
 
eva_clip/__pycache__/modified_resnet.cpython-39.pyc DELETED
Binary file (6.33 kB)
 
eva_clip/__pycache__/openai.cpython-39.pyc DELETED
Binary file (4.79 kB)
 
eva_clip/__pycache__/pretrained.cpython-39.pyc DELETED
Binary file (9.01 kB)
 
eva_clip/__pycache__/rope.cpython-39.pyc DELETED
Binary file (5.25 kB)
 
eva_clip/__pycache__/timm_model.cpython-39.pyc DELETED
Binary file (3.94 kB)
 
eva_clip/__pycache__/tokenizer.cpython-39.pyc DELETED
Binary file (8.42 kB)
 
eva_clip/__pycache__/transform.cpython-39.pyc DELETED
Binary file (2.78 kB)
 
eva_clip/__pycache__/transformer.cpython-39.pyc DELETED
Binary file (20.7 kB)
 
eva_clip/__pycache__/utils.cpython-39.pyc DELETED
Binary file (9.61 kB)
 
eva_clip/bpe_simple_vocab_16e6.txt.gz DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
- size 1356917
 
eva_clip/constants.py DELETED
@@ -1,2 +0,0 @@
- OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
- OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
 
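For reference, the two deleted constants are the standard OpenAI CLIP normalization statistics. A minimal sketch of the equivalent image preprocessing follows, assuming the values are simply inlined wherever the transform is now built; the 448-pixel resolution is taken from the retained EVA02-CLIP-L-14-448.json and is an assumption about the intended input size.

from torchvision import transforms

OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

# Roughly what the deleted eva_clip.transform.image_transform(..., is_train=False)
# produced: resize, center-crop, tensor conversion, and CLIP-style normalization.
preprocess = transforms.Compose([
    transforms.Resize(448, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
])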
eva_clip/factory.py DELETED
@@ -1,459 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- import pathlib
5
- import re
6
- from copy import deepcopy
7
- from pathlib import Path
8
- from typing import Optional, Tuple, Union, Dict, Any
9
- import torch
10
-
11
- from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
12
- from .model import CLIP, CustomCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
13
- get_cast_dtype
14
- from .openai import load_openai_model
15
- from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model
16
- from .transform import image_transform
17
- from .tokenizer import HFTokenizer, tokenize
18
- from .utils import resize_clip_pos_embed, resize_evaclip_pos_embed, resize_visual_pos_embed, resize_eva_pos_embed
19
-
20
-
21
- _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
22
- _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs
23
-
24
-
25
- def _natural_key(string_):
26
- return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
27
-
28
-
29
- def _rescan_model_configs():
30
- global _MODEL_CONFIGS
31
-
32
- config_ext = ('.json',)
33
- config_files = []
34
- for config_path in _MODEL_CONFIG_PATHS:
35
- if config_path.is_file() and config_path.suffix in config_ext:
36
- config_files.append(config_path)
37
- elif config_path.is_dir():
38
- for ext in config_ext:
39
- config_files.extend(config_path.glob(f'*{ext}'))
40
-
41
- for cf in config_files:
42
- with open(cf, "r", encoding="utf8") as f:
43
- model_cfg = json.load(f)
44
- if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
45
- _MODEL_CONFIGS[cf.stem] = model_cfg
46
-
47
- _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0])))
48
-
49
-
50
- _rescan_model_configs() # initial populate of model config registry
51
-
52
-
53
- def list_models():
54
- """ enumerate available model architectures based on config files """
55
- return list(_MODEL_CONFIGS.keys())
56
-
57
-
58
- def add_model_config(path):
59
- """ add model config path or file and update registry """
60
- if not isinstance(path, Path):
61
- path = Path(path)
62
- _MODEL_CONFIG_PATHS.append(path)
63
- _rescan_model_configs()
64
-
65
-
66
- def get_model_config(model_name):
67
- if model_name in _MODEL_CONFIGS:
68
- return deepcopy(_MODEL_CONFIGS[model_name])
69
- else:
70
- return None
71
-
72
-
73
- def get_tokenizer(model_name):
74
- config = get_model_config(model_name)
75
- tokenizer = HFTokenizer(config['text_cfg']['hf_tokenizer_name']) if 'hf_tokenizer_name' in config['text_cfg'] else tokenize
76
- return tokenizer
77
-
78
-
79
- # loading openai CLIP weights when is_openai=True for training
80
- def load_state_dict(checkpoint_path: str, map_location: str='cpu', model_key: str='model|module|state_dict', is_openai: bool=False, skip_list: list=[]):
81
- if is_openai:
82
- model = torch.jit.load(checkpoint_path, map_location="cpu").eval()
83
- state_dict = model.state_dict()
84
- for key in ["input_resolution", "context_length", "vocab_size"]:
85
- state_dict.pop(key, None)
86
- else:
87
- checkpoint = torch.load(checkpoint_path, map_location=map_location)
88
- for mk in model_key.split('|'):
89
- if isinstance(checkpoint, dict) and mk in checkpoint:
90
- state_dict = checkpoint[mk]
91
- break
92
- else:
93
- state_dict = checkpoint
94
- if next(iter(state_dict.items()))[0].startswith('module'):
95
- state_dict = {k[7:]: v for k, v in state_dict.items()}
96
-
97
- for k in skip_list:
98
- if k in list(state_dict.keys()):
99
- logging.info(f"Removing key {k} from pretrained checkpoint")
100
- del state_dict[k]
101
-
102
- if os.getenv('RoPE') == '1':
103
- for k in list(state_dict.keys()):
104
- if 'freqs_cos' in k or 'freqs_sin' in k:
105
- del state_dict[k]
106
- return state_dict
107
-
108
-
109
-
110
- def load_checkpoint(model, checkpoint_path, model_key="model|module|state_dict", strict=True):
111
- state_dict = load_state_dict(checkpoint_path, model_key=model_key, is_openai=False)
112
- # detect old format and make compatible with new format
113
- if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
114
- state_dict = convert_to_custom_text_state_dict(state_dict)
115
- if 'text.logit_scale' in state_dict and hasattr(model, 'logit_scale'):
116
- state_dict['logit_scale'] = state_dict['text.logit_scale']
117
- del state_dict['text.logit_scale']
118
-
119
- # resize_clip_pos_embed for CLIP and open CLIP
120
- if 'visual.positional_embedding' in state_dict:
121
- resize_clip_pos_embed(state_dict, model)
122
- # specified to eva_vit_model
123
- elif 'visual.pos_embed' in state_dict:
124
- resize_evaclip_pos_embed(state_dict, model)
125
-
126
- # resize_clip_pos_embed(state_dict, model)
127
- incompatible_keys = model.load_state_dict(state_dict, strict=strict)
128
- logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}")
129
- return incompatible_keys
130
-
131
- def load_clip_visual_state_dict(checkpoint_path: str, map_location: str='cpu', is_openai: bool=False, skip_list:list=[]):
132
- state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list)
133
-
134
- for k in list(state_dict.keys()):
135
- if not k.startswith('visual.'):
136
- del state_dict[k]
137
- for k in list(state_dict.keys()):
138
- if k.startswith('visual.'):
139
- new_k = k[7:]
140
- state_dict[new_k] = state_dict[k]
141
- del state_dict[k]
142
- return state_dict
143
-
144
- def load_clip_text_state_dict(checkpoint_path: str, map_location: str='cpu', is_openai: bool=False, skip_list:list=[]):
145
- state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list)
146
-
147
- for k in list(state_dict.keys()):
148
- if k.startswith('visual.'):
149
- del state_dict[k]
150
- return state_dict
151
-
152
- def get_pretrained_tag(pretrained_model):
153
- pretrained_model = pretrained_model.lower()
154
- if "laion" in pretrained_model or "open_clip" in pretrained_model:
155
- return "open_clip"
156
- elif "openai" in pretrained_model:
157
- return "clip"
158
- elif "eva" in pretrained_model and "clip" in pretrained_model:
159
- return "eva_clip"
160
- else:
161
- return "other"
162
-
163
- def load_pretrained_checkpoint(
164
- model,
165
- visual_checkpoint_path,
166
- text_checkpoint_path,
167
- strict=True,
168
- visual_model=None,
169
- text_model=None,
170
- model_key="model|module|state_dict",
171
- skip_list=[]):
172
- visual_tag = get_pretrained_tag(visual_model)
173
- text_tag = get_pretrained_tag(text_model)
174
-
175
- logging.info(f"num of model state_dict keys: {len(model.state_dict().keys())}")
176
- visual_incompatible_keys, text_incompatible_keys = None, None
177
- if visual_checkpoint_path:
178
- if visual_tag == "eva_clip" or visual_tag == "open_clip":
179
- visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=False, skip_list=skip_list)
180
- elif visual_tag == "clip":
181
- visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=True, skip_list=skip_list)
182
- else:
183
- visual_state_dict = load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list)
184
-
185
- # resize_clip_pos_embed for CLIP and open CLIP
186
- if 'positional_embedding' in visual_state_dict:
187
- resize_visual_pos_embed(visual_state_dict, model)
188
- # specified to EVA model
189
- elif 'pos_embed' in visual_state_dict:
190
- resize_eva_pos_embed(visual_state_dict, model)
191
-
192
- visual_incompatible_keys = model.visual.load_state_dict(visual_state_dict, strict=strict)
193
- logging.info(f"num of loaded visual_state_dict keys: {len(visual_state_dict.keys())}")
194
- logging.info(f"visual_incompatible_keys.missing_keys: {visual_incompatible_keys.missing_keys}")
195
-
196
- if text_checkpoint_path:
197
- if text_tag == "eva_clip" or text_tag == "open_clip":
198
- text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=False, skip_list=skip_list)
199
- elif text_tag == "clip":
200
- text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=True, skip_list=skip_list)
201
- else:
202
- text_state_dict = load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list)
203
-
204
- text_incompatible_keys = model.text.load_state_dict(text_state_dict, strict=strict)
205
-
206
- logging.info(f"num of loaded text_state_dict keys: {len(text_state_dict.keys())}")
207
- logging.info(f"text_incompatible_keys.missing_keys: {text_incompatible_keys.missing_keys}")
208
-
209
- return visual_incompatible_keys, text_incompatible_keys
210
-
211
- def create_model(
212
- model_name: str,
213
- pretrained: Optional[str] = None,
214
- precision: str = 'fp32',
215
- device: Union[str, torch.device] = 'cpu',
216
- jit: bool = False,
217
- force_quick_gelu: bool = False,
218
- force_custom_clip: bool = False,
219
- force_patch_dropout: Optional[float] = None,
220
- pretrained_image: str = '',
221
- pretrained_text: str = '',
222
- pretrained_hf: bool = True,
223
- pretrained_visual_model: str = None,
224
- pretrained_text_model: str = None,
225
- cache_dir: Optional[str] = None,
226
- skip_list: list = [],
227
- ):
228
- model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names
229
- if isinstance(device, str):
230
- device = torch.device(device)
231
-
232
- if pretrained and pretrained.lower() == 'openai':
233
- logging.info(f'Loading pretrained {model_name} from OpenAI.')
234
- model = load_openai_model(
235
- model_name,
236
- precision=precision,
237
- device=device,
238
- jit=jit,
239
- cache_dir=cache_dir,
240
- )
241
- else:
242
- model_cfg = get_model_config(model_name)
243
- if model_cfg is not None:
244
- logging.info(f'Loaded {model_name} model config.')
245
- else:
246
- logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
247
- raise RuntimeError(f'Model config for {model_name} not found.')
248
-
249
- if 'rope' in model_cfg.get('vision_cfg', {}):
250
- if model_cfg['vision_cfg']['rope']:
251
- os.environ['RoPE'] = "1"
252
- else:
253
- os.environ['RoPE'] = "0"
254
-
255
- if force_quick_gelu:
256
- # override for use of QuickGELU on non-OpenAI transformer models
257
- model_cfg["quick_gelu"] = True
258
-
259
- if force_patch_dropout is not None:
260
- # override the default patch dropout value
261
- model_cfg['vision_cfg']["patch_dropout"] = force_patch_dropout
262
-
263
- cast_dtype = get_cast_dtype(precision)
264
- custom_clip = model_cfg.pop('custom_text', False) or force_custom_clip or ('hf_model_name' in model_cfg['text_cfg'])
265
-
266
- if custom_clip:
267
- if 'hf_model_name' in model_cfg.get('text_cfg', {}):
268
- model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
269
- model = CustomCLIP(**model_cfg, cast_dtype=cast_dtype)
270
- else:
271
- model = CLIP(**model_cfg, cast_dtype=cast_dtype)
272
-
273
- pretrained_cfg = {}
274
- if pretrained:
275
- checkpoint_path = ''
276
- pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
277
- if pretrained_cfg:
278
- checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
279
- elif os.path.exists(pretrained):
280
- checkpoint_path = pretrained
281
-
282
- if checkpoint_path:
283
- logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
284
- load_checkpoint(model,
285
- checkpoint_path,
286
- model_key="model|module|state_dict",
287
- strict=False
288
- )
289
- else:
290
- error_str = (
291
- f'Pretrained weights ({pretrained}) not found for model {model_name}.'
292
- f'Available pretrained tags ({list_pretrained_tags_by_model(model_name)}.')
293
- logging.warning(error_str)
294
- raise RuntimeError(error_str)
295
- else:
296
- visual_checkpoint_path = ''
297
- text_checkpoint_path = ''
298
-
299
- if pretrained_image:
300
- pretrained_visual_model = pretrained_visual_model.replace('/', '-') # for callers using old naming with / in ViT names
301
- pretrained_image_cfg = get_pretrained_cfg(pretrained_visual_model, pretrained_image)
302
- if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
303
- # pretrained weight loading for timm models set via vision_cfg
304
- model_cfg['vision_cfg']['timm_model_pretrained'] = True
305
- elif pretrained_image_cfg:
306
- visual_checkpoint_path = download_pretrained(pretrained_image_cfg, cache_dir=cache_dir)
307
- elif os.path.exists(pretrained_image):
308
- visual_checkpoint_path = pretrained_image
309
- else:
310
- logging.warning(f'Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.')
311
- raise RuntimeError(f'Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.')
312
-
313
- if pretrained_text:
314
- pretrained_text_model = pretrained_text_model.replace('/', '-') # for callers using old naming with / in ViT names
315
- pretrained_text_cfg = get_pretrained_cfg(pretrained_text_model, pretrained_text)
316
- if pretrained_image_cfg:
317
- text_checkpoint_path = download_pretrained(pretrained_text_cfg, cache_dir=cache_dir)
318
- elif os.path.exists(pretrained_text):
319
- text_checkpoint_path = pretrained_text
320
- else:
321
- logging.warning(f'Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.')
322
- raise RuntimeError(f'Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.')
323
-
324
- if visual_checkpoint_path:
325
- logging.info(f'Loading pretrained {model_name}.visual weights ({visual_checkpoint_path}).')
326
- if text_checkpoint_path:
327
- logging.info(f'Loading pretrained {model_name}.text weights ({text_checkpoint_path}).')
328
-
329
- if visual_checkpoint_path or text_checkpoint_path:
330
- load_pretrained_checkpoint(
331
- model,
332
- visual_checkpoint_path,
333
- text_checkpoint_path,
334
- strict=False,
335
- visual_model=pretrained_visual_model,
336
- text_model=pretrained_text_model,
337
- model_key="model|module|state_dict",
338
- skip_list=skip_list
339
- )
340
-
341
- if "fp16" in precision or "bf16" in precision:
342
- logging.info(f'convert precision to {precision}')
343
- model = model.to(torch.bfloat16) if 'bf16' in precision else model.to(torch.float16)
344
-
345
- model.to(device=device)
346
-
347
- # set image / mean metadata from pretrained_cfg if available, or use default
348
- model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
349
- model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD
350
-
351
- if jit:
352
- model = torch.jit.script(model)
353
-
354
- return model
355
-
356
-
357
- def create_model_and_transforms(
358
- model_name: str,
359
- pretrained: Optional[str] = None,
360
- precision: str = 'fp32',
361
- device: Union[str, torch.device] = 'cpu',
362
- jit: bool = False,
363
- force_quick_gelu: bool = False,
364
- force_custom_clip: bool = False,
365
- force_patch_dropout: Optional[float] = None,
366
- pretrained_image: str = '',
367
- pretrained_text: str = '',
368
- pretrained_hf: bool = True,
369
- pretrained_visual_model: str = None,
370
- pretrained_text_model: str = None,
371
- image_mean: Optional[Tuple[float, ...]] = None,
372
- image_std: Optional[Tuple[float, ...]] = None,
373
- cache_dir: Optional[str] = None,
374
- skip_list: list = [],
375
- ):
376
- model = create_model(
377
- model_name,
378
- pretrained,
379
- precision=precision,
380
- device=device,
381
- jit=jit,
382
- force_quick_gelu=force_quick_gelu,
383
- force_custom_clip=force_custom_clip,
384
- force_patch_dropout=force_patch_dropout,
385
- pretrained_image=pretrained_image,
386
- pretrained_text=pretrained_text,
387
- pretrained_hf=pretrained_hf,
388
- pretrained_visual_model=pretrained_visual_model,
389
- pretrained_text_model=pretrained_text_model,
390
- cache_dir=cache_dir,
391
- skip_list=skip_list,
392
- )
393
-
394
- image_mean = image_mean or getattr(model.visual, 'image_mean', None)
395
- image_std = image_std or getattr(model.visual, 'image_std', None)
396
- preprocess_train = image_transform(
397
- model.visual.image_size,
398
- is_train=True,
399
- mean=image_mean,
400
- std=image_std
401
- )
402
- preprocess_val = image_transform(
403
- model.visual.image_size,
404
- is_train=False,
405
- mean=image_mean,
406
- std=image_std
407
- )
408
-
409
- return model, preprocess_train, preprocess_val
410
-
411
- def create_model_from_pretrained(
412
- model_name: str,
413
- pretrained: str,
414
- precision: str = 'fp32',
415
- device: Union[str, torch.device] = 'cpu',
416
- jit: bool = False,
417
- force_quick_gelu: bool = False,
418
- force_custom_clip: bool = False,
419
- force_patch_dropout: Optional[float] = None,
420
- return_transform: bool = True,
421
- image_mean: Optional[Tuple[float, ...]] = None,
422
- image_std: Optional[Tuple[float, ...]] = None,
423
- cache_dir: Optional[str] = None,
424
- is_frozen: bool = False,
425
- ):
426
- if not is_pretrained_cfg(model_name, pretrained) and not os.path.exists(pretrained):
427
- raise RuntimeError(
428
- f'{pretrained} is not a valid pretrained cfg or checkpoint for {model_name}.'
429
- f' Use open_clip.list_pretrained() to find one.')
430
-
431
- model = create_model(
432
- model_name,
433
- pretrained,
434
- precision=precision,
435
- device=device,
436
- jit=jit,
437
- force_quick_gelu=force_quick_gelu,
438
- force_custom_clip=force_custom_clip,
439
- force_patch_dropout=force_patch_dropout,
440
- cache_dir=cache_dir,
441
- )
442
-
443
- if is_frozen:
444
- for param in model.parameters():
445
- param.requires_grad = False
446
-
447
- if not return_transform:
448
- return model
449
-
450
- image_mean = image_mean or getattr(model.visual, 'image_mean', None)
451
- image_std = image_std or getattr(model.visual, 'image_std', None)
452
- preprocess = image_transform(
453
- model.visual.image_size,
454
- is_train=False,
455
- mean=image_mean,
456
- std=image_std
457
- )
458
-
459
- return model, preprocess
 
eva_clip/hf_configs.py DELETED
@@ -1,57 +0,0 @@
- # HF architecture dict:
- arch_dict = {
-     # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
-     "roberta": {
-         "config_names": {
-             "context_length": "max_position_embeddings",
-             "vocab_size": "vocab_size",
-             "width": "hidden_size",
-             "heads": "num_attention_heads",
-             "layers": "num_hidden_layers",
-             "layer_attr": "layer",
-             "token_embeddings_attr": "embeddings"
-         },
-         "pooler": "mean_pooler",
-     },
-     # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
-     "xlm-roberta": {
-         "config_names": {
-             "context_length": "max_position_embeddings",
-             "vocab_size": "vocab_size",
-             "width": "hidden_size",
-             "heads": "num_attention_heads",
-             "layers": "num_hidden_layers",
-             "layer_attr": "layer",
-             "token_embeddings_attr": "embeddings"
-         },
-         "pooler": "mean_pooler",
-     },
-     # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
-     "mt5": {
-         "config_names": {
-             # unlimited seqlen
-             # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
-             # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
-             "context_length": "",
-             "vocab_size": "vocab_size",
-             "width": "d_model",
-             "heads": "num_heads",
-             "layers": "num_layers",
-             "layer_attr": "block",
-             "token_embeddings_attr": "embed_tokens"
-         },
-         "pooler": "mean_pooler",
-     },
-     "bert": {
-         "config_names": {
-             "context_length": "max_position_embeddings",
-             "vocab_size": "vocab_size",
-             "width": "hidden_size",
-             "heads": "num_attention_heads",
-             "layers": "num_hidden_layers",
-             "layer_attr": "layer",
-             "token_embeddings_attr": "embeddings"
-         },
-         "pooler": "mean_pooler",
-     }
- }
 
eva_clip/hf_model.py DELETED
@@ -1,248 +0,0 @@
1
- """ huggingface model adapter
2
-
3
- Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in CLIP model.
4
- """
5
-
6
- import re
7
-
8
- import torch
9
- import torch.nn as nn
10
- from torch.nn import functional as F
11
- from torch import TensorType
12
- try:
13
- import transformers
14
- from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, AutoConfig, PretrainedConfig
15
- from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
16
- BaseModelOutputWithPoolingAndCrossAttentions
17
- except ImportError as e:
18
- transformers = None
19
-
20
-
21
- class BaseModelOutput:
22
- pass
23
-
24
-
25
- class PretrainedConfig:
26
- pass
27
-
28
- from .hf_configs import arch_dict
29
-
30
- # utils
31
- def _camel2snake(s):
32
- return re.sub(r'(?<!^)(?=[A-Z])', '_', s).lower()
33
-
34
- # TODO: ?last - for gpt-like models
35
- _POOLERS = {}
36
-
37
- def register_pooler(cls):
38
- """Decorator registering pooler class"""
39
- _POOLERS[_camel2snake(cls.__name__)] = cls
40
- return cls
41
-
42
-
43
- @register_pooler
44
- class MeanPooler(nn.Module):
45
- """Mean pooling"""
46
- def forward(self, x:BaseModelOutput, attention_mask:TensorType):
47
- masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
48
- return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
49
-
50
- @register_pooler
51
- class MaxPooler(nn.Module):
52
- """Max pooling"""
53
- def forward(self, x:BaseModelOutput, attention_mask:TensorType):
54
- masked_output = x.last_hidden_state.masked_fill(attention_mask.unsqueeze(-1), -torch.inf)
55
- return masked_output.max(1).values
56
-
57
- @register_pooler
58
- class ClsPooler(nn.Module):
59
- """CLS token pooling"""
60
- def __init__(self, use_pooler_output=True):
61
- super().__init__()
62
- self.cls_token_position = 0
63
- self.use_pooler_output = use_pooler_output
64
-
65
- def forward(self, x:BaseModelOutput, attention_mask:TensorType):
66
-
67
- if (self.use_pooler_output and
68
- isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions)) and
69
- (x.pooler_output is not None)
70
- ):
71
- return x.pooler_output
72
-
73
- return x.last_hidden_state[:, self.cls_token_position, :]
74
-
75
- class HFTextEncoder(nn.Module):
76
- """HuggingFace model adapter"""
77
- def __init__(
78
- self,
79
- model_name_or_path: str,
80
- output_dim: int,
81
- tokenizer_name: str = None,
82
- config: PretrainedConfig = None,
83
- pooler_type: str = None,
84
- proj: str = None,
85
- pretrained: bool = True,
86
- masked_language_modeling: bool = False):
87
- super().__init__()
88
-
89
- self.output_dim = output_dim
90
-
91
- # TODO: find better way to get this information
92
- uses_transformer_pooler = (pooler_type == "cls_pooler")
93
-
94
- if transformers is None:
95
- raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
96
- if config is None:
97
- self.config = AutoConfig.from_pretrained(model_name_or_path)
98
- if masked_language_modeling:
99
- create_func, model_args = (AutoModelForMaskedLM.from_pretrained, model_name_or_path) if pretrained else (
100
- AutoModelForMaskedLM.from_config, self.config)
101
- else:
102
- create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
103
- AutoModel.from_config, self.config)
104
- # TODO: do all model configs have this attribute? PretrainedConfig does so yes??
105
- if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
106
- self.transformer = create_func(model_args)
107
- self.transformer = self.transformer.encoder
108
- else:
109
- self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
110
- else:
111
- self.config = config
112
- if masked_language_modeling:
113
- self.transformer = AutoModelForMaskedLM.from_config(config)
114
- else:
115
- self.transformer = AutoModel.from_config(config)
116
-
117
- if pooler_type is None: # get default arch pooler
118
- self.pooler = _POOLERS[(arch_dict[self.config.model_type]["pooler"])]()
119
- else:
120
- self.pooler = _POOLERS[pooler_type]()
121
-
122
- d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
123
- if (d_model == output_dim) and (proj is None): # do we always need a proj?
124
- self.proj = nn.Identity()
125
- elif proj == 'linear':
126
- self.proj = nn.Linear(d_model, output_dim, bias=False)
127
- elif proj == 'mlp':
128
- hidden_size = (d_model + output_dim) // 2
129
- self.proj = nn.Sequential(
130
- nn.Linear(d_model, hidden_size, bias=False),
131
- nn.GELU(),
132
- nn.Linear(hidden_size, output_dim, bias=False),
133
- )
134
-
135
- # self.itm_proj = nn.Linear(d_model, 2, bias=False)
136
- # self.mlm_proj = nn.Linear(d_model, self.config.vocab_size), bias=False)
137
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
138
-
139
- # def forward_itm(self, x:TensorType, image_embeds:TensorType) -> TensorType:
140
- # image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(x.device)
141
- # attn_mask = (x != self.config.pad_token_id).long()
142
- # out = self.transformer(
143
- # input_ids=x,
144
- # attention_mask=attn_mask,
145
- # encoder_hidden_states = image_embeds,
146
- # encoder_attention_mask = image_atts,
147
- # )
148
- # pooled_out = self.pooler(out, attn_mask)
149
-
150
- # return self.itm_proj(pooled_out)
151
-
152
- def mask(self, input_ids, vocab_size, device, targets=None, masked_indices=None, probability_matrix=None):
153
- if masked_indices is None:
154
- masked_indices = torch.bernoulli(probability_matrix).bool()
155
-
156
- masked_indices[input_ids == self.tokenizer.pad_token_id] = False
157
- masked_indices[input_ids == self.tokenizer.cls_token_id] = False
158
-
159
- if targets is not None:
160
- targets[~masked_indices] = -100 # We only compute loss on masked tokens
161
-
162
- # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
163
- indices_replaced = torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool() & masked_indices
164
- input_ids[indices_replaced] = self.tokenizer.mask_token_id
165
-
166
- # 10% of the time, we replace masked input tokens with random word
167
- indices_random = torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool() & masked_indices & ~indices_replaced
168
- random_words = torch.randint(vocab_size, input_ids.shape, dtype=torch.long).to(device)
169
- input_ids[indices_random] = random_words[indices_random]
170
- # The rest of the time (10% of the time) we keep the masked input tokens unchanged
171
-
172
- if targets is not None:
173
- return input_ids, targets
174
- else:
175
- return input_ids
176
-
177
- def forward_mlm(self, input_ids, image_embeds, mlm_probability=0.25):
178
- labels = input_ids.clone()
179
- attn_mask = (input_ids != self.config.pad_token_id).long()
180
- image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(input_ids.device)
181
- vocab_size = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["vocab_size"])
182
- probability_matrix = torch.full(labels.shape, mlm_probability)
183
- input_ids, labels = self.mask(input_ids, vocab_size, input_ids.device, targets=labels,
184
- probability_matrix = probability_matrix)
185
- mlm_output = self.transformer(input_ids,
186
- attention_mask = attn_mask,
187
- encoder_hidden_states = image_embeds,
188
- encoder_attention_mask = image_atts,
189
- return_dict = True,
190
- labels = labels,
191
- )
192
- return mlm_output.loss
193
- # mlm_output = self.transformer(input_ids,
194
- # attention_mask = attn_mask,
195
- # encoder_hidden_states = image_embeds,
196
- # encoder_attention_mask = image_atts,
197
- # return_dict = True,
198
- # ).last_hidden_state
199
- # logits = self.mlm_proj(mlm_output)
200
-
201
- # # logits = logits[:, :-1, :].contiguous().view(-1, vocab_size)
202
- # logits = logits[:, 1:, :].contiguous().view(-1, vocab_size)
203
- # labels = labels[:, 1:].contiguous().view(-1)
204
-
205
- # mlm_loss = F.cross_entropy(
206
- # logits,
207
- # labels,
208
- # # label_smoothing=0.1,
209
- # )
210
- # return mlm_loss
211
-
212
-
213
- def forward(self, x:TensorType) -> TensorType:
214
- attn_mask = (x != self.config.pad_token_id).long()
215
- out = self.transformer(input_ids=x, attention_mask=attn_mask)
216
- pooled_out = self.pooler(out, attn_mask)
217
-
218
- return self.proj(pooled_out)
219
-
220
- def lock(self, unlocked_layers:int=0, freeze_layer_norm:bool=True):
221
- if not unlocked_layers: # full freezing
222
- for n, p in self.transformer.named_parameters():
223
- p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
224
- return
225
-
226
- encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
227
- layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
228
- print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
229
- embeddings = getattr(
230
- self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
231
- modules = [embeddings, *layer_list][:-unlocked_layers]
232
- # freeze layers
233
- for module in modules:
234
- for n, p in module.named_parameters():
235
- p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
236
-
237
-
238
- @torch.jit.ignore
239
- def set_grad_checkpointing(self, enable=True):
240
- self.transformer.gradient_checkpointing_enable()
241
-
242
- def get_num_layers(self):
243
- encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
244
- layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
245
- return len(layer_list)
246
-
247
- def init_parameters(self):
248
- pass
 
eva_clip/loss.py DELETED
@@ -1,138 +0,0 @@
1
- import math
2
- import torch
3
- import torch.nn as nn
4
- from torch.nn import functional as F
5
-
6
- try:
7
- import torch.distributed.nn
8
- from torch import distributed as dist
9
- has_distributed = True
10
- except ImportError:
11
- has_distributed = False
12
-
13
- try:
14
- import horovod.torch as hvd
15
- except ImportError:
16
- hvd = None
17
-
18
- from timm.loss import LabelSmoothingCrossEntropy
19
-
20
-
21
- def gather_features(
22
- image_features,
23
- text_features,
24
- local_loss=False,
25
- gather_with_grad=False,
26
- rank=0,
27
- world_size=1,
28
- use_horovod=False
29
- ):
30
- assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
31
- if use_horovod:
32
- assert hvd is not None, 'Please install horovod'
33
- if gather_with_grad:
34
- all_image_features = hvd.allgather(image_features)
35
- all_text_features = hvd.allgather(text_features)
36
- else:
37
- with torch.no_grad():
38
- all_image_features = hvd.allgather(image_features)
39
- all_text_features = hvd.allgather(text_features)
40
- if not local_loss:
41
- # ensure grads for local rank when all_* features don't have a gradient
42
- gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
43
- gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
44
- gathered_image_features[rank] = image_features
45
- gathered_text_features[rank] = text_features
46
- all_image_features = torch.cat(gathered_image_features, dim=0)
47
- all_text_features = torch.cat(gathered_text_features, dim=0)
48
- else:
49
- # We gather tensors from all gpus
50
- if gather_with_grad:
51
- all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
52
- all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
53
- # all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features, async_op=True), dim=0)
54
- # all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features, async_op=True), dim=0)
55
- else:
56
- gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
57
- gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
58
- dist.all_gather(gathered_image_features, image_features)
59
- dist.all_gather(gathered_text_features, text_features)
60
- if not local_loss:
61
- # ensure grads for local rank when all_* features don't have a gradient
62
- gathered_image_features[rank] = image_features
63
- gathered_text_features[rank] = text_features
64
- all_image_features = torch.cat(gathered_image_features, dim=0)
65
- all_text_features = torch.cat(gathered_text_features, dim=0)
66
-
67
- return all_image_features, all_text_features
68
-
69
-
70
- class ClipLoss(nn.Module):
71
-
72
- def __init__(
73
- self,
74
- local_loss=False,
75
- gather_with_grad=False,
76
- cache_labels=False,
77
- rank=0,
78
- world_size=1,
79
- use_horovod=False,
80
- smoothing=0.,
81
- ):
82
- super().__init__()
83
- self.local_loss = local_loss
84
- self.gather_with_grad = gather_with_grad
85
- self.cache_labels = cache_labels
86
- self.rank = rank
87
- self.world_size = world_size
88
- self.use_horovod = use_horovod
89
- self.label_smoothing_cross_entropy = LabelSmoothingCrossEntropy(smoothing=smoothing) if smoothing > 0 else None
90
-
91
- # cache state
92
- self.prev_num_logits = 0
93
- self.labels = {}
94
-
95
- def forward(self, image_features, text_features, logit_scale=1.):
96
- device = image_features.device
97
- if self.world_size > 1:
98
- all_image_features, all_text_features = gather_features(
99
- image_features, text_features,
100
- self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)
101
-
102
- if self.local_loss:
103
- logits_per_image = logit_scale * image_features @ all_text_features.T
104
- logits_per_text = logit_scale * text_features @ all_image_features.T
105
- else:
106
- logits_per_image = logit_scale * all_image_features @ all_text_features.T
107
- logits_per_text = logits_per_image.T
108
- else:
109
- logits_per_image = logit_scale * image_features @ text_features.T
110
- logits_per_text = logit_scale * text_features @ image_features.T
111
- # calculated ground-truth and cache if enabled
112
- num_logits = logits_per_image.shape[0]
113
- if self.prev_num_logits != num_logits or device not in self.labels:
114
- labels = torch.arange(num_logits, device=device, dtype=torch.long)
115
- if self.world_size > 1 and self.local_loss:
116
- labels = labels + num_logits * self.rank
117
- if self.cache_labels:
118
- self.labels[device] = labels
119
- self.prev_num_logits = num_logits
120
- else:
121
- labels = self.labels[device]
122
-
123
- if self.label_smoothing_cross_entropy:
124
- total_loss = (
125
- self.label_smoothing_cross_entropy(logits_per_image, labels) +
126
- self.label_smoothing_cross_entropy(logits_per_text, labels)
127
- ) / 2
128
- else:
129
- total_loss = (
130
- F.cross_entropy(logits_per_image, labels) +
131
- F.cross_entropy(logits_per_text, labels)
132
- ) / 2
133
-
134
- acc = None
135
- i2t_acc = (logits_per_image.argmax(-1) == labels).sum() / len(logits_per_image)
136
- t2i_acc = (logits_per_text.argmax(-1) == labels).sum() / len(logits_per_text)
137
- acc = {"i2t": i2t_acc, "t2i": t2i_acc}
138
- return total_loss, acc
 
eva_clip/model.py DELETED
@@ -1,439 +0,0 @@
1
- """ CLIP Model
2
-
3
- Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
- """
5
- import os
6
- from dataclasses import dataclass
7
- from typing import Optional, Tuple, Union
8
- from functools import partial
9
-
10
- import numpy as np
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import nn
14
-
15
- try:
16
- from .hf_model import HFTextEncoder
17
- except:
18
- HFTextEncoder = None
19
- from .modified_resnet import ModifiedResNet
20
- from .timm_model import TimmModel
21
- from .eva_vit_model import EVAVisionTransformer
22
- from .transformer import LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer
23
-
24
- try:
25
- from apex.normalization import FusedLayerNorm
26
- except:
27
- FusedLayerNorm = LayerNorm
28
- print("Please 'pip install apex'")
29
-
30
- try:
31
- import xformers.ops as xops
32
- except ImportError:
33
- xops = None
34
- print("Please 'pip install xformers'")
35
-
36
- @dataclass
37
- class CLIPVisionCfg:
38
- layers: Union[Tuple[int, int, int, int], int] = 12
39
- width: int = 768
40
- head_width: int = 64
41
- mlp_ratio: float = 4.0
42
- patch_size: int = 16
43
- image_size: Union[Tuple[int, int], int] = 224
44
- ls_init_value: Optional[float] = None # layer scale initial value
45
- patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
46
- global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
47
- drop_path_rate: Optional[float] = None # drop path rate
48
- timm_model_name: str = None # a valid model name overrides layers, width, patch_size
49
- timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
50
- timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
51
- timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
52
- timm_proj_bias: bool = False # enable bias final projection
53
- eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size
54
- qkv_bias: bool = True
55
- fusedLN: bool = False
56
- xattn: bool = False
57
- postnorm: bool = False
58
- rope: bool = False
59
- pt_hw_seq_len: int = 16 # 224/14
60
- intp_freq: bool = False
61
- naiveswiglu: bool = False
62
- subln: bool = False
63
-
64
-
65
- @dataclass
66
- class CLIPTextCfg:
67
- context_length: int = 77
68
- vocab_size: int = 49408
69
- width: int = 512
70
- heads: int = 8
71
- layers: int = 12
72
- ls_init_value: Optional[float] = None # layer scale initial value
73
- hf_model_name: str = None
74
- hf_tokenizer_name: str = None
75
- hf_model_pretrained: bool = True
76
- proj: str = 'mlp'
77
- pooler_type: str = 'mean_pooler'
78
- masked_language_modeling: bool = False
79
- fusedLN: bool = False
80
- xattn: bool = False
81
- attn_mask: bool = True
82
-
83
- def get_cast_dtype(precision: str):
84
- cast_dtype = None
85
- if precision == 'bf16':
86
- cast_dtype = torch.bfloat16
87
- elif precision == 'fp16':
88
- cast_dtype = torch.float16
89
- return cast_dtype
90
-
91
-
92
- def _build_vision_tower(
93
- embed_dim: int,
94
- vision_cfg: CLIPVisionCfg,
95
- quick_gelu: bool = False,
96
- cast_dtype: Optional[torch.dtype] = None
97
- ):
98
- if isinstance(vision_cfg, dict):
99
- vision_cfg = CLIPVisionCfg(**vision_cfg)
100
-
101
- # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
102
- # memory efficient in recent PyTorch releases (>= 1.10).
103
- # NOTE: timm models always use native GELU regardless of quick_gelu flag.
104
- act_layer = QuickGELU if quick_gelu else nn.GELU
105
-
106
- if vision_cfg.eva_model_name:
107
- vision_heads = vision_cfg.width // vision_cfg.head_width
108
- norm_layer = LayerNorm
109
-
110
- visual = EVAVisionTransformer(
111
- img_size=vision_cfg.image_size,
112
- patch_size=vision_cfg.patch_size,
113
- num_classes=embed_dim,
114
- use_mean_pooling=vision_cfg.global_average_pool, #False
115
- init_values=vision_cfg.ls_init_value,
116
- patch_dropout=vision_cfg.patch_dropout,
117
- embed_dim=vision_cfg.width,
118
- depth=vision_cfg.layers,
119
- num_heads=vision_heads,
120
- mlp_ratio=vision_cfg.mlp_ratio,
121
- qkv_bias=vision_cfg.qkv_bias,
122
- drop_path_rate=vision_cfg.drop_path_rate,
123
- norm_layer= partial(FusedLayerNorm, eps=1e-6) if vision_cfg.fusedLN else partial(norm_layer, eps=1e-6),
124
- xattn=vision_cfg.xattn,
125
- rope=vision_cfg.rope,
126
- postnorm=vision_cfg.postnorm,
127
- pt_hw_seq_len= vision_cfg.pt_hw_seq_len, # 224/14
128
- intp_freq= vision_cfg.intp_freq,
129
- naiveswiglu= vision_cfg.naiveswiglu,
130
- subln= vision_cfg.subln
131
- )
132
- elif vision_cfg.timm_model_name:
133
- visual = TimmModel(
134
- vision_cfg.timm_model_name,
135
- pretrained=vision_cfg.timm_model_pretrained,
136
- pool=vision_cfg.timm_pool,
137
- proj=vision_cfg.timm_proj,
138
- proj_bias=vision_cfg.timm_proj_bias,
139
- embed_dim=embed_dim,
140
- image_size=vision_cfg.image_size
141
- )
142
- act_layer = nn.GELU # so that text transformer doesn't use QuickGELU w/ timm models
143
- elif isinstance(vision_cfg.layers, (tuple, list)):
144
- vision_heads = vision_cfg.width * 32 // vision_cfg.head_width
145
- visual = ModifiedResNet(
146
- layers=vision_cfg.layers,
147
- output_dim=embed_dim,
148
- heads=vision_heads,
149
- image_size=vision_cfg.image_size,
150
- width=vision_cfg.width
151
- )
152
- else:
153
- vision_heads = vision_cfg.width // vision_cfg.head_width
154
- norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
155
- visual = VisionTransformer(
156
- image_size=vision_cfg.image_size,
157
- patch_size=vision_cfg.patch_size,
158
- width=vision_cfg.width,
159
- layers=vision_cfg.layers,
160
- heads=vision_heads,
161
- mlp_ratio=vision_cfg.mlp_ratio,
162
- ls_init_value=vision_cfg.ls_init_value,
163
- patch_dropout=vision_cfg.patch_dropout,
164
- global_average_pool=vision_cfg.global_average_pool,
165
- output_dim=embed_dim,
166
- act_layer=act_layer,
167
- norm_layer=norm_layer,
168
- )
169
-
170
- return visual
171
-
172
-
173
- def _build_text_tower(
174
- embed_dim: int,
175
- text_cfg: CLIPTextCfg,
176
- quick_gelu: bool = False,
177
- cast_dtype: Optional[torch.dtype] = None,
178
- ):
179
- if isinstance(text_cfg, dict):
180
- text_cfg = CLIPTextCfg(**text_cfg)
181
-
182
- if text_cfg.hf_model_name:
183
- text = HFTextEncoder(
184
- text_cfg.hf_model_name,
185
- output_dim=embed_dim,
186
- tokenizer_name=text_cfg.hf_tokenizer_name,
187
- proj=text_cfg.proj,
188
- pooler_type=text_cfg.pooler_type,
189
- masked_language_modeling=text_cfg.masked_language_modeling
190
- )
191
- else:
192
- act_layer = QuickGELU if quick_gelu else nn.GELU
193
- norm_layer = LayerNorm
194
-
195
- text = TextTransformer(
196
- context_length=text_cfg.context_length,
197
- vocab_size=text_cfg.vocab_size,
198
- width=text_cfg.width,
199
- heads=text_cfg.heads,
200
- layers=text_cfg.layers,
201
- ls_init_value=text_cfg.ls_init_value,
202
- output_dim=embed_dim,
203
- act_layer=act_layer,
204
- norm_layer= FusedLayerNorm if text_cfg.fusedLN else norm_layer,
205
- xattn=text_cfg.xattn,
206
- attn_mask=text_cfg.attn_mask,
207
- )
208
- return text
209
-
210
- class CLIP(nn.Module):
211
- def __init__(
212
- self,
213
- embed_dim: int,
214
- vision_cfg: CLIPVisionCfg,
215
- text_cfg: CLIPTextCfg,
216
- quick_gelu: bool = False,
217
- cast_dtype: Optional[torch.dtype] = None,
218
- ):
219
- super().__init__()
220
- self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
221
-
222
- text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
223
- self.transformer = text.transformer
224
- self.vocab_size = text.vocab_size
225
- self.token_embedding = text.token_embedding
226
- self.positional_embedding = text.positional_embedding
227
- self.ln_final = text.ln_final
228
- self.text_projection = text.text_projection
229
- self.register_buffer('attn_mask', text.attn_mask, persistent=False)
230
-
231
- self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
232
-
233
- def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
234
- # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
235
- self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
236
-
237
- @torch.jit.ignore
238
- def set_grad_checkpointing(self, enable=True):
239
- self.visual.set_grad_checkpointing(enable)
240
- self.transformer.grad_checkpointing = enable
241
-
242
- @torch.jit.ignore
243
- def no_weight_decay(self):
244
- return {'logit_scale'}
245
-
246
- def encode_image(self, image, normalize: bool = False):
247
- features = self.visual(image)
248
- return F.normalize(features, dim=-1) if normalize else features
249
-
250
- def encode_text(self, text, normalize: bool = False):
251
- cast_dtype = self.transformer.get_cast_dtype()
252
-
253
- x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
254
-
255
- x = x + self.positional_embedding.to(cast_dtype)
256
- x = x.permute(1, 0, 2) # NLD -> LND
257
- x = self.transformer(x, attn_mask=self.attn_mask)
258
- x = x.permute(1, 0, 2) # LND -> NLD
259
- x = self.ln_final(x) # [batch_size, n_ctx, transformer.width]
260
- # take features from the eot embedding (eot_token is the highest number in each sequence)
261
- x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
262
- return F.normalize(x, dim=-1) if normalize else x
263
-
264
- def forward(self, image, text):
265
- image_features = self.encode_image(image, normalize=True)
266
- text_features = self.encode_text(text, normalize=True)
267
- return image_features, text_features, self.logit_scale.exp()
268
-
269
-
270
- class CustomCLIP(nn.Module):
271
- def __init__(
272
- self,
273
- embed_dim: int,
274
- vision_cfg: CLIPVisionCfg,
275
- text_cfg: CLIPTextCfg,
276
- quick_gelu: bool = False,
277
- cast_dtype: Optional[torch.dtype] = None,
278
- itm_task: bool = False,
279
- ):
280
- super().__init__()
281
- self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
282
- self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
283
- self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
284
-
285
- def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
286
- # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
287
- self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
288
-
289
- def lock_text_tower(self, unlocked_layers:int=0, freeze_layer_norm:bool=True):
290
- self.text.lock(unlocked_layers, freeze_layer_norm)
291
-
292
- @torch.jit.ignore
293
- def set_grad_checkpointing(self, enable=True):
294
- self.visual.set_grad_checkpointing(enable)
295
-         self.text.set_grad_checkpointing(enable)
-
-     @torch.jit.ignore
-     def no_weight_decay(self):
-         return {'logit_scale'}
-
-     def encode_image(self, image, normalize: bool = False):
-         features = self.visual(image)
-         return F.normalize(features, dim=-1) if normalize else features
-
-     def encode_text(self, text, normalize: bool = False):
-         features = self.text(text)
-         return F.normalize(features, dim=-1) if normalize else features
-
-     def forward(self, image, text):
-         image_features = self.encode_image(image, normalize=True)
-         text_features = self.encode_text(text, normalize=True)
-         return image_features, text_features, self.logit_scale.exp()
-
-
- def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
-     """Convert applicable model parameters to low-precision (bf16 or fp16)"""
-
-     def _convert_weights(l):
-
-         if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
-             l.weight.data = l.weight.data.to(dtype)
-             if l.bias is not None:
-                 l.bias.data = l.bias.data.to(dtype)
-
-         if isinstance(l, (nn.MultiheadAttention, Attention)):
-             for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
-                 tensor = getattr(l, attr, None)
-                 if tensor is not None:
-                     tensor.data = tensor.data.to(dtype)
-
-         if isinstance(l, nn.Parameter):
-             l.data = l.data.to(dtype)
-
-         for name in ["text_projection", "proj"]:
-             if hasattr(l, name) and isinstance(l, nn.Parameter):
-                 attr = getattr(l, name, None)
-                 if attr is not None:
-                     attr.data = attr.data.to(dtype)
-
-     model.apply(_convert_weights)
-
-
- convert_weights_to_fp16 = convert_weights_to_lp  # backwards compat
-
-
- # used to maintain checkpoint compatibility
- def convert_to_custom_text_state_dict(state_dict: dict):
-     if 'text_projection' in state_dict:
-         # old format state_dict, move text tower -> .text
-         new_state_dict = {}
-         for k, v in state_dict.items():
-             if any(k.startswith(p) for p in (
-                 'text_projection',
-                 'positional_embedding',
-                 'token_embedding',
-                 'transformer',
-                 'ln_final',
-                 'logit_scale'
-             )):
-                 k = 'text.' + k
-             new_state_dict[k] = v
-         return new_state_dict
-     return state_dict
-
-
- def build_model_from_openai_state_dict(
-         state_dict: dict,
-         quick_gelu=True,
-         cast_dtype=torch.float16,
- ):
-     vit = "visual.proj" in state_dict
-
-     if vit:
-         vision_width = state_dict["visual.conv1.weight"].shape[0]
-         vision_layers = len(
-             [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
-         vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
-         grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
-         image_size = vision_patch_size * grid_size
-     else:
-         counts: list = [
-             len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
-         vision_layers = tuple(counts)
-         vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
-         output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
-         vision_patch_size = None
-         assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
-         image_size = output_width * 32
-
-     embed_dim = state_dict["text_projection"].shape[1]
-     context_length = state_dict["positional_embedding"].shape[0]
-     vocab_size = state_dict["token_embedding.weight"].shape[0]
-     transformer_width = state_dict["ln_final.weight"].shape[0]
-     transformer_heads = transformer_width // 64
-     transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
-
-     vision_cfg = CLIPVisionCfg(
-         layers=vision_layers,
-         width=vision_width,
-         patch_size=vision_patch_size,
-         image_size=image_size,
-     )
-     text_cfg = CLIPTextCfg(
-         context_length=context_length,
-         vocab_size=vocab_size,
-         width=transformer_width,
-         heads=transformer_heads,
-         layers=transformer_layers
-     )
-     model = CLIP(
-         embed_dim,
-         vision_cfg=vision_cfg,
-         text_cfg=text_cfg,
-         quick_gelu=quick_gelu,  # OpenAI models were trained with QuickGELU
-         cast_dtype=cast_dtype,
-     )
-
-     for key in ["input_resolution", "context_length", "vocab_size"]:
-         state_dict.pop(key, None)
-
-     convert_weights_to_fp16(model)  # OpenAI state dicts are partially converted to float16
-     model.load_state_dict(state_dict)
-     return model.eval()
-
-
- def trace_model(model, batch_size=256, device=torch.device('cpu')):
-     model.eval()
-     image_size = model.visual.image_size
-     example_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
-     example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
-     model = torch.jit.trace_module(
-         model,
-         inputs=dict(
-             forward=(example_images, example_text),
-             encode_text=(example_text,),
-             encode_image=(example_images,)
-         ))
-     model.visual.image_size = image_size
-     return model
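
The helpers above are generic: convert_weights_to_fp16 walks any module tree and casts Conv/Linear/attention weights in place, and trace_model JIT-traces a full CLIP through its forward, encode_image and encode_text entry points. A minimal usage sketch of the casting helper, assuming the pre-merge eva_clip.model path this commit removes; the toy tower is illustrative only:

    import torch
    from torch import nn

    from eva_clip.model import convert_weights_to_fp16  # pre-merge module path

    # any module tree works; only Conv/Linear/attention tensors are cast in place
    tower = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Conv2d(16, 8, 3))
    convert_weights_to_fp16(tower)
    assert tower[0].weight.dtype == torch.float16
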
eva_clip/model_configs/EVA01-CLIP-B-16.json DELETED
@@ -1,19 +0,0 @@
- {
-     "embed_dim": 512,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 12,
-         "width": 768,
-         "patch_size": 16,
-         "eva_model_name": "eva-clip-b-16",
-         "ls_init_value": 0.1,
-         "drop_path_rate": 0.0
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 512,
-         "heads": 8,
-         "layers": 12
-     }
- }
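
Each of these configs pairs an embed_dim with a vision_cfg/text_cfg block whose keys mirror the CLIPVisionCfg and CLIPTextCfg fields used by build_model_from_openai_state_dict above. A sketch of how such a file could be turned into a model, assuming the pre-merge package layout and that the dataclass/constructor keyword names match the JSON keys:

    import json

    from eva_clip.model import CLIP, CLIPTextCfg, CLIPVisionCfg  # pre-merge module path

    with open("eva_clip/model_configs/EVA01-CLIP-B-16.json") as f:
        cfg = json.load(f)

    model = CLIP(
        embed_dim=cfg["embed_dim"],
        vision_cfg=CLIPVisionCfg(**cfg["vision_cfg"]),
        text_cfg=CLIPTextCfg(**cfg["text_cfg"]),
    )
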
eva_clip/model_configs/EVA01-CLIP-g-14-plus.json DELETED
@@ -1,24 +0,0 @@
- {
-     "embed_dim": 1024,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 40,
-         "width": 1408,
-         "head_width": 88,
-         "mlp_ratio": 4.3637,
-         "patch_size": 14,
-         "eva_model_name": "eva-clip-g-14-x",
-         "drop_path_rate": 0,
-         "xattn": true,
-         "fusedLN": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 1024,
-         "heads": 16,
-         "layers": 24,
-         "xattn": false,
-         "fusedLN": true
-     }
- }
eva_clip/model_configs/EVA01-CLIP-g-14.json DELETED
@@ -1,24 +0,0 @@
- {
-     "embed_dim": 1024,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 40,
-         "width": 1408,
-         "head_width": 88,
-         "mlp_ratio": 4.3637,
-         "patch_size": 14,
-         "eva_model_name": "eva-clip-g-14-x",
-         "drop_path_rate": 0.4,
-         "xattn": true,
-         "fusedLN": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 768,
-         "heads": 12,
-         "layers": 12,
-         "xattn": false,
-         "fusedLN": true
-     }
- }
eva_clip/model_configs/EVA02-CLIP-B-16.json DELETED
@@ -1,29 +0,0 @@
- {
-     "embed_dim": 512,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 12,
-         "width": 768,
-         "head_width": 64,
-         "patch_size": 16,
-         "mlp_ratio": 2.6667,
-         "eva_model_name": "eva-clip-b-16-X",
-         "drop_path_rate": 0.0,
-         "xattn": true,
-         "fusedLN": true,
-         "rope": true,
-         "pt_hw_seq_len": 16,
-         "intp_freq": true,
-         "naiveswiglu": true,
-         "subln": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 512,
-         "heads": 8,
-         "layers": 12,
-         "xattn": true,
-         "fusedLN": true
-     }
- }
eva_clip/model_configs/EVA02-CLIP-L-14-336.json DELETED
@@ -1,29 +0,0 @@
- {
-     "embed_dim": 768,
-     "vision_cfg": {
-         "image_size": 336,
-         "layers": 24,
-         "width": 1024,
-         "drop_path_rate": 0,
-         "head_width": 64,
-         "mlp_ratio": 2.6667,
-         "patch_size": 14,
-         "eva_model_name": "eva-clip-l-14-336",
-         "xattn": true,
-         "fusedLN": true,
-         "rope": true,
-         "pt_hw_seq_len": 16,
-         "intp_freq": true,
-         "naiveswiglu": true,
-         "subln": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 768,
-         "heads": 12,
-         "layers": 12,
-         "xattn": false,
-         "fusedLN": true
-     }
- }
eva_clip/model_configs/EVA02-CLIP-L-14.json DELETED
@@ -1,29 +0,0 @@
- {
-     "embed_dim": 768,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 24,
-         "width": 1024,
-         "drop_path_rate": 0,
-         "head_width": 64,
-         "mlp_ratio": 2.6667,
-         "patch_size": 14,
-         "eva_model_name": "eva-clip-l-14",
-         "xattn": true,
-         "fusedLN": true,
-         "rope": true,
-         "pt_hw_seq_len": 16,
-         "intp_freq": true,
-         "naiveswiglu": true,
-         "subln": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 768,
-         "heads": 12,
-         "layers": 12,
-         "xattn": false,
-         "fusedLN": true
-     }
- }
eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json DELETED
@@ -1,25 +0,0 @@
- {
-     "embed_dim": 1024,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 64,
-         "width": 1792,
-         "head_width": 112,
-         "mlp_ratio": 8.571428571428571,
-         "patch_size": 14,
-         "eva_model_name": "eva-clip-4b-14-x",
-         "drop_path_rate": 0,
-         "xattn": true,
-         "postnorm": true,
-         "fusedLN": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 1280,
-         "heads": 20,
-         "layers": 32,
-         "xattn": false,
-         "fusedLN": true
-     }
- }
eva_clip/model_configs/EVA02-CLIP-bigE-14.json DELETED
@@ -1,25 +0,0 @@
- {
-     "embed_dim": 1024,
-     "vision_cfg": {
-         "image_size": 224,
-         "layers": 64,
-         "width": 1792,
-         "head_width": 112,
-         "mlp_ratio": 8.571428571428571,
-         "patch_size": 14,
-         "eva_model_name": "eva-clip-4b-14-x",
-         "drop_path_rate": 0,
-         "xattn": true,
-         "postnorm": true,
-         "fusedLN": true
-     },
-     "text_cfg": {
-         "context_length": 77,
-         "vocab_size": 49408,
-         "width": 1024,
-         "heads": 16,
-         "layers": 24,
-         "xattn": false,
-         "fusedLN": true
-     }
- }
eva_clip/modified_resnet.py DELETED
@@ -1,181 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- from .utils import freeze_batch_norm_2d
8
-
9
-
10
- class Bottleneck(nn.Module):
11
- expansion = 4
12
-
13
- def __init__(self, inplanes, planes, stride=1):
14
- super().__init__()
15
-
16
- # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
17
- self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
18
- self.bn1 = nn.BatchNorm2d(planes)
19
- self.act1 = nn.ReLU(inplace=True)
20
-
21
- self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
22
- self.bn2 = nn.BatchNorm2d(planes)
23
- self.act2 = nn.ReLU(inplace=True)
24
-
25
- self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
26
-
27
- self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
28
- self.bn3 = nn.BatchNorm2d(planes * self.expansion)
29
- self.act3 = nn.ReLU(inplace=True)
30
-
31
- self.downsample = None
32
- self.stride = stride
33
-
34
- if stride > 1 or inplanes != planes * Bottleneck.expansion:
35
- # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
36
- self.downsample = nn.Sequential(OrderedDict([
37
- ("-1", nn.AvgPool2d(stride)),
38
- ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
39
- ("1", nn.BatchNorm2d(planes * self.expansion))
40
- ]))
41
-
42
- def forward(self, x: torch.Tensor):
43
- identity = x
44
-
45
- out = self.act1(self.bn1(self.conv1(x)))
46
- out = self.act2(self.bn2(self.conv2(out)))
47
- out = self.avgpool(out)
48
- out = self.bn3(self.conv3(out))
49
-
50
- if self.downsample is not None:
51
- identity = self.downsample(x)
52
-
53
- out += identity
54
- out = self.act3(out)
55
- return out
56
-
57
-
58
- class AttentionPool2d(nn.Module):
59
- def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
60
- super().__init__()
61
- self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
62
- self.k_proj = nn.Linear(embed_dim, embed_dim)
63
- self.q_proj = nn.Linear(embed_dim, embed_dim)
64
- self.v_proj = nn.Linear(embed_dim, embed_dim)
65
- self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
66
- self.num_heads = num_heads
67
-
68
- def forward(self, x):
69
- x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
70
- x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
71
- x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
72
- x, _ = F.multi_head_attention_forward(
73
- query=x, key=x, value=x,
74
- embed_dim_to_check=x.shape[-1],
75
- num_heads=self.num_heads,
76
- q_proj_weight=self.q_proj.weight,
77
- k_proj_weight=self.k_proj.weight,
78
- v_proj_weight=self.v_proj.weight,
79
- in_proj_weight=None,
80
- in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
81
- bias_k=None,
82
- bias_v=None,
83
- add_zero_attn=False,
84
- dropout_p=0.,
85
- out_proj_weight=self.c_proj.weight,
86
- out_proj_bias=self.c_proj.bias,
87
- use_separate_proj_weight=True,
88
- training=self.training,
89
- need_weights=False
90
- )
91
-
92
- return x[0]
93
-
94
-
95
- class ModifiedResNet(nn.Module):
96
- """
97
- A ResNet class that is similar to torchvision's but contains the following changes:
98
- - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
99
- - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
100
- - The final pooling layer is a QKV attention instead of an average pool
101
- """
102
-
103
- def __init__(self, layers, output_dim, heads, image_size=224, width=64):
104
- super().__init__()
105
- self.output_dim = output_dim
106
- self.image_size = image_size
107
-
108
- # the 3-layer stem
109
- self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
110
- self.bn1 = nn.BatchNorm2d(width // 2)
111
- self.act1 = nn.ReLU(inplace=True)
112
- self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
113
- self.bn2 = nn.BatchNorm2d(width // 2)
114
- self.act2 = nn.ReLU(inplace=True)
115
- self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
116
- self.bn3 = nn.BatchNorm2d(width)
117
- self.act3 = nn.ReLU(inplace=True)
118
- self.avgpool = nn.AvgPool2d(2)
119
-
120
- # residual layers
121
- self._inplanes = width # this is a *mutable* variable used during construction
122
- self.layer1 = self._make_layer(width, layers[0])
123
- self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
124
- self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
125
- self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
126
-
127
- embed_dim = width * 32 # the ResNet feature dimension
128
- self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim)
129
-
130
- self.init_parameters()
131
-
132
- def _make_layer(self, planes, blocks, stride=1):
133
- layers = [Bottleneck(self._inplanes, planes, stride)]
134
-
135
- self._inplanes = planes * Bottleneck.expansion
136
- for _ in range(1, blocks):
137
- layers.append(Bottleneck(self._inplanes, planes))
138
-
139
- return nn.Sequential(*layers)
140
-
141
- def init_parameters(self):
142
- if self.attnpool is not None:
143
- std = self.attnpool.c_proj.in_features ** -0.5
144
- nn.init.normal_(self.attnpool.q_proj.weight, std=std)
145
- nn.init.normal_(self.attnpool.k_proj.weight, std=std)
146
- nn.init.normal_(self.attnpool.v_proj.weight, std=std)
147
- nn.init.normal_(self.attnpool.c_proj.weight, std=std)
148
-
149
- for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]:
150
- for name, param in resnet_block.named_parameters():
151
- if name.endswith("bn3.weight"):
152
- nn.init.zeros_(param)
153
-
154
- def lock(self, unlocked_groups=0, freeze_bn_stats=False):
155
- assert unlocked_groups == 0, 'partial locking not currently supported for this model'
156
- for param in self.parameters():
157
- param.requires_grad = False
158
- if freeze_bn_stats:
159
- freeze_batch_norm_2d(self)
160
-
161
- @torch.jit.ignore
162
- def set_grad_checkpointing(self, enable=True):
163
- # FIXME support for non-transformer
164
- pass
165
-
166
- def stem(self, x):
167
- x = self.act1(self.bn1(self.conv1(x)))
168
- x = self.act2(self.bn2(self.conv2(x)))
169
- x = self.act3(self.bn3(self.conv3(x)))
170
- x = self.avgpool(x)
171
- return x
172
-
173
- def forward(self, x):
174
- x = self.stem(x)
175
- x = self.layer1(x)
176
- x = self.layer2(x)
177
- x = self.layer3(x)
178
- x = self.layer4(x)
179
- x = self.attnpool(x)
180
-
181
- return x
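
ModifiedResNet is self-contained: it takes per-stage block counts, uses a 3-conv stem with anti-aliased strided convolutions, and pools with QKV attention into output_dim features. A usage sketch under the pre-merge module path this commit removes; the RN50-style layer counts and random input are illustrative assumptions:

    import torch

    from eva_clip.modified_resnet import ModifiedResNet  # pre-merge module path

    # 4 stages of (3, 4, 6, 3) bottlenecks, attention-pooled to a 1024-d embedding
    visual = ModifiedResNet(layers=(3, 4, 6, 3), output_dim=1024, heads=32, image_size=224, width=64)
    features = visual(torch.randn(2, 3, 224, 224))
    print(features.shape)  # torch.Size([2, 1024])
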
eva_clip/openai.py DELETED
@@ -1,144 +0,0 @@
1
- """ OpenAI pretrained model functions
2
-
3
- Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
- """
5
-
6
- import os
7
- import warnings
8
- from typing import List, Optional, Union
9
-
10
- import torch
11
-
12
- from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
13
- from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
14
-
15
- __all__ = ["list_openai_models", "load_openai_model"]
16
-
17
-
18
- def list_openai_models() -> List[str]:
19
- """Returns the names of available CLIP models"""
20
- return list_pretrained_models_by_tag('openai')
21
-
22
-
23
- def load_openai_model(
24
- name: str,
25
- precision: Optional[str] = None,
26
- device: Optional[Union[str, torch.device]] = None,
27
- jit: bool = True,
28
- cache_dir: Optional[str] = None,
29
- ):
30
- """Load a CLIP model
31
-
32
- Parameters
33
- ----------
34
- name : str
35
- A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
36
- precision: str
37
- Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
38
- device : Union[str, torch.device]
39
- The device to put the loaded model
40
- jit : bool
41
- Whether to load the optimized JIT model (default) or the more hackable non-JIT model.
42
- cache_dir : Optional[str]
43
- The directory to cache the downloaded model weights
44
-
45
- Returns
46
- -------
47
- model : torch.nn.Module
48
- The CLIP model
49
- preprocess : Callable[[PIL.Image], torch.Tensor]
50
- A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
51
- """
52
- if device is None:
53
- device = "cuda" if torch.cuda.is_available() else "cpu"
54
- if precision is None:
55
- precision = 'fp32' if device == 'cpu' else 'fp16'
56
-
57
- if get_pretrained_url(name, 'openai'):
58
- model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
59
- elif os.path.isfile(name):
60
- model_path = name
61
- else:
62
- raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
63
-
64
- try:
65
- # loading JIT archive
66
- model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
67
- state_dict = None
68
- except RuntimeError:
69
- # loading saved state dict
70
- if jit:
71
- warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
72
- jit = False
73
- state_dict = torch.load(model_path, map_location="cpu")
74
-
75
- if not jit:
76
- # Build a non-jit model from the OpenAI jitted model state dict
77
- cast_dtype = get_cast_dtype(precision)
78
- try:
79
- model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
80
- except KeyError:
81
- sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
82
- model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)
83
-
84
- # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
85
- model = model.to(device)
86
- if precision.startswith('amp') or precision == 'fp32':
87
- model.float()
88
- elif precision == 'bf16':
89
- convert_weights_to_lp(model, dtype=torch.bfloat16)
90
-
91
- return model
92
-
93
- # patch the device names
94
- device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
95
- device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
96
-
97
- def patch_device(module):
98
- try:
99
- graphs = [module.graph] if hasattr(module, "graph") else []
100
- except RuntimeError:
101
- graphs = []
102
-
103
- if hasattr(module, "forward1"):
104
- graphs.append(module.forward1.graph)
105
-
106
- for graph in graphs:
107
- for node in graph.findAllNodes("prim::Constant"):
108
- if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
109
- node.copyAttributes(device_node)
110
-
111
- model.apply(patch_device)
112
- patch_device(model.encode_image)
113
- patch_device(model.encode_text)
114
-
115
- # patch dtype to float32 (typically for CPU)
116
- if precision == 'fp32':
117
- float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
118
- float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
119
- float_node = float_input.node()
120
-
121
- def patch_float(module):
122
- try:
123
- graphs = [module.graph] if hasattr(module, "graph") else []
124
- except RuntimeError:
125
- graphs = []
126
-
127
- if hasattr(module, "forward1"):
128
- graphs.append(module.forward1.graph)
129
-
130
- for graph in graphs:
131
- for node in graph.findAllNodes("aten::to"):
132
- inputs = list(node.inputs())
133
- for i in [1, 2]: # dtype can be the second or third argument to aten::to()
134
- if inputs[i].node()["value"] == 5:
135
- inputs[i].node().copyAttributes(float_node)
136
-
137
- model.apply(patch_float)
138
- patch_float(model.encode_image)
139
- patch_float(model.encode_text)
140
- model.float()
141
-
142
- # ensure image_size attr available at consistent location for both jit and non-jit
143
- model.visual.image_size = model.input_resolution.item()
144
- return model
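
load_openai_model resolves a name through the 'openai' entries of the pretrained registry (next file) or accepts a local checkpoint path, then returns either the JIT archive or a rebuilt non-JIT CLIP. A sketch, assuming the pre-merge module path; the model name comes from the registry below and the checkpoint is downloaded on first use:

    import torch

    from eva_clip.openai import list_openai_models, load_openai_model  # pre-merge module path

    print(list_openai_models())                    # architectures carrying an 'openai' pretrained tag
    model = load_openai_model("OpenaiCLIP-B-32", device="cpu", jit=False)
    with torch.no_grad():
        feats = model.encode_image(torch.randn(1, 3, 224, 224))
    print(feats.shape)  # torch.Size([1, 512])
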
eva_clip/pretrained.py DELETED
@@ -1,332 +0,0 @@
1
- import hashlib
2
- import os
3
- import urllib
4
- import warnings
5
- from functools import partial
6
- from typing import Dict, Union
7
-
8
- from tqdm import tqdm
9
-
10
- try:
11
- from huggingface_hub import hf_hub_download
12
- _has_hf_hub = True
13
- except ImportError:
14
- hf_hub_download = None
15
- _has_hf_hub = False
16
-
17
-
18
- def _pcfg(url='', hf_hub='', filename='', mean=None, std=None):
19
- return dict(
20
- url=url,
21
- hf_hub=hf_hub,
22
- mean=mean,
23
- std=std,
24
- )
25
-
26
- _VITB32 = dict(
27
- openai=_pcfg(
28
- "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
29
- laion400m_e31=_pcfg(
30
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
31
- laion400m_e32=_pcfg(
32
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
33
- laion2b_e16=_pcfg(
34
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"),
35
- laion2b_s34b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-laion2B-s34B-b79K/')
36
- )
37
-
38
- _VITB32_quickgelu = dict(
39
- openai=_pcfg(
40
- "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
41
- laion400m_e31=_pcfg(
42
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
43
- laion400m_e32=_pcfg(
44
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
45
- )
46
-
47
- _VITB16 = dict(
48
- openai=_pcfg(
49
- "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"),
50
- laion400m_e31=_pcfg(
51
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"),
52
- laion400m_e32=_pcfg(
53
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"),
54
- laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-laion2B-s34B-b88K/'),
55
- )
56
-
57
- _EVAB16 = dict(
58
- eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt'),
59
- eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt'),
60
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt'),
61
- eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt'),
62
- )
63
-
64
- _VITB16_PLUS_240 = dict(
65
- laion400m_e31=_pcfg(
66
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e31-8fb26589.pt"),
67
- laion400m_e32=_pcfg(
68
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e32-699c4b84.pt"),
69
- )
70
-
71
- _VITL14 = dict(
72
- openai=_pcfg(
73
- "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"),
74
- laion400m_e31=_pcfg(
75
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"),
76
- laion400m_e32=_pcfg(
77
- "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"),
78
- laion2b_s32b_b82k=_pcfg(
79
- hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/',
80
- mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
81
- )
82
-
83
- _EVAL14 = dict(
84
- eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_L_psz14.pt'),
85
- eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_L_psz14.pt'),
86
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt'),
87
- eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt'),
88
- )
89
-
90
- _VITL14_336 = dict(
91
- openai=_pcfg(
92
- "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"),
93
- )
94
-
95
- _EVAL14_336 = dict(
96
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt'),
97
- eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt'),
98
- eva_clip_224to336=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt'),
99
- eva02_clip_224to336=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt'),
100
- )
101
-
102
- _VITH14 = dict(
103
- laion2b_s32b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-laion2B-s32B-b79K/'),
104
- )
105
-
106
- _VITg14 = dict(
107
- laion2b_s12b_b42k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s12B-b42K/'),
108
- laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s34B-b88K/'),
109
- )
110
-
111
- _EVAg14 = dict(
112
- eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/'),
113
- eva01=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_g_psz14.pt'),
114
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt'),
115
- eva01_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt'),
116
- )
117
-
118
- _EVAg14_PLUS = dict(
119
- eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/'),
120
- eva01=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_g_psz14.pt'),
121
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt'),
122
- eva01_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt'),
123
- )
124
-
125
- _VITbigG14 = dict(
126
- laion2b_s39b_b160k=_pcfg(hf_hub='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/'),
127
- )
128
-
129
- _EVAbigE14 = dict(
130
- eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
131
- eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
132
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt'),
133
- eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt'),
134
- )
135
-
136
- _EVAbigE14_PLUS = dict(
137
- eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
138
- eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
139
- eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt'),
140
- eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt'),
141
- )
142
-
143
-
144
- _PRETRAINED = {
145
- # "ViT-B-32": _VITB32,
146
- "OpenaiCLIP-B-32": _VITB32,
147
- "OpenCLIP-B-32": _VITB32,
148
-
149
- # "ViT-B-32-quickgelu": _VITB32_quickgelu,
150
- "OpenaiCLIP-B-32-quickgelu": _VITB32_quickgelu,
151
- "OpenCLIP-B-32-quickgelu": _VITB32_quickgelu,
152
-
153
- # "ViT-B-16": _VITB16,
154
- "OpenaiCLIP-B-16": _VITB16,
155
- "OpenCLIP-B-16": _VITB16,
156
-
157
- "EVA02-B-16": _EVAB16,
158
- "EVA02-CLIP-B-16": _EVAB16,
159
-
160
- # "ViT-B-16-plus-240": _VITB16_PLUS_240,
161
- "OpenCLIP-B-16-plus-240": _VITB16_PLUS_240,
162
-
163
- # "ViT-L-14": _VITL14,
164
- "OpenaiCLIP-L-14": _VITL14,
165
- "OpenCLIP-L-14": _VITL14,
166
-
167
- "EVA02-L-14": _EVAL14,
168
- "EVA02-CLIP-L-14": _EVAL14,
169
-
170
- # "ViT-L-14-336": _VITL14_336,
171
- "OpenaiCLIP-L-14-336": _VITL14_336,
172
-
173
- "EVA02-CLIP-L-14-336": _EVAL14_336,
174
-
175
- # "ViT-H-14": _VITH14,
176
- # "ViT-g-14": _VITg14,
177
- "OpenCLIP-H-14": _VITH14,
178
- "OpenCLIP-g-14": _VITg14,
179
-
180
- "EVA01-CLIP-g-14": _EVAg14,
181
- "EVA01-CLIP-g-14-plus": _EVAg14_PLUS,
182
-
183
- # "ViT-bigG-14": _VITbigG14,
184
- "OpenCLIP-bigG-14": _VITbigG14,
185
-
186
- "EVA02-CLIP-bigE-14": _EVAbigE14,
187
- "EVA02-CLIP-bigE-14-plus": _EVAbigE14_PLUS,
188
- }
189
-
190
-
191
- def _clean_tag(tag: str):
192
- # normalize pretrained tags
193
- return tag.lower().replace('-', '_')
194
-
195
-
196
- def list_pretrained(as_str: bool = False):
197
- """ returns list of pretrained models
198
- Returns a tuple (model_name, pretrain_tag) by default or 'name:tag' if as_str == True
199
- """
200
- return [':'.join([k, t]) if as_str else (k, t) for k in _PRETRAINED.keys() for t in _PRETRAINED[k].keys()]
201
-
202
-
203
- def list_pretrained_models_by_tag(tag: str):
204
- """ return all models having the specified pretrain tag """
205
- models = []
206
- tag = _clean_tag(tag)
207
- for k in _PRETRAINED.keys():
208
- if tag in _PRETRAINED[k]:
209
- models.append(k)
210
- return models
211
-
212
-
213
- def list_pretrained_tags_by_model(model: str):
214
- """ return all pretrain tags for the specified model architecture """
215
- tags = []
216
- if model in _PRETRAINED:
217
- tags.extend(_PRETRAINED[model].keys())
218
- return tags
219
-
220
-
221
- def is_pretrained_cfg(model: str, tag: str):
222
- if model not in _PRETRAINED:
223
- return False
224
- return _clean_tag(tag) in _PRETRAINED[model]
225
-
226
-
227
- def get_pretrained_cfg(model: str, tag: str):
228
- if model not in _PRETRAINED:
229
- return {}
230
- model_pretrained = _PRETRAINED[model]
231
- return model_pretrained.get(_clean_tag(tag), {})
232
-
233
-
234
- def get_pretrained_url(model: str, tag: str):
235
- cfg = get_pretrained_cfg(model, _clean_tag(tag))
236
- return cfg.get('url', '')
237
-
238
-
239
- def download_pretrained_from_url(
240
- url: str,
241
- cache_dir: Union[str, None] = None,
242
- ):
243
- if not cache_dir:
244
- cache_dir = os.path.expanduser("~/.cache/clip")
245
- os.makedirs(cache_dir, exist_ok=True)
246
- filename = os.path.basename(url)
247
-
248
- if 'openaipublic' in url:
249
- expected_sha256 = url.split("/")[-2]
250
- elif 'mlfoundations' in url:
251
- expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
252
- else:
253
- expected_sha256 = ''
254
-
255
- download_target = os.path.join(cache_dir, filename)
256
-
257
- if os.path.exists(download_target) and not os.path.isfile(download_target):
258
- raise RuntimeError(f"{download_target} exists and is not a regular file")
259
-
260
- if os.path.isfile(download_target):
261
- if expected_sha256:
262
- if hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
263
- return download_target
264
- else:
265
- warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
266
- else:
267
- return download_target
268
-
269
- with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
270
- with tqdm(total=int(source.headers.get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
271
- while True:
272
- buffer = source.read(8192)
273
- if not buffer:
274
- break
275
-
276
- output.write(buffer)
277
- loop.update(len(buffer))
278
-
279
- if expected_sha256 and not hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
280
- raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not match")
281
-
282
- return download_target
283
-
284
-
285
- def has_hf_hub(necessary=False):
286
- if not _has_hf_hub and necessary:
287
- # if no HF Hub module installed, and it is necessary to continue, raise error
288
- raise RuntimeError(
289
- 'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.')
290
- return _has_hf_hub
291
-
292
-
293
- def download_pretrained_from_hf(
294
- model_id: str,
295
- filename: str = 'open_clip_pytorch_model.bin',
296
- revision=None,
297
- cache_dir: Union[str, None] = None,
298
- ):
299
- has_hf_hub(True)
300
- cached_file = hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
301
- return cached_file
302
-
303
-
304
- def download_pretrained(
305
- cfg: Dict,
306
- force_hf_hub: bool = False,
307
- cache_dir: Union[str, None] = None,
308
- ):
309
- target = ''
310
- if not cfg:
311
- return target
312
-
313
- download_url = cfg.get('url', '')
314
- download_hf_hub = cfg.get('hf_hub', '')
315
- if download_hf_hub and force_hf_hub:
316
- # use HF hub even if url exists
317
- download_url = ''
318
-
319
- if download_url:
320
- target = download_pretrained_from_url(download_url, cache_dir=cache_dir)
321
- elif download_hf_hub:
322
- has_hf_hub(True)
323
- # we assume the hf_hub entries in pretrained config combine model_id + filename in
324
- # 'org/model_name/filename.pt' form. To specify just the model id w/o filename and
325
- # use 'open_clip_pytorch_model.bin' default, there must be a trailing slash 'org/model_name/'.
326
- model_id, filename = os.path.split(download_hf_hub)
327
- if filename:
328
- target = download_pretrained_from_hf(model_id, filename=filename, cache_dir=cache_dir)
329
- else:
330
- target = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
331
-
332
- return target
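
The registry above is plain data: _PRETRAINED maps an architecture name to tag-keyed configs, and download_pretrained resolves either a direct URL or an hf_hub entry of the form 'org/model_name/filename.pt'. A sketch, assuming the pre-merge module path and an installed huggingface_hub; the tag is one of the entries listed above and the checkpoint download is large:

    from eva_clip.pretrained import download_pretrained, get_pretrained_cfg, list_pretrained  # pre-merge path

    print(list_pretrained(as_str=True)[:5])                   # e.g. 'OpenaiCLIP-B-32:openai', ...
    cfg = get_pretrained_cfg("EVA02-CLIP-L-14-336", "eva_clip")
    ckpt_path = download_pretrained(cfg)                       # resolves the hf_hub entry for this tag
    print(ckpt_path)
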
eva_clip/rope.py DELETED
@@ -1,137 +0,0 @@
1
- from math import pi
2
- import torch
3
- from torch import nn
4
- from einops import rearrange, repeat
5
- import logging
6
-
7
- def broadcat(tensors, dim = -1):
8
- num_tensors = len(tensors)
9
- shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
10
- assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
11
- shape_len = list(shape_lens)[0]
12
- dim = (dim + shape_len) if dim < 0 else dim
13
- dims = list(zip(*map(lambda t: list(t.shape), tensors)))
14
- expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
15
- assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
16
- max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
17
- expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
18
- expanded_dims.insert(dim, (dim, dims[dim]))
19
- expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
20
- tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
21
- return torch.cat(tensors, dim = dim)
22
-
23
- def rotate_half(x):
24
- x = rearrange(x, '... (d r) -> ... d r', r = 2)
25
- x1, x2 = x.unbind(dim = -1)
26
- x = torch.stack((-x2, x1), dim = -1)
27
- return rearrange(x, '... d r -> ... (d r)')
28
-
29
-
30
- class VisionRotaryEmbedding(nn.Module):
31
- def __init__(
32
- self,
33
- dim,
34
- pt_seq_len,
35
- ft_seq_len=None,
36
- custom_freqs = None,
37
- freqs_for = 'lang',
38
- theta = 10000,
39
- max_freq = 10,
40
- num_freqs = 1,
41
- ):
42
- super().__init__()
43
- if custom_freqs:
44
- freqs = custom_freqs
45
- elif freqs_for == 'lang':
46
- freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
47
- elif freqs_for == 'pixel':
48
- freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
49
- elif freqs_for == 'constant':
50
- freqs = torch.ones(num_freqs).float()
51
- else:
52
- raise ValueError(f'unknown modality {freqs_for}')
53
-
54
- if ft_seq_len is None: ft_seq_len = pt_seq_len
55
- t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
56
-
57
- freqs_h = torch.einsum('..., f -> ... f', t, freqs)
58
- freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
59
-
60
- freqs_w = torch.einsum('..., f -> ... f', t, freqs)
61
- freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
62
-
63
- freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1)
64
-
65
- self.register_buffer("freqs_cos", freqs.cos())
66
- self.register_buffer("freqs_sin", freqs.sin())
67
-
68
- logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
69
-
70
- def forward(self, t, start_index = 0):
71
- rot_dim = self.freqs_cos.shape[-1]
72
- end_index = start_index + rot_dim
73
- assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
74
- t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
75
- t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
76
-
77
- return torch.cat((t_left, t, t_right), dim = -1)
78
-
79
- class VisionRotaryEmbeddingFast(nn.Module):
80
- def __init__(
81
- self,
82
- dim,
83
- pt_seq_len,
84
- ft_seq_len=None,
85
- custom_freqs = None,
86
- freqs_for = 'lang',
87
- theta = 10000,
88
- max_freq = 10,
89
- num_freqs = 1,
90
- patch_dropout = 0.
91
- ):
92
- super().__init__()
93
- if custom_freqs:
94
- freqs = custom_freqs
95
- elif freqs_for == 'lang':
96
- freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
97
- elif freqs_for == 'pixel':
98
- freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
99
- elif freqs_for == 'constant':
100
- freqs = torch.ones(num_freqs).float()
101
- else:
102
- raise ValueError(f'unknown modality {freqs_for}')
103
-
104
- if ft_seq_len is None: ft_seq_len = pt_seq_len
105
- t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
106
-
107
- freqs = torch.einsum('..., f -> ... f', t, freqs)
108
- freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
109
- freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)
110
-
111
- freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
112
- freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
113
-
114
- self.patch_dropout = patch_dropout
115
-
116
- self.register_buffer("freqs_cos", freqs_cos)
117
- self.register_buffer("freqs_sin", freqs_sin)
118
-
119
- logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
120
-
121
- def forward(self, t, patch_indices_keep=None):
122
- if patch_indices_keep is not None:
123
- batch = t.size()[0]
124
- batch_indices = torch.arange(batch)
125
- batch_indices = batch_indices[..., None]
126
-
127
- freqs_cos = repeat(self.freqs_cos, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
128
- freqs_sin = repeat(self.freqs_sin, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
129
-
130
- freqs_cos = freqs_cos[batch_indices, patch_indices_keep]
131
- freqs_cos = rearrange(freqs_cos, 'n i m j -> n m i j')
132
- freqs_sin = freqs_sin[batch_indices, patch_indices_keep]
133
- freqs_sin = rearrange(freqs_sin, 'n i m j -> n m i j')
134
-
135
- return t * freqs_cos + rotate_half(t) * freqs_sin
136
-
137
- return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
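
VisionRotaryEmbeddingFast precomputes cos/sin tables of shape (pt_seq_len^2, 2*dim) and rotates any tensor whose trailing dimensions match that grid. A sketch, assuming the pre-merge module path and einops installed; the shapes mirror EVA-style usage, where rotation is applied to q/k with the class token excluded:

    import torch

    from eva_clip.rope import VisionRotaryEmbeddingFast  # pre-merge module path

    rope = VisionRotaryEmbeddingFast(dim=32, pt_seq_len=16)  # 16x16 grid -> 256 patch tokens, 64-d rotation
    q = torch.randn(2, 12, 256, 64)                          # (batch, heads, patch tokens, head_dim)
    q_rot = rope(q)
    print(q_rot.shape)  # torch.Size([2, 12, 256, 64])
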
eva_clip/timm_model.py DELETED
@@ -1,122 +0,0 @@
1
- """ timm model adapter
2
-
3
- Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model.
4
- """
5
- import logging
6
- from collections import OrderedDict
7
-
8
- import torch
9
- import torch.nn as nn
10
-
11
- try:
12
- import timm
13
- from timm.models.layers import Mlp, to_2tuple
14
- try:
15
- # old timm imports < 0.8.1
16
- from timm.models.layers.attention_pool2d import RotAttentionPool2d
17
- from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d
18
- except ImportError:
19
- # new timm imports >= 0.8.1
20
- from timm.layers import RotAttentionPool2d
21
- from timm.layers import AttentionPool2d as AbsAttentionPool2d
22
- except ImportError:
23
- timm = None
24
-
25
- from .utils import freeze_batch_norm_2d
26
-
27
-
28
- class TimmModel(nn.Module):
29
- """ timm model adapter
30
- # FIXME this adapter is a work in progress, may change in ways that break weight compat
31
- """
32
-
33
- def __init__(
34
- self,
35
- model_name,
36
- embed_dim,
37
- image_size=224,
38
- pool='avg',
39
- proj='linear',
40
- proj_bias=False,
41
- drop=0.,
42
- pretrained=False):
43
- super().__init__()
44
- if timm is None:
45
- raise RuntimeError("Please `pip install timm` to use timm models.")
46
-
47
- self.image_size = to_2tuple(image_size)
48
- self.trunk = timm.create_model(model_name, pretrained=pretrained)
49
- feat_size = self.trunk.default_cfg.get('pool_size', None)
50
- feature_ndim = 1 if not feat_size else 2
51
- if pool in ('abs_attn', 'rot_attn'):
52
- assert feature_ndim == 2
53
- # if attn pooling used, remove both classifier and default pool
54
- self.trunk.reset_classifier(0, global_pool='')
55
- else:
56
- # reset global pool if pool config set, otherwise leave as network default
57
- reset_kwargs = dict(global_pool=pool) if pool else {}
58
- self.trunk.reset_classifier(0, **reset_kwargs)
59
- prev_chs = self.trunk.num_features
60
-
61
- head_layers = OrderedDict()
62
- if pool == 'abs_attn':
63
- head_layers['pool'] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim)
64
- prev_chs = embed_dim
65
- elif pool == 'rot_attn':
66
- head_layers['pool'] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
67
- prev_chs = embed_dim
68
- else:
69
- assert proj, 'projection layer needed if non-attention pooling is used.'
70
-
71
- # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
72
- if proj == 'linear':
73
- head_layers['drop'] = nn.Dropout(drop)
74
- head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias)
75
- elif proj == 'mlp':
76
- head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias))
77
-
78
- self.head = nn.Sequential(head_layers)
79
-
80
- def lock(self, unlocked_groups=0, freeze_bn_stats=False):
81
- """ lock modules
82
- Args:
83
- unlocked_groups (int): leave last n layer groups unlocked (default: 0)
84
- """
85
- if not unlocked_groups:
86
- # lock full model
87
- for param in self.trunk.parameters():
88
- param.requires_grad = False
89
- if freeze_bn_stats:
90
- freeze_batch_norm_2d(self.trunk)
91
- else:
92
- # NOTE: partial freeze requires latest timm (master) branch and is subject to change
93
- try:
94
- # FIXME import here until API stable and in an official release
95
- from timm.models.helpers import group_parameters, group_modules
96
- except ImportError:
97
- raise RuntimeError(
98
- 'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`')
99
- matcher = self.trunk.group_matcher()
100
- gparams = group_parameters(self.trunk, matcher)
101
- max_layer_id = max(gparams.keys())
102
- max_layer_id = max_layer_id - unlocked_groups
103
- for group_idx in range(max_layer_id + 1):
104
- group = gparams[group_idx]
105
- for param in group:
106
- self.trunk.get_parameter(param).requires_grad = False
107
- if freeze_bn_stats:
108
- gmodules = group_modules(self.trunk, matcher, reverse=True)
109
- gmodules = {k for k, v in gmodules.items() if v <= max_layer_id}
110
- freeze_batch_norm_2d(self.trunk, gmodules)
111
-
112
- @torch.jit.ignore
113
- def set_grad_checkpointing(self, enable=True):
114
- try:
115
- self.trunk.set_grad_checkpointing(enable)
116
- except Exception as e:
117
- logging.warning('grad checkpointing not supported for this timm image tower, continuing without...')
118
-
119
- def forward(self, x):
120
- x = self.trunk(x)
121
- x = self.head(x)
122
- return x
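
TimmModel wraps an arbitrary timm backbone, strips its classifier, and adds a pooling/projection head sized to embed_dim. A sketch, assuming the pre-merge module path and timm installed; the backbone name and embedding width are arbitrary choices for illustration:

    import torch

    from eva_clip.timm_model import TimmModel  # pre-merge module path; requires `pip install timm`

    tower = TimmModel("resnet50", embed_dim=512, image_size=224, pool="avg", proj="linear")
    feats = tower(torch.randn(2, 3, 224, 224))
    print(feats.shape)  # torch.Size([2, 512])
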
eva_clip/tokenizer.py DELETED
@@ -1,201 +0,0 @@
1
- """ CLIP tokenizer
2
-
3
- Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
- """
5
- import gzip
6
- import html
7
- import os
8
- from functools import lru_cache
9
- from typing import Union, List
10
-
11
- import ftfy
12
- import regex as re
13
- import torch
14
-
15
- # https://stackoverflow.com/q/62691279
16
- import os
17
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
-
19
-
20
- @lru_cache()
21
- def default_bpe():
22
- return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
23
-
24
-
25
- @lru_cache()
26
- def bytes_to_unicode():
27
- """
28
- Returns list of utf-8 byte and a corresponding list of unicode strings.
29
- The reversible bpe codes work on unicode strings.
30
- This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
31
- When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
32
- This is a signficant percentage of your normal, say, 32K bpe vocab.
33
- To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
34
- And avoids mapping to whitespace/control characters the bpe code barfs on.
35
- """
36
- bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
37
- cs = bs[:]
38
- n = 0
39
- for b in range(2**8):
40
- if b not in bs:
41
- bs.append(b)
42
- cs.append(2**8+n)
43
- n += 1
44
- cs = [chr(n) for n in cs]
45
- return dict(zip(bs, cs))
46
-
47
-
48
- def get_pairs(word):
49
- """Return set of symbol pairs in a word.
50
- Word is represented as tuple of symbols (symbols being variable-length strings).
51
- """
52
- pairs = set()
53
- prev_char = word[0]
54
- for char in word[1:]:
55
- pairs.add((prev_char, char))
56
- prev_char = char
57
- return pairs
58
-
59
-
60
- def basic_clean(text):
61
- text = ftfy.fix_text(text)
62
- text = html.unescape(html.unescape(text))
63
- return text.strip()
64
-
65
-
66
- def whitespace_clean(text):
67
- text = re.sub(r'\s+', ' ', text)
68
- text = text.strip()
69
- return text
70
-
71
-
72
- class SimpleTokenizer(object):
73
- def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
74
- self.byte_encoder = bytes_to_unicode()
75
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
76
- merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
77
- merges = merges[1:49152-256-2+1]
78
- merges = [tuple(merge.split()) for merge in merges]
79
- vocab = list(bytes_to_unicode().values())
80
- vocab = vocab + [v+'</w>' for v in vocab]
81
- for merge in merges:
82
- vocab.append(''.join(merge))
83
- if not special_tokens:
84
- special_tokens = ['<start_of_text>', '<end_of_text>']
85
- else:
86
- special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
87
- vocab.extend(special_tokens)
88
- self.encoder = dict(zip(vocab, range(len(vocab))))
89
- self.decoder = {v: k for k, v in self.encoder.items()}
90
- self.bpe_ranks = dict(zip(merges, range(len(merges))))
91
- self.cache = {t:t for t in special_tokens}
92
- special = "|".join(special_tokens)
93
- self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
94
-
95
- self.vocab_size = len(self.encoder)
96
- self.all_special_ids = [self.encoder[t] for t in special_tokens]
97
-
98
- def bpe(self, token):
99
- if token in self.cache:
100
- return self.cache[token]
101
- word = tuple(token[:-1]) + ( token[-1] + '</w>',)
102
- pairs = get_pairs(word)
103
-
104
- if not pairs:
105
- return token+'</w>'
106
-
107
- while True:
108
- bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
109
- if bigram not in self.bpe_ranks:
110
- break
111
- first, second = bigram
112
- new_word = []
113
- i = 0
114
- while i < len(word):
115
- try:
116
- j = word.index(first, i)
117
- new_word.extend(word[i:j])
118
- i = j
119
- except:
120
- new_word.extend(word[i:])
121
- break
122
-
123
- if word[i] == first and i < len(word)-1 and word[i+1] == second:
124
- new_word.append(first+second)
125
- i += 2
126
- else:
127
- new_word.append(word[i])
128
- i += 1
129
- new_word = tuple(new_word)
130
- word = new_word
131
- if len(word) == 1:
132
- break
133
- else:
134
- pairs = get_pairs(word)
135
- word = ' '.join(word)
136
- self.cache[token] = word
137
- return word
138
-
139
- def encode(self, text):
140
- bpe_tokens = []
141
- text = whitespace_clean(basic_clean(text)).lower()
142
- for token in re.findall(self.pat, text):
143
- token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
144
- bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
145
- return bpe_tokens
146
-
147
- def decode(self, tokens):
148
- text = ''.join([self.decoder[token] for token in tokens])
149
- text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
150
- return text
151
-
152
-
153
- _tokenizer = SimpleTokenizer()
154
-
155
-
156
- def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
157
- """
158
- Returns the tokenized representation of given input string(s)
159
-
160
- Parameters
161
- ----------
162
- texts : Union[str, List[str]]
163
- An input string or a list of input strings to tokenize
164
- context_length : int
165
- The context length to use; all CLIP models use 77 as the context length
166
-
167
- Returns
168
- -------
169
- A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
170
- """
171
- if isinstance(texts, str):
172
- texts = [texts]
173
-
174
- sot_token = _tokenizer.encoder["<start_of_text>"]
175
- eot_token = _tokenizer.encoder["<end_of_text>"]
176
- all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
177
- result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
178
-
179
- for i, tokens in enumerate(all_tokens):
180
- if len(tokens) > context_length:
181
- tokens = tokens[:context_length] # Truncate
182
- tokens[-1] = eot_token
183
- result[i, :len(tokens)] = torch.tensor(tokens)
184
-
185
- return result
186
-
187
-
188
- class HFTokenizer:
189
- "HuggingFace tokenizer wrapper"
190
- def __init__(self, tokenizer_name:str):
191
- from transformers import AutoTokenizer
192
- self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
193
-
194
- def __call__(self, texts:Union[str, List[str]], context_length:int=77) -> torch.Tensor:
195
- # same cleaning as for default tokenizer, except lowercasing
196
- # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
197
- if isinstance(texts, str):
198
- texts = [texts]
199
- texts = [whitespace_clean(basic_clean(text)) for text in texts]
200
- input_ids = self.tokenizer(texts, return_tensors='pt', max_length=context_length, padding='max_length', truncation=True).input_ids
201
- return input_ids
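
The tokenizer is the stock CLIP BPE: tokenize prepends <start_of_text>, appends <end_of_text>, truncates, and zero-pads each string to the context length. A short sketch, assuming the pre-merge module path with its bundled BPE vocab and ftfy/regex installed:

    from eva_clip.tokenizer import tokenize  # pre-merge module path

    tokens = tokenize(["a photo of a cat", "a diagram"])
    print(tokens.shape)   # torch.Size([2, 77])
    print(tokens[0, :6])  # start token followed by the BPE ids for 'a photo of a cat'
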
eva_clip/transform.py DELETED
@@ -1,103 +0,0 @@
1
- from typing import Optional, Sequence, Tuple
2
-
3
- import torch
4
- import torch.nn as nn
5
- import torchvision.transforms.functional as F
6
-
7
- from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
8
- CenterCrop
9
-
10
- from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
11
-
12
-
13
- class ResizeMaxSize(nn.Module):
14
-
15
- def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
16
- super().__init__()
17
- if not isinstance(max_size, int):
18
- raise TypeError(f"Size should be int. Got {type(max_size)}")
19
- self.max_size = max_size
20
- self.interpolation = interpolation
21
- self.fn = min if fn == 'min' else max
22
- self.fill = fill
23
-
24
- def forward(self, img):
25
- if isinstance(img, torch.Tensor):
26
- height, width = img.shape[:2]
27
- else:
28
- width, height = img.size
29
- scale = self.max_size / float(max(height, width))
30
- if scale != 1.0:
31
- new_size = tuple(round(dim * scale) for dim in (height, width))
32
- img = F.resize(img, new_size, self.interpolation)
33
- pad_h = self.max_size - new_size[0]
34
- pad_w = self.max_size - new_size[1]
35
- img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill)
36
- return img
37
-
38
-
39
- def _convert_to_rgb(image):
40
- return image.convert('RGB')
41
-
42
-
43
- # class CatGen(nn.Module):
44
- # def __init__(self, num=4):
45
- # self.num = num
46
- # def mixgen_batch(image, text):
47
- # batch_size = image.shape[0]
48
- # index = np.random.permutation(batch_size)
49
-
50
- # cat_images = []
51
- # for i in range(batch_size):
52
- # # image mixup
53
- # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:]
54
- # # text concat
55
- # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0]
56
- # text = torch.stack(text)
57
- # return image, text
58
-
59
-
60
- def image_transform(
61
- image_size: int,
62
- is_train: bool,
63
- mean: Optional[Tuple[float, ...]] = None,
64
- std: Optional[Tuple[float, ...]] = None,
65
- resize_longest_max: bool = False,
66
- fill_color: int = 0,
67
- ):
68
- mean = mean or OPENAI_DATASET_MEAN
69
- if not isinstance(mean, (list, tuple)):
70
- mean = (mean,) * 3
71
-
72
- std = std or OPENAI_DATASET_STD
73
- if not isinstance(std, (list, tuple)):
74
- std = (std,) * 3
75
-
76
- if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
77
- # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
78
- image_size = image_size[0]
79
-
80
- normalize = Normalize(mean=mean, std=std)
81
- if is_train:
82
- return Compose([
83
- RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC),
84
- _convert_to_rgb,
85
- ToTensor(),
86
- normalize,
87
- ])
88
- else:
89
- if resize_longest_max:
90
- transforms = [
91
- ResizeMaxSize(image_size, fill=fill_color)
92
- ]
93
- else:
94
- transforms = [
95
- Resize(image_size, interpolation=InterpolationMode.BICUBIC),
96
- CenterCrop(image_size),
97
- ]
98
- transforms.extend([
99
- _convert_to_rgb,
100
- ToTensor(),
101
- normalize,
102
- ])
103
- return Compose(transforms)
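
image_transform returns a torchvision Compose: random resized crops for training, resize plus center crop (or ResizeMaxSize letterboxing) for evaluation, always followed by RGB conversion, ToTensor, and OpenAI-mean/std normalization. A sketch, assuming the pre-merge module path; the blank PIL image stands in for a real photo:

    from PIL import Image

    from eva_clip.transform import image_transform  # pre-merge module path

    preprocess = image_transform(image_size=224, is_train=False)
    tensor = preprocess(Image.new("RGB", (640, 480)))
    print(tensor.shape)  # torch.Size([3, 224, 224])
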
eva_clip/transformer.py DELETED
@@ -1,737 +0,0 @@
1
- import os
2
- import logging
3
- from collections import OrderedDict
4
- import math
5
- from typing import Callable, Optional, Sequence
6
- import numpy as np
7
- import torch
8
- from torch import nn
9
- from torch.nn import functional as F
10
-
11
- try:
12
- from timm.models.layers import trunc_normal_
13
- except:
14
- from timm.layers import trunc_normal_
15
-
16
- from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast
17
- from .utils import to_2tuple
18
-
19
- if os.getenv('ENV_TYPE') == 'deepspeed':
20
- try:
21
- import deepspeed
22
- from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
23
- except:
24
- print("Please 'pip install deepspeed'")
25
- deepspeed = None
26
- from torch.utils.checkpoint import checkpoint
27
- else:
28
- from torch.utils.checkpoint import checkpoint
29
-
30
- try:
31
- import xformers.ops as xops
32
- except ImportError:
33
- xops = None
34
- print("Please 'pip install xformers'")
35
-
36
- class LayerNormFp32(nn.LayerNorm):
37
- """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back)."""
38
- def __init__(self, *args, **kwargs):
39
- super().__init__(*args, **kwargs)
40
-
41
- def forward(self, x: torch.Tensor):
42
- output = F.layer_norm(
43
- x.float(),
44
- self.normalized_shape,
45
- self.weight.float() if self.weight is not None else None,
46
- self.bias.float() if self.bias is not None else None,
47
- self.eps,
48
- )
49
- return output.type_as(x)
50
-
51
-
52
- class LayerNorm(nn.LayerNorm):
53
- """Subclass torch's LayerNorm (with cast back to input dtype)."""
54
-
55
- def forward(self, x: torch.Tensor):
56
- orig_type = x.dtype
57
- x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
58
- return x.to(orig_type)
59
-
60
- class QuickGELU(nn.Module):
61
- # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
62
- def forward(self, x: torch.Tensor):
63
- return x * torch.sigmoid(1.702 * x)
64
-
65
-
66
- class LayerScale(nn.Module):
67
- def __init__(self, dim, init_values=1e-5, inplace=False):
68
- super().__init__()
69
- self.inplace = inplace
70
- self.gamma = nn.Parameter(init_values * torch.ones(dim))
71
-
72
- def forward(self, x):
73
- return x.mul_(self.gamma) if self.inplace else x * self.gamma
74
-
75
- class PatchDropout(nn.Module):
76
- """
77
- https://arxiv.org/abs/2212.00794
78
- """
79
-
80
- def __init__(self, prob, exclude_first_token=True):
81
- super().__init__()
82
- assert 0 <= prob < 1.
83
- self.prob = prob
84
- self.exclude_first_token = exclude_first_token # exclude CLS token
85
- logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}")
86
-
87
- def forward(self, x):
88
- if not self.training or self.prob == 0.:
89
- return x
90
-
91
- if self.exclude_first_token:
92
- cls_tokens, x = x[:, :1], x[:, 1:]
93
- else:
94
- cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
95
-
96
- batch = x.size()[0]
97
- num_tokens = x.size()[1]
98
-
99
- batch_indices = torch.arange(batch)
100
- batch_indices = batch_indices[..., None]
101
-
102
- keep_prob = 1 - self.prob
103
- num_patches_keep = max(1, int(num_tokens * keep_prob))
104
-
105
- rand = torch.randn(batch, num_tokens)
106
- patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
107
-
108
- x = x[batch_indices, patch_indices_keep]
109
-
110
- if self.exclude_first_token:
111
- x = torch.cat((cls_tokens, x), dim=1)
112
-
113
- if self.training and os.getenv('RoPE') == '1':
114
- return x, patch_indices_keep
115
-
116
- return x
117
-
118
-
119
- def _in_projection_packed(
120
- q: torch.Tensor,
121
- k: torch.Tensor,
122
- v: torch.Tensor,
123
- w: torch.Tensor,
124
- b: Optional[torch.Tensor] = None,
125
- ):
126
- """
127
- https://github.com/pytorch/pytorch/blob/db2a237763eb8693a20788be94f8c192e762baa8/torch/nn/functional.py#L4726
128
- """
129
- E = q.size(-1)
130
- if k is v:
131
- if q is k:
132
- # self-attention
133
- return F.linear(q, w, b).chunk(3, dim=-1)
134
- else:
135
- # encoder-decoder attention
136
- w_q, w_kv = w.split([E, E * 2])
137
- if b is None:
138
- b_q = b_kv = None
139
- else:
140
- b_q, b_kv = b.split([E, E * 2])
141
- return (F.linear(q, w_q, b_q),) + F.linear(k, w_kv, b_kv).chunk(2, dim=-1)
142
- else:
143
- w_q, w_k, w_v = w.chunk(3)
144
- if b is None:
145
- b_q = b_k = b_v = None
146
- else:
147
- b_q, b_k, b_v = b.chunk(3)
148
- return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
149
-
150
- class Attention(nn.Module):
151
- def __init__(
152
- self,
153
- dim,
154
- num_heads=8,
155
- qkv_bias=True,
156
- scaled_cosine=False,
157
- scale_heads=False,
158
- logit_scale_max=math.log(1. / 0.01),
159
- attn_drop=0.,
160
- proj_drop=0.,
161
- xattn=False,
162
- rope=False
163
- ):
164
- super().__init__()
165
- self.scaled_cosine = scaled_cosine
166
- self.scale_heads = scale_heads
167
- assert dim % num_heads == 0, 'dim should be divisible by num_heads'
168
- self.num_heads = num_heads
169
- self.head_dim = dim // num_heads
170
- self.scale = self.head_dim ** -0.5
171
- self.logit_scale_max = logit_scale_max
172
-
173
- # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
174
- self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
175
- if qkv_bias:
176
- self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
177
- else:
178
- self.in_proj_bias = None
179
-
180
- if self.scaled_cosine:
181
- self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
182
- else:
183
- self.logit_scale = None
184
- self.attn_drop = nn.Dropout(attn_drop)
185
- if self.scale_heads:
186
- self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
187
- else:
188
- self.head_scale = None
189
- self.out_proj = nn.Linear(dim, dim)
190
- self.out_drop = nn.Dropout(proj_drop)
191
- self.xattn = xattn
192
- self.xattn_drop = attn_drop
193
- self.rope = rope
194
-
195
- def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
196
- L, N, C = x.shape
197
- q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
198
- if self.xattn:
199
- q = q.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1)
200
- k = k.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1)
201
- v = v.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1)
202
-
203
- x = xops.memory_efficient_attention(
204
- q, k, v,
205
- p=self.xattn_drop,
206
- scale=self.scale if self.logit_scale is None else None,
207
- attn_bias=xops.LowerTriangularMask() if attn_mask is not None else None,
208
- )
209
- else:
210
- q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
211
- k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
212
- v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
213
-
214
- if self.logit_scale is not None:
215
- attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
216
- logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
217
- attn = attn.view(N, self.num_heads, L, L) * logit_scale
218
- attn = attn.view(-1, L, L)
219
- else:
220
- q = q * self.scale
221
- attn = torch.bmm(q, k.transpose(-1, -2))
222
-
223
- if attn_mask is not None:
224
- if attn_mask.dtype == torch.bool:
225
- new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
226
- new_attn_mask.masked_fill_(attn_mask, float("-inf"))
227
- attn_mask = new_attn_mask
228
- attn += attn_mask
229
-
230
- attn = attn.softmax(dim=-1)
231
- attn = self.attn_drop(attn)
232
-
233
- x = torch.bmm(attn, v)
234
-
235
- if self.head_scale is not None:
236
- x = x.view(N, self.num_heads, L, C) * self.head_scale
237
- x = x.view(-1, L, C)
238
- x = x.transpose(0, 1).reshape(L, N, C)
239
- x = self.out_proj(x)
240
- x = self.out_drop(x)
241
- return x
242
-
243
- class CustomAttention(nn.Module):
244
- def __init__(
245
- self,
246
- dim,
247
- num_heads=8,
248
- qkv_bias=True,
249
- scaled_cosine=True,
250
- scale_heads=False,
251
- logit_scale_max=math.log(1. / 0.01),
252
- attn_drop=0.,
253
- proj_drop=0.,
254
- xattn=False
255
- ):
256
- super().__init__()
257
- self.scaled_cosine = scaled_cosine
258
- self.scale_heads = scale_heads
259
- assert dim % num_heads == 0, 'dim should be divisible by num_heads'
260
- self.num_heads = num_heads
261
- self.head_dim = dim // num_heads
262
- self.scale = self.head_dim ** -0.5
263
- self.logit_scale_max = logit_scale_max
264
-
265
- # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
266
- self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
267
- if qkv_bias:
268
- self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
269
- else:
270
- self.in_proj_bias = None
271
-
272
- if self.scaled_cosine:
273
- self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
274
- else:
275
- self.logit_scale = None
276
- self.attn_drop = nn.Dropout(attn_drop)
277
- if self.scale_heads:
278
- self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
279
- else:
280
- self.head_scale = None
281
- self.out_proj = nn.Linear(dim, dim)
282
- self.out_drop = nn.Dropout(proj_drop)
283
- self.xattn = xattn
284
- self.xattn_drop = attn_drop
285
-
286
- def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
287
- q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, self.in_proj_bias)
288
- N_q, B_q, C_q = q.shape
289
- N_k, B_k, C_k = k.shape
290
- N_v, B_v, C_v = v.shape
291
- if self.xattn:
292
- # B, N, C -> B, N, num_heads, C
293
- q = q.permute(1, 0, 2).reshape(B_q, N_q, self.num_heads, -1)
294
- k = k.permute(1, 0, 2).reshape(B_k, N_k, self.num_heads, -1)
295
- v = v.permute(1, 0, 2).reshape(B_v, N_v, self.num_heads, -1)
296
-
297
- x = xops.memory_efficient_attention(
298
- q, k, v,
299
- p=self.xattn_drop,
300
- scale=self.scale if self.logit_scale is None else None,
301
- attn_bias=xops.LowerTriangularMask() if attn_mask is not None else None
302
- )
303
- else:
304
- # B*H, L, C
305
- q = q.contiguous().view(N_q, B_q * self.num_heads, -1).transpose(0, 1)
306
- k = k.contiguous().view(N_k, B_k * self.num_heads, -1).transpose(0, 1)
307
- v = v.contiguous().view(N_v, B_v * self.num_heads, -1).transpose(0, 1)
308
-
309
- if self.logit_scale is not None:
310
- # B*H, N_q, N_k
311
- attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
312
- logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
313
- attn = attn.view(B_q, self.num_heads, N_q, N_k) * logit_scale
314
- attn = attn.view(-1, N_q, N_k)
315
- else:
316
- q = q * self.scale
317
- attn = torch.bmm(q, k.transpose(-1, -2))
318
-
319
- if attn_mask is not None:
320
- if attn_mask.dtype == torch.bool:
321
- new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
322
- new_attn_mask.masked_fill_(attn_mask, float("-inf"))
323
- attn_mask = new_attn_mask
324
- attn += attn_mask
325
-
326
- attn = attn.softmax(dim=-1)
327
- attn = self.attn_drop(attn)
328
-
329
- x = torch.bmm(attn, v)
330
-
331
- if self.head_scale is not None:
332
- x = x.view(B_q, self.num_heads, N_q, C_q) * self.head_scale
333
- x = x.view(-1, N_q, C_q)
334
- x = x.transpose(0, 1).reshape(N_q, B_q, C_q)
335
- x = self.out_proj(x)
336
- x = self.out_drop(x)
337
- return x
338
-
339
- class CustomResidualAttentionBlock(nn.Module):
340
- def __init__(
341
- self,
342
- d_model: int,
343
- n_head: int,
344
- mlp_ratio: float = 4.0,
345
- ls_init_value: float = None,
346
- act_layer: Callable = nn.GELU,
347
- norm_layer: Callable = LayerNorm,
348
- scale_cosine_attn: bool = False,
349
- scale_heads: bool = False,
350
- scale_attn: bool = False,
351
- scale_fc: bool = False,
352
- cross_attn: bool = False,
353
- xattn: bool = False,
354
- ):
355
- super().__init__()
356
-
357
- self.ln_1 = norm_layer(d_model)
358
- self.ln_1_k = norm_layer(d_model) if cross_attn else self.ln_1
359
- self.ln_1_v = norm_layer(d_model) if cross_attn else self.ln_1
360
- self.attn = CustomAttention(
361
- d_model, n_head,
362
- qkv_bias=True,
363
- attn_drop=0.,
364
- proj_drop=0.,
365
- scaled_cosine=scale_cosine_attn,
366
- scale_heads=scale_heads,
367
- xattn=xattn
368
- )
369
-
370
- self.ln_attn = norm_layer(d_model) if scale_attn else nn.Identity()
371
- self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
372
-
373
- self.ln_2 = norm_layer(d_model)
374
- mlp_width = int(d_model * mlp_ratio)
375
- self.mlp = nn.Sequential(OrderedDict([
376
- ("c_fc", nn.Linear(d_model, mlp_width)),
377
- ('ln', norm_layer(mlp_width) if scale_fc else nn.Identity()),
378
- ("gelu", act_layer()),
379
- ("c_proj", nn.Linear(mlp_width, d_model))
380
- ]))
381
-
382
- self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
383
-
384
- def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
385
- q = q + self.ls_1(self.ln_attn(self.attn(self.ln_1(q), self.ln_1_k(k), self.ln_1_v(v), attn_mask=attn_mask)))
386
- q = q + self.ls_2(self.mlp(self.ln_2(q)))
387
- return q
388
-
389
- class CustomTransformer(nn.Module):
390
- def __init__(
391
- self,
392
- width: int,
393
- layers: int,
394
- heads: int,
395
- mlp_ratio: float = 4.0,
396
- ls_init_value: float = None,
397
- act_layer: Callable = nn.GELU,
398
- norm_layer: Callable = LayerNorm,
399
- scale_cosine_attn: bool = True,
400
- scale_heads: bool = False,
401
- scale_attn: bool = False,
402
- scale_fc: bool = False,
403
- cross_attn: bool = False,
404
- xattn: bool = False,
405
- ):
406
- super().__init__()
407
- self.width = width
408
- self.layers = layers
409
- self.grad_checkpointing = False
410
- self.xattn = xattn
411
-
412
- self.resblocks = nn.ModuleList([
413
- CustomResidualAttentionBlock(
414
- width,
415
- heads,
416
- mlp_ratio,
417
- ls_init_value=ls_init_value,
418
- act_layer=act_layer,
419
- norm_layer=norm_layer,
420
- scale_cosine_attn=scale_cosine_attn,
421
- scale_heads=scale_heads,
422
- scale_attn=scale_attn,
423
- scale_fc=scale_fc,
424
- cross_attn=cross_attn,
425
- xattn=xattn)
426
- for _ in range(layers)
427
- ])
428
-
429
- def get_cast_dtype(self) -> torch.dtype:
430
- return self.resblocks[0].mlp.c_fc.weight.dtype
431
-
432
- def forward(self, q: torch.Tensor, k: torch.Tensor = None, v: torch.Tensor = None, attn_mask: Optional[torch.Tensor] = None):
433
- if k is None and v is None:
434
- k = v = q
435
- for r in self.resblocks:
436
- if self.grad_checkpointing and not torch.jit.is_scripting():
437
- q = checkpoint(r, q, k, v, attn_mask)
438
- else:
439
- q = r(q, k, v, attn_mask=attn_mask)
440
- return q
441
-
442
-
443
- class ResidualAttentionBlock(nn.Module):
444
- def __init__(
445
- self,
446
- d_model: int,
447
- n_head: int,
448
- mlp_ratio: float = 4.0,
449
- ls_init_value: float = None,
450
- act_layer: Callable = nn.GELU,
451
- norm_layer: Callable = LayerNorm,
452
- xattn: bool = False,
453
- ):
454
- super().__init__()
455
-
456
- self.ln_1 = norm_layer(d_model)
457
- if xattn:
458
- self.attn = Attention(d_model, n_head, xattn=True)
459
- else:
460
- self.attn = nn.MultiheadAttention(d_model, n_head)
461
- self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
462
-
463
- self.ln_2 = norm_layer(d_model)
464
- mlp_width = int(d_model * mlp_ratio)
465
- self.mlp = nn.Sequential(OrderedDict([
466
- ("c_fc", nn.Linear(d_model, mlp_width)),
467
- ("gelu", act_layer()),
468
- ("c_proj", nn.Linear(mlp_width, d_model))
469
- ]))
470
-
471
- self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
472
- self.xattn = xattn
473
-
474
- def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
475
- attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None
476
- if self.xattn:
477
- return self.attn(x, attn_mask=attn_mask)
478
- return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
479
-
480
- def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
481
- x = x + self.ls_1(self.attention(self.ln_1(x), attn_mask=attn_mask))
482
- x = x + self.ls_2(self.mlp(self.ln_2(x)))
483
- return x
484
-
485
- class Transformer(nn.Module):
486
- def __init__(
487
- self,
488
- width: int,
489
- layers: int,
490
- heads: int,
491
- mlp_ratio: float = 4.0,
492
- ls_init_value: float = None,
493
- act_layer: Callable = nn.GELU,
494
- norm_layer: Callable = LayerNorm,
495
- xattn: bool = False,
496
- ):
497
- super().__init__()
498
- self.width = width
499
- self.layers = layers
500
- self.grad_checkpointing = False
501
-
502
- self.resblocks = nn.ModuleList([
503
- ResidualAttentionBlock(
504
- width, heads, mlp_ratio, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer, xattn=xattn)
505
- for _ in range(layers)
506
- ])
507
-
508
- def get_cast_dtype(self) -> torch.dtype:
509
- return self.resblocks[0].mlp.c_fc.weight.dtype
510
-
511
- def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
512
- for r in self.resblocks:
513
- if self.grad_checkpointing and not torch.jit.is_scripting():
514
- x = checkpoint(r, x, attn_mask)
515
- else:
516
- x = r(x, attn_mask=attn_mask)
517
- return x
518
-
519
-
520
- class VisionTransformer(nn.Module):
521
- def __init__(
522
- self,
523
- image_size: int,
524
- patch_size: int,
525
- width: int,
526
- layers: int,
527
- heads: int,
528
- mlp_ratio: float,
529
- ls_init_value: float = None,
530
- patch_dropout: float = 0.,
531
- global_average_pool: bool = False,
532
- output_dim: int = 512,
533
- act_layer: Callable = nn.GELU,
534
- norm_layer: Callable = LayerNorm,
535
- xattn: bool = False,
536
- ):
537
- super().__init__()
538
- self.image_size = to_2tuple(image_size)
539
- self.patch_size = to_2tuple(patch_size)
540
- self.grid_size = (self.image_size[0] // self.patch_size[0], self.image_size[1] // self.patch_size[1])
541
- self.output_dim = output_dim
542
- self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
543
-
544
- scale = width ** -0.5
545
- self.class_embedding = nn.Parameter(scale * torch.randn(width))
546
- self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width))
547
-
548
- # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
549
- self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
550
- self.ln_pre = norm_layer(width)
551
-
552
- self.transformer = Transformer(
553
- width,
554
- layers,
555
- heads,
556
- mlp_ratio,
557
- ls_init_value=ls_init_value,
558
- act_layer=act_layer,
559
- norm_layer=norm_layer,
560
- xattn=xattn
561
- )
562
-
563
- self.global_average_pool = global_average_pool
564
- self.ln_post = norm_layer(width)
565
- self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
566
-
567
- def lock(self, unlocked_groups=0, freeze_bn_stats=False):
568
- for param in self.parameters():
569
- param.requires_grad = False
570
-
571
- if unlocked_groups != 0:
572
- groups = [
573
- [
574
- self.conv1,
575
- self.class_embedding,
576
- self.positional_embedding,
577
- self.ln_pre,
578
- ],
579
- *self.transformer.resblocks[:-1],
580
- [
581
- self.transformer.resblocks[-1],
582
- self.ln_post,
583
- ],
584
- self.proj,
585
- ]
586
-
587
- def _unlock(x):
588
- if isinstance(x, Sequence):
589
- for g in x:
590
- _unlock(g)
591
- else:
592
- if isinstance(x, torch.nn.Parameter):
593
- x.requires_grad = True
594
- else:
595
- for p in x.parameters():
596
- p.requires_grad = True
597
-
598
- _unlock(groups[-unlocked_groups:])
599
-
600
- def get_num_layers(self):
601
- return self.transformer.layers
602
-
603
- @torch.jit.ignore
604
- def set_grad_checkpointing(self, enable=True):
605
- self.transformer.grad_checkpointing = enable
606
-
607
- @torch.jit.ignore
608
- def no_weight_decay(self):
609
- return {'positional_embedding', 'class_embedding'}
610
-
611
- def forward(self, x: torch.Tensor, return_all_features: bool=False):
612
- x = self.conv1(x) # shape = [*, width, grid, grid]
613
- x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
614
- x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
615
- x = torch.cat(
616
- [self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
617
- x], dim=1) # shape = [*, grid ** 2 + 1, width]
618
- x = x + self.positional_embedding.to(x.dtype)
619
-
620
- # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
621
- x = self.patch_dropout(x)
622
- x = self.ln_pre(x)
623
-
624
- x = x.permute(1, 0, 2) # NLD -> LND
625
- x = self.transformer(x)
626
- x = x.permute(1, 0, 2) # LND -> NLD
627
-
628
- if not return_all_features:
629
- if self.global_average_pool:
630
- x = x.mean(dim=1) #x = x[:,1:,:].mean(dim=1)
631
- else:
632
- x = x[:, 0]
633
-
634
- x = self.ln_post(x)
635
-
636
- if self.proj is not None:
637
- x = x @ self.proj
638
-
639
- return x
640
-
641
-
642
- class TextTransformer(nn.Module):
643
- def __init__(
644
- self,
645
- context_length: int = 77,
646
- vocab_size: int = 49408,
647
- width: int = 512,
648
- heads: int = 8,
649
- layers: int = 12,
650
- ls_init_value: float = None,
651
- output_dim: int = 512,
652
- act_layer: Callable = nn.GELU,
653
- norm_layer: Callable = LayerNorm,
654
- xattn: bool= False,
655
- attn_mask: bool = True
656
- ):
657
- super().__init__()
658
- self.context_length = context_length
659
- self.vocab_size = vocab_size
660
- self.width = width
661
- self.output_dim = output_dim
662
-
663
- self.token_embedding = nn.Embedding(vocab_size, width)
664
- self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
665
- self.transformer = Transformer(
666
- width=width,
667
- layers=layers,
668
- heads=heads,
669
- ls_init_value=ls_init_value,
670
- act_layer=act_layer,
671
- norm_layer=norm_layer,
672
- xattn=xattn
673
- )
674
-
675
- self.xattn = xattn
676
- self.ln_final = norm_layer(width)
677
- self.text_projection = nn.Parameter(torch.empty(width, output_dim))
678
-
679
- if attn_mask:
680
- self.register_buffer('attn_mask', self.build_attention_mask(), persistent=False)
681
- else:
682
- self.attn_mask = None
683
-
684
- self.init_parameters()
685
-
686
- def init_parameters(self):
687
- nn.init.normal_(self.token_embedding.weight, std=0.02)
688
- nn.init.normal_(self.positional_embedding, std=0.01)
689
-
690
- proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
691
- attn_std = self.transformer.width ** -0.5
692
- fc_std = (2 * self.transformer.width) ** -0.5
693
- for block in self.transformer.resblocks:
694
- nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
695
- nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
696
- nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
697
- nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
698
-
699
- if self.text_projection is not None:
700
- nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
701
-
702
- @torch.jit.ignore
703
- def set_grad_checkpointing(self, enable=True):
704
- self.transformer.grad_checkpointing = enable
705
-
706
- @torch.jit.ignore
707
- def no_weight_decay(self):
708
- # return {'positional_embedding', 'token_embedding'}
709
- return {'positional_embedding'}
710
-
711
- def get_num_layers(self):
712
- return self.transformer.layers
713
-
714
- def build_attention_mask(self):
715
- # lazily create causal attention mask, with full attention between the vision tokens
716
- # pytorch uses additive attention mask; fill with -inf
717
- mask = torch.empty(self.context_length, self.context_length)
718
- mask.fill_(float("-inf"))
719
- mask.triu_(1) # zero out the lower diagonal
720
- return mask
721
-
722
- def forward(self, text, return_all_features: bool=False):
723
- cast_dtype = self.transformer.get_cast_dtype()
724
- x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
725
-
726
- x = x + self.positional_embedding.to(cast_dtype)
727
- x = x.permute(1, 0, 2) # NLD -> LND
728
- x = self.transformer(x, attn_mask=self.attn_mask)
729
- # x = self.transformer(x) # no attention mask is applied
730
- x = x.permute(1, 0, 2) # LND -> NLD
731
- x = self.ln_final(x)
732
-
733
- if not return_all_features:
734
- # x.shape = [batch_size, n_ctx, transformer.width]
735
- # take features from the eot embedding (eot_token is the highest number in each sequence)
736
- x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
737
- return x
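Note: of the deleted `transformer.py`, the detail most likely to matter downstream is how the text tower masked attention. `build_attention_mask` returned an additive causal mask; the small self-contained sketch below (sequence length 4 chosen arbitrarily) reproduces that logic and shows the resulting tensor.

```python
# Sketch of the additive causal mask built by the deleted TextTransformer.build_attention_mask():
# -inf strictly above the diagonal, 0 on and below it, so token i attends only to tokens <= i
# once the mask is added to the attention logits.
import torch

def build_causal_mask(context_length: int) -> torch.Tensor:
    mask = torch.full((context_length, context_length), float("-inf"))
    return mask.triu_(1)  # zero out entries on and below the main diagonal

print(build_causal_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
```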
eva_clip/utils.py DELETED
@@ -1,326 +0,0 @@
1
- from itertools import repeat
2
- import collections.abc
3
- import logging
4
- import math
5
- import numpy as np
6
-
7
- import torch
8
- from torch import nn as nn
9
- from torchvision.ops.misc import FrozenBatchNorm2d
10
- import torch.nn.functional as F
11
-
12
- # open CLIP
13
- def resize_clip_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
14
- # Rescale the grid of position embeddings when loading from state_dict
15
- old_pos_embed = state_dict.get('visual.positional_embedding', None)
16
- if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
17
- return
18
- grid_size = to_2tuple(model.visual.grid_size)
19
- extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
20
- new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
21
- if new_seq_len == old_pos_embed.shape[0]:
22
- return
23
-
24
- if extra_tokens:
25
- pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
26
- else:
27
- pos_emb_tok, pos_emb_img = None, old_pos_embed
28
- old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
29
-
30
- logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
31
- pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
32
- pos_emb_img = F.interpolate(
33
- pos_emb_img,
34
- size=grid_size,
35
- mode=interpolation,
36
- align_corners=True,
37
- )
38
- pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
39
- if pos_emb_tok is not None:
40
- new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
41
- else:
42
- new_pos_embed = pos_emb_img
43
- state_dict['visual.positional_embedding'] = new_pos_embed
44
-
45
-
46
- def resize_visual_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
47
- # Rescale the grid of position embeddings when loading from state_dict
48
- old_pos_embed = state_dict.get('positional_embedding', None)
49
- if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
50
- return
51
- grid_size = to_2tuple(model.visual.grid_size)
52
- extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
53
- new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
54
- if new_seq_len == old_pos_embed.shape[0]:
55
- return
56
-
57
- if extra_tokens:
58
- pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
59
- else:
60
- pos_emb_tok, pos_emb_img = None, old_pos_embed
61
- old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
62
-
63
- logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
64
- pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
65
- pos_emb_img = F.interpolate(
66
- pos_emb_img,
67
- size=grid_size,
68
- mode=interpolation,
69
- align_corners=True,
70
- )
71
- pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
72
- if pos_emb_tok is not None:
73
- new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
74
- else:
75
- new_pos_embed = pos_emb_img
76
- state_dict['positional_embedding'] = new_pos_embed
77
-
78
- def resize_evaclip_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
79
- all_keys = list(state_dict.keys())
80
- # interpolate position embedding
81
- if 'visual.pos_embed' in state_dict:
82
- pos_embed_checkpoint = state_dict['visual.pos_embed']
83
- embedding_size = pos_embed_checkpoint.shape[-1]
84
- num_patches = model.visual.patch_embed.num_patches
85
- num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches
86
- # height (== width) for the checkpoint position embedding
87
- orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
88
- # height (== width) for the new position embedding
89
- new_size = int(num_patches ** 0.5)
90
- # class_token and dist_token are kept unchanged
91
- if orig_size != new_size:
92
- print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
93
- extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
94
- # only the position tokens are interpolated
95
- pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
96
- pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
97
- pos_tokens = torch.nn.functional.interpolate(
98
- pos_tokens.float(), size=(new_size, new_size), mode='bicubic', align_corners=False)
99
- pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
100
- new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
101
- state_dict['visual.pos_embed'] = new_pos_embed
102
-
103
- patch_embed_proj = state_dict['visual.patch_embed.proj.weight']
104
- patch_size = model.visual.patch_embed.patch_size
105
- state_dict['visual.patch_embed.proj.weight'] = torch.nn.functional.interpolate(
106
- patch_embed_proj.float(), size=patch_size, mode='bicubic', align_corners=False)
107
-
108
-
109
- def resize_eva_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
110
- all_keys = list(state_dict.keys())
111
- # interpolate position embedding
112
- if 'pos_embed' in state_dict:
113
- pos_embed_checkpoint = state_dict['pos_embed']
114
- embedding_size = pos_embed_checkpoint.shape[-1]
115
- num_patches = model.visual.patch_embed.num_patches
116
- num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches
117
- # height (== width) for the checkpoint position embedding
118
- orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
119
- # height (== width) for the new position embedding
120
- new_size = int(num_patches ** 0.5)
121
- # class_token and dist_token are kept unchanged
122
- if orig_size != new_size:
123
- print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
124
- extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
125
- # only the position tokens are interpolated
126
- pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
127
- pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
128
- pos_tokens = torch.nn.functional.interpolate(
129
- pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
130
- pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
131
- new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
132
- state_dict['pos_embed'] = new_pos_embed
133
-
134
- patch_embed_proj = state_dict['patch_embed.proj.weight']
135
- patch_size = model.visual.patch_embed.patch_size
136
- state_dict['patch_embed.proj.weight'] = torch.nn.functional.interpolate(
137
- patch_embed_proj.float(), size=patch_size, mode='bicubic', align_corners=False)
138
-
139
-
140
- def resize_rel_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
141
- all_keys = list(state_dict.keys())
142
- for key in all_keys:
143
- if "relative_position_index" in key:
144
- state_dict.pop(key)
145
-
146
- if "relative_position_bias_table" in key:
147
- rel_pos_bias = state_dict[key]
148
- src_num_pos, num_attn_heads = rel_pos_bias.size()
149
- dst_num_pos, _ = model.visual.state_dict()[key].size()
150
- dst_patch_shape = model.visual.patch_embed.patch_shape
151
- if dst_patch_shape[0] != dst_patch_shape[1]:
152
- raise NotImplementedError()
153
- num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)
154
- src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
155
- dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
156
- if src_size != dst_size:
157
- print("Position interpolate for %s from %dx%d to %dx%d" % (
158
- key, src_size, src_size, dst_size, dst_size))
159
- extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
160
- rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
161
-
162
- def geometric_progression(a, r, n):
163
- return a * (1.0 - r ** n) / (1.0 - r)
164
-
165
- left, right = 1.01, 1.5
166
- while right - left > 1e-6:
167
- q = (left + right) / 2.0
168
- gp = geometric_progression(1, q, src_size // 2)
169
- if gp > dst_size // 2:
170
- right = q
171
- else:
172
- left = q
173
-
174
- # if q > 1.090307:
175
- # q = 1.090307
176
-
177
- dis = []
178
- cur = 1
179
- for i in range(src_size // 2):
180
- dis.append(cur)
181
- cur += q ** (i + 1)
182
-
183
- r_ids = [-_ for _ in reversed(dis)]
184
-
185
- x = r_ids + [0] + dis
186
- y = r_ids + [0] + dis
187
-
188
- t = dst_size // 2.0
189
- dx = np.arange(-t, t + 0.1, 1.0)
190
- dy = np.arange(-t, t + 0.1, 1.0)
191
-
192
- print("Original positions = %s" % str(x))
193
- print("Target positions = %s" % str(dx))
194
-
195
- all_rel_pos_bias = []
196
-
197
- for i in range(num_attn_heads):
198
- z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
199
- f = F.interpolate.interp2d(x, y, z, kind='cubic')
200
- all_rel_pos_bias.append(
201
- torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device))
202
-
203
- rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
204
-
205
- new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
206
- state_dict[key] = new_rel_pos_bias
207
-
208
- # interpolate position embedding
209
- if 'pos_embed' in state_dict:
210
- pos_embed_checkpoint = state_dict['pos_embed']
211
- embedding_size = pos_embed_checkpoint.shape[-1]
212
- num_patches = model.visual.patch_embed.num_patches
213
- num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches
214
- # height (== width) for the checkpoint position embedding
215
- orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
216
- # height (== width) for the new position embedding
217
- new_size = int(num_patches ** 0.5)
218
- # class_token and dist_token are kept unchanged
219
- if orig_size != new_size:
220
- print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
221
- extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
222
- # only the position tokens are interpolated
223
- pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
224
- pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
225
- pos_tokens = torch.nn.functional.interpolate(
226
- pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
227
- pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
228
- new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
229
- state_dict['pos_embed'] = new_pos_embed
230
-
231
- patch_embed_proj = state_dict['patch_embed.proj.weight']
232
- patch_size = model.visual.patch_embed.patch_size
233
- state_dict['patch_embed.proj.weight'] = torch.nn.functional.interpolate(
234
- patch_embed_proj.float(), size=patch_size, mode='bicubic', align_corners=False)
235
-
236
-
237
- def freeze_batch_norm_2d(module, module_match={}, name=''):
238
- """
239
- Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
240
- itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
241
- returned. Otherwise, the module is walked recursively and submodules are converted in place.
242
-
243
- Args:
244
- module (torch.nn.Module): Any PyTorch module.
245
- module_match (dict): Dictionary of full module names to freeze (all if empty)
246
- name (str): Full module name (prefix)
247
-
248
- Returns:
249
- torch.nn.Module: Resulting module
250
-
251
- Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
252
- """
253
- res = module
254
- is_match = True
255
- if module_match:
256
- is_match = name in module_match
257
- if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
258
- res = FrozenBatchNorm2d(module.num_features)
259
- res.num_features = module.num_features
260
- res.affine = module.affine
261
- if module.affine:
262
- res.weight.data = module.weight.data.clone().detach()
263
- res.bias.data = module.bias.data.clone().detach()
264
- res.running_mean.data = module.running_mean.data
265
- res.running_var.data = module.running_var.data
266
- res.eps = module.eps
267
- else:
268
- for child_name, child in module.named_children():
269
- full_child_name = '.'.join([name, child_name]) if name else child_name
270
- new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
271
- if new_child is not child:
272
- res.add_module(child_name, new_child)
273
- return res
274
-
275
-
276
- # From PyTorch internals
277
- def _ntuple(n):
278
- def parse(x):
279
- if isinstance(x, collections.abc.Iterable):
280
- return x
281
- return tuple(repeat(x, n))
282
- return parse
283
-
284
-
285
- to_1tuple = _ntuple(1)
286
- to_2tuple = _ntuple(2)
287
- to_3tuple = _ntuple(3)
288
- to_4tuple = _ntuple(4)
289
- to_ntuple = lambda n, x: _ntuple(n)(x)
290
-
291
-
292
- def is_logging(args):
293
- def is_global_master(args):
294
- return args.rank == 0
295
-
296
- def is_local_master(args):
297
- return args.local_rank == 0
298
-
299
- def is_master(args, local=False):
300
- return is_local_master(args) if local else is_global_master(args)
301
- return is_master
302
-
303
-
304
- class AllGather(torch.autograd.Function):
305
- """An autograd function that performs allgather on a tensor.
306
- Performs all_gather operation on the provided tensors.
307
- *** Warning ***: torch.distributed.all_gather has no gradient.
308
- """
309
-
310
- @staticmethod
311
- def forward(ctx, tensor, rank, world_size):
312
- tensors_gather = [torch.empty_like(tensor) for _ in range(world_size)]
313
- torch.distributed.all_gather(tensors_gather, tensor)
314
- ctx.rank = rank
315
- ctx.batch_size = tensor.shape[0]
316
- return torch.cat(tensors_gather, 0)
317
-
318
- @staticmethod
319
- def backward(ctx, grad_output):
320
- return (
321
- grad_output[ctx.batch_size * ctx.rank: ctx.batch_size * (ctx.rank + 1)],
322
- None,
323
- None
324
- )
325
-
326
- allgather = AllGather.apply
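Note: the `resize_*_pos_embed` helpers in the deleted `utils.py` all share one core step: keep the class-token embedding as-is, reshape the patch position embeddings into their 2D grid, bicubic-resize that grid to the new resolution, and flatten it back. A condensed, runnable sketch of just that step follows; the single extra (class) token, the 1024-dim width, and the 24-to-32 grid change (336 px to 448 px at patch size 14) are illustrative assumptions.

```python
# Condensed sketch of the 2D position-embedding interpolation used by the deleted
# resize_*_pos_embed helpers: class token kept, patch grid bicubic-resized.
import torch
import torch.nn.functional as F

def interpolate_pos_embed(pos_embed: torch.Tensor, new_grid: int, num_extra_tokens: int = 1) -> torch.Tensor:
    # pos_embed: [1, num_extra_tokens + old_grid**2, dim]
    dim = pos_embed.shape[-1]
    extra = pos_embed[:, :num_extra_tokens]
    patches = pos_embed[:, num_extra_tokens:]
    old_grid = int(patches.shape[1] ** 0.5)
    patches = patches.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)
    patches = F.interpolate(patches.float(), size=(new_grid, new_grid),
                            mode="bicubic", align_corners=False)
    patches = patches.permute(0, 2, 3, 1).flatten(1, 2)
    return torch.cat((extra, patches), dim=1)

# e.g. 336 px -> 448 px at patch size 14 means a 24x24 -> 32x32 grid:
resized = interpolate_pos_embed(torch.randn(1, 1 + 24 * 24, 1024), new_grid=32)
print(resized.shape)  # torch.Size([1, 1025, 1024])
```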
modeling_kangaroo.py CHANGED
@@ -17,8 +17,6 @@
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
- """PyTorch LLaMA model."""
21
-
22
  import math
23
  from typing import List, Optional, Tuple, Union
24
 
@@ -26,16 +24,15 @@ import torch
26
  import torch.nn.functional as F
27
  import torch.utils.checkpoint
28
  from torch import nn
29
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
30
 
 
31
  from transformers.activations import ACT2FN
32
  from transformers.cache_utils import Cache, DynamicCache, StaticCache
33
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
34
  from transformers.modeling_outputs import (
35
  BaseModelOutputWithPast,
36
  CausalLMOutputWithPast,
37
- QuestionAnsweringModelOutput,
38
- SequenceClassifierOutputWithPast,
39
  )
40
  from transformers.modeling_utils import PreTrainedModel
41
  from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
@@ -49,15 +46,14 @@ from transformers.utils import (
49
  )
50
  from transformers.models.llama.configuration_llama import LlamaConfig
51
 
52
- from eva_clip import create_model_and_transforms
53
  from .mm_projector_builder import build_vision_projector
 
54
 
55
  if is_flash_attn_2_available():
56
  from flash_attn import flash_attn_func, flash_attn_varlen_func
57
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
58
 
59
- from .data_utils import get_input, add_pred_to_history
60
- import transformers
61
 
62
  logger = logging.get_logger(__name__)
63
 
@@ -107,22 +103,6 @@ class LlamaRotaryEmbedding(nn.Module):
107
  self.register_buffer("inv_freq", inv_freq, persistent=False)
108
  # For BC we register cos and sin cached
109
  self.max_seq_len_cached = max_position_embeddings
110
-
111
- #@torch.no_grad()
112
- #def forward(self, x, position_ids):
113
- # # x: [bs, num_attention_heads, seq_len, head_size]
114
- # inv_freq_expanded = self.inv_freq[None, :, None].to(torch.bfloat16).expand(position_ids.shape[0], -1, 1)
115
- # position_ids_expanded = position_ids[:, None, :].to(torch.bfloat16)
116
- # # Force float32 since bfloat16 loses precision on long contexts
117
- # # See https://github.com/huggingface/transformers/pull/29285
118
- # device_type = x.device.type
119
- # device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
120
- # with torch.autocast(device_type=device_type, enabled=False):
121
- # freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
122
- # emb = torch.cat((freqs, freqs), dim=-1)
123
- # cos = emb.cos()
124
- # sin = emb.sin()
125
- # return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
126
 
127
  @torch.no_grad()
128
  def forward(self, x, position_ids):
@@ -179,7 +159,6 @@ def rotate_half(x):
179
 
180
  def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
181
  """Applies Rotary Position Embedding to the query and key tensors.
182
-
183
  Args:
184
  q (`torch.Tensor`): The query tensor.
185
  k (`torch.Tensor`): The key tensor.
@@ -504,7 +483,6 @@ class LlamaFlashAttention2(LlamaAttention):
504
  """
505
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
506
  first unpad the input, then computes the attention scores and pad the final attention scores.
507
-
508
  Args:
509
  query_states (`torch.Tensor`):
510
  Input query states to be passed to Flash Attention API
@@ -759,11 +737,9 @@ LLAMA_START_DOCSTRING = r"""
759
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
760
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
761
  etc.)
762
-
763
  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
764
  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
765
  and behavior.
766
-
767
  Parameters:
768
  config ([`LlamaConfig`]):
769
  Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -804,50 +780,38 @@ LLAMA_INPUTS_DOCSTRING = r"""
804
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
805
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
806
  it.
807
-
808
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
809
  [`PreTrainedTokenizer.__call__`] for details.
810
-
811
  [What are input IDs?](../glossary#input-ids)
812
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
813
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
814
-
815
  - 1 for tokens that are **not masked**,
816
  - 0 for tokens that are **masked**.
817
-
818
  [What are attention masks?](../glossary#attention-mask)
819
-
820
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
821
  [`PreTrainedTokenizer.__call__`] for details.
822
-
823
  If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
824
  `past_key_values`).
825
-
826
  If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
827
  and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
828
  information on the default strategy.
829
-
830
  - 1 indicates the head is **not masked**,
831
  - 0 indicates the head is **masked**.
832
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
833
  Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
834
  config.n_positions - 1]`.
835
-
836
  [What are position IDs?](../glossary#position-ids)
837
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
838
  Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
839
  blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
840
  returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
841
-
842
  Two formats are allowed:
843
  - a [`~cache_utils.Cache`] instance;
844
  - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
845
  shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
846
  cache format.
847
-
848
  The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
849
  legacy cache format will be returned.
850
-
851
  If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
852
  have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
853
  of shape `(batch_size, sequence_length)`.
@@ -880,7 +844,6 @@ LLAMA_INPUTS_DOCSTRING = r"""
880
  class LlamaModel(LlamaPreTrainedModel):
881
  """
882
  Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
883
-
884
  Args:
885
  config: LlamaConfig
886
  """
@@ -1107,13 +1070,10 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1107
  super().__init__(config)
1108
  self.model = LlamaModel(config)
1109
  model_name = "EVA02-CLIP-L-14-448"
1110
- pretrained = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-mtcv/liujiajun18/models/models--QuanSun--EVA-CLIP/snapshots/11afd202f2ae80869d6cef18b1ec775e79bd8d12/EVA02_CLIP_L_psz14_s4B.pt"
1111
  self.vocab_size = config.vocab_size
1112
- model, _, preprocess = create_model_and_transforms(model_name, pretrained, force_custom_clip=True)
1113
- model.text = None
1114
- model.logit_scale = None
1115
- self.vision_tower = model.visual
1116
  self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
 
1117
  self.vocab_size = config.vocab_size
1118
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1119
 
@@ -1121,6 +1081,7 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1121
  self.angle = torch.stack([1 / torch.pow(torch.tensor(10000), torch.tensor(2 * (hid_j // 2) / hidden_dim)) for hid_j in range(hidden_dim)])
1122
 
1123
  self.patch_shape = self.vision_tower.patch_embed.patch_shape[0]
 
1124
  self.adaptive_pooling = torch.nn.Conv3d(in_channels=self.vision_tower.num_features,
1125
  out_channels=self.vision_tower.num_features,
1126
  kernel_size=(2, 2, 2),
@@ -1164,10 +1125,6 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1164
  image_features = image_features.permute(0, 4, 1, 2, 3)
1165
  image_features = self.adaptive_pooling(image_features)
1166
  image_features = image_features.permute(0, 2, 3, 4, 1)
1167
- #B, T, P, _, __ = image_features.shape
1168
- #image_features = image_features.reshape(B, T // 2, 2, P, _, __)
1169
- #image_features = image_features.mean(dim=2)
1170
- #image_features = image_features.reshape(B, T // 2, P, _, __)
1171
  image_features = image_features.reshape(-1, self.patch_shape*self.patch_shape // 4, image_features.shape[-1])
1172
 
1173
  image_features = self.mm_projector(image_features)
@@ -1195,20 +1152,14 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1195
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1196
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1197
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1198
-
1199
  Returns:
1200
-
1201
  Example:
1202
-
1203
  ```python
1204
  >>> from transformers import AutoTokenizer, LlamaForCausalLM
1205
-
1206
  >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
1207
  >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
1208
-
1209
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1210
  >>> inputs = tokenizer(prompt, return_tensors="pt")
1211
-
1212
  >>> # Generate
1213
  >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1214
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1337,6 +1288,7 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1337
  T, C, H, W = video.shape
1338
  video = video.reshape(-1, C, H, W)
1339
  images_features = self.encode_images(video, durations, T)
 
1340
  input_embeds = self.model.embed_tokens.weight[inputs]
1341
  encoder_input = self.fuse_tokens_and_images(input_embeds, images_features, inputs)
1342
  encoder_input = encoder_input.permute(1, 0, 2)
@@ -1420,13 +1372,12 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1420
  )
1421
  return model_inputs
1422
 
1423
-
1424
  @torch.no_grad()
1425
  def chat(
1426
  self,
1427
  video_path : str,
1428
  query : str,
1429
- tokenizer : transformers.PreTrainedTokenizer,
1430
  num_segments : int = 64,
1431
  history : str = None,
1432
  system_prompt_id : int = 0,
@@ -1456,6 +1407,4 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
1456
  reordered_past += (
1457
  tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1458
  )
1459
- return reordered_past
1460
-
1461
-
 
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
 
 
20
  import math
21
  from typing import List, Optional, Tuple, Union
22
 
 
24
  import torch.nn.functional as F
25
  import torch.utils.checkpoint
26
  from torch import nn
27
+ from torch.nn import CrossEntropyLoss
28
 
29
+ from transformers import PreTrainedTokenizer
30
  from transformers.activations import ACT2FN
31
  from transformers.cache_utils import Cache, DynamicCache, StaticCache
32
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
33
  from transformers.modeling_outputs import (
34
  BaseModelOutputWithPast,
35
  CausalLMOutputWithPast,
 
 
36
  )
37
  from transformers.modeling_utils import PreTrainedModel
38
  from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 
46
  )
47
  from transformers.models.llama.configuration_llama import LlamaConfig
48
 
49
+ from .vision_tower_builder import build_vision_tower
50
  from .mm_projector_builder import build_vision_projector
51
+ from .data_utils import get_input, add_pred_to_history
52
 
53
  if is_flash_attn_2_available():
54
  from flash_attn import flash_attn_func, flash_attn_varlen_func
55
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
56
 
 
 
57
 
58
  logger = logging.get_logger(__name__)
59
 
 
103
  self.register_buffer("inv_freq", inv_freq, persistent=False)
104
  # For BC we register cos and sin cached
105
  self.max_seq_len_cached = max_position_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  @torch.no_grad()
108
  def forward(self, x, position_ids):
 
159
 
160
  def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
161
  """Applies Rotary Position Embedding to the query and key tensors.
 
162
  Args:
163
  q (`torch.Tensor`): The query tensor.
164
  k (`torch.Tensor`): The key tensor.
 
483
  """
484
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
485
  first unpad the input, then computes the attention scores and pad the final attention scores.
 
486
  Args:
487
  query_states (`torch.Tensor`):
488
  Input query states to be passed to Flash Attention API
 
737
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
738
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
739
  etc.)
 
740
  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
741
  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
742
  and behavior.
 
743
  Parameters:
744
  config ([`LlamaConfig`]):
745
  Model configuration class with all the parameters of the model. Initializing with a config file does not
 
780
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
781
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
782
  it.
 
783
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
784
  [`PreTrainedTokenizer.__call__`] for details.
 
785
  [What are input IDs?](../glossary#input-ids)
786
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
787
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
788
  - 1 for tokens that are **not masked**,
789
  - 0 for tokens that are **masked**.
 
790
  [What are attention masks?](../glossary#attention-mask)
 
791
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
792
  [`PreTrainedTokenizer.__call__`] for details.
 
793
  If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
794
  `past_key_values`).
 
795
  If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
796
  and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
797
  information on the default strategy.
 
798
  - 1 indicates the head is **not masked**,
799
  - 0 indicates the head is **masked**.
800
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
801
  Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
802
  config.n_positions - 1]`.
 
803
  [What are position IDs?](../glossary#position-ids)
804
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
805
  Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
806
  blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
807
  returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
808
  Two formats are allowed:
809
  - a [`~cache_utils.Cache`] instance;
810
  - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
811
  shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
812
  cache format.
 
813
  The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
814
  legacy cache format will be returned.
 
815
  If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
816
  have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
817
  of shape `(batch_size, sequence_length)`.
 
844
  class LlamaModel(LlamaPreTrainedModel):
845
  """
846
  Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
 
847
  Args:
848
  config: LlamaConfig
849
  """
 
1070
  super().__init__(config)
1071
  self.model = LlamaModel(config)
1072
  model_name = "EVA02-CLIP-L-14-448"
 
1073
  self.vocab_size = config.vocab_size
1074
+ self.vision_tower = build_vision_tower(model_name)
 
 
 
1075
  self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
1076
+
1077
  self.vocab_size = config.vocab_size
1078
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1079
 
 
1081
  self.angle = torch.stack([1 / torch.pow(torch.tensor(10000), torch.tensor(2 * (hid_j // 2) / hidden_dim)) for hid_j in range(hidden_dim)])
1082
 
1083
  self.patch_shape = self.vision_tower.patch_embed.patch_shape[0]
1084
+ # patchify module
1085
  self.adaptive_pooling = torch.nn.Conv3d(in_channels=self.vision_tower.num_features,
1086
  out_channels=self.vision_tower.num_features,
1087
  kernel_size=(2, 2, 2),
 
1125
  image_features = image_features.permute(0, 4, 1, 2, 3)
1126
  image_features = self.adaptive_pooling(image_features)
1127
  image_features = image_features.permute(0, 2, 3, 4, 1)
 
 
 
 
1128
  image_features = image_features.reshape(-1, self.patch_shape*self.patch_shape // 4, image_features.shape[-1])
1129
 
1130
  image_features = self.mm_projector(image_features)
 
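A rough, stand-alone sketch of the shape flow in the hunk above: video features are pooled by the (2, 2, 2) Conv3d, flattened to patch tokens, and projected to the LM hidden size. The Conv3d stride and the body of build_vision_projector are not visible in this diff; the sketch assumes stride (2, 2, 2) and a conventional LLaVA-style Linear-GELU-Linear for "mlp2x_gelu", so treat both as assumptions.

```python
import torch
import torch.nn as nn

# Illustrative sizes only; the real values come from the vision tower and LM configs.
num_features, hidden_size, patch_shape = 1024, 4096, 32
T = 4  # number of video frames before pooling

# Assumed stride (2, 2, 2): halves the temporal and both spatial dimensions.
adaptive_pooling = nn.Conv3d(num_features, num_features, kernel_size=(2, 2, 2), stride=(2, 2, 2))

# Assumed "mlp2x_gelu" projector (two linear layers with a GELU in between).
mm_projector = nn.Sequential(
    nn.Linear(num_features, hidden_size), nn.GELU(), nn.Linear(hidden_size, hidden_size)
)

x = torch.randn(1, T, patch_shape, patch_shape, num_features)    # (B, T, H, W, C)
x = x.permute(0, 4, 1, 2, 3)                                     # (B, C, T, H, W)
x = adaptive_pooling(x)                                          # (B, C, T/2, H/2, W/2)
x = x.permute(0, 2, 3, 4, 1)                                     # (B, T/2, H/2, W/2, C)
x = x.reshape(-1, patch_shape * patch_shape // 4, x.shape[-1])   # (B*T/2, H*W/4, C)
x = mm_projector(x)                                              # (B*T/2, H*W/4, hidden_size)
print(x.shape)  # torch.Size([2, 256, 4096])
```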
1152
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1153
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1154
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
1155
  Returns:
 
1156
  Example:
 
1157
  ```python
1158
  >>> from transformers import AutoTokenizer, LlamaForCausalLM
 
1159
  >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
1160
  >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
 
1161
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1162
  >>> inputs = tokenizer(prompt, return_tensors="pt")
 
1163
  >>> # Generate
1164
  >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1165
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
1288
  T, C, H, W = video.shape
1289
  video = video.reshape(-1, C, H, W)
1290
  images_features = self.encode_images(video, durations, T)
1291
+
1292
  input_embeds = self.model.embed_tokens.weight[inputs]
1293
  encoder_input = self.fuse_tokens_and_images(input_embeds, images_features, inputs)
1294
  encoder_input = encoder_input.permute(1, 0, 2)
 
1372
  )
1373
  return model_inputs
1374
 
 
1375
  @torch.no_grad()
1376
  def chat(
1377
  self,
1378
  video_path : str,
1379
  query : str,
1380
+ tokenizer : PreTrainedTokenizer,
1381
  num_segments : int = 64,
1382
  history : str = None,
1383
  system_prompt_id : int = 0,
 
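A hypothetical call of the chat() helper whose signature begins above. The parameter list continues beyond this hunk and the return value is not shown here, so both are assumptions; the video path and prompt are placeholders.

```python
# model / tokenizer are assumed to be already loaded, e.g. via
# AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) and
# AutoTokenizer.from_pretrained(...).
answer = model.chat(
    video_path="example.mp4",            # placeholder path
    query="What happens in this video?",
    tokenizer=tokenizer,
    num_segments=64,                     # default shown in the signature
)
print(answer)  # assuming chat() returns the generated reply
```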
1407
  reordered_past += (
1408
  tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1409
  )
1410
+ return reordered_past
 
 
eva_clip/eva_vit_model.py → vision_tower_builder.py RENAMED
@@ -1,20 +1,25 @@
1
  # --------------------------------------------------------
2
- # Adapted from https://github.com/microsoft/unilm/tree/master/beit
3
  # --------------------------------------------------------
4
  import math
5
  import os
6
- from functools import partial
 
 
7
  import torch
8
  import torch.nn as nn
9
  import torch.nn.functional as F
 
10
  try:
11
  from timm.models.layers import drop_path, to_2tuple, trunc_normal_
12
  except:
13
  from timm.layers import drop_path, to_2tuple, trunc_normal_
14
 
15
- from .transformer import PatchDropout
16
- from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast
17
-
18
  if os.getenv('ENV_TYPE') == 'deepspeed':
19
  try:
20
  from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
@@ -30,6 +35,59 @@ except ImportError:
30
  print("Please 'pip install xformers'")
31
 
32
 
33
  class DropPath(nn.Module):
34
  """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
35
  """
@@ -78,6 +136,7 @@ class Mlp(nn.Module):
78
  x = self.drop(x)
79
  return x
80
 
 
81
  class SwiGLU(nn.Module):
82
  def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
83
  norm_layer=nn.LayerNorm, subln=False):
@@ -103,6 +162,7 @@ class SwiGLU(nn.Module):
103
  x = self.drop(x)
104
  return x
105
 
 
106
  class Attention(nn.Module):
107
  def __init__(
108
  self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
@@ -364,6 +424,91 @@ class RelativePositionBias(nn.Module):
364
  return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
365
 
366
 
367
  class EVAVisionTransformer(nn.Module):
368
  """ Vision Transformer with support for patch or hybrid CNN input stage
369
  """
@@ -383,7 +528,6 @@ class EVAVisionTransformer(nn.Module):
383
  num_patches = self.patch_embed.num_patches
384
 
385
  self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
386
- # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
387
  if use_abs_pos_emb:
388
  self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
389
  else:
@@ -530,3 +674,95 @@ class EVAVisionTransformer(nn.Module):
530
  x = self.forward_features(x)
531
  x = self.head(x)
532
  return x
 
1
  # --------------------------------------------------------
2
+ # Adapted from https://github.com/baaivision/EVA
3
  # --------------------------------------------------------
4
  import math
5
  import os
6
+ import json
7
+ import logging
8
+
9
  import torch
10
  import torch.nn as nn
11
  import torch.nn.functional as F
12
+ from einops import rearrange, repeat
13
+
14
+ from functools import partial
15
+ from typing import Optional, Tuple, Union
16
+ from dataclasses import dataclass
17
+
18
  try:
19
  from timm.models.layers import drop_path, to_2tuple, trunc_normal_
20
  except:
21
  from timm.layers import drop_path, to_2tuple, trunc_normal_
22
 
 
 
 
23
  if os.getenv('ENV_TYPE') == 'deepspeed':
24
  try:
25
  from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
 
35
  print("Please 'pip install xformers'")
36
 
37
 
38
+ class PatchDropout(nn.Module):
39
+ """
40
+ https://arxiv.org/abs/2212.00794
41
+ """
42
+
43
+ def __init__(self, prob, exclude_first_token=True):
44
+ super().__init__()
45
+ assert 0 <= prob < 1.
46
+ self.prob = prob
47
+ self.exclude_first_token = exclude_first_token # exclude CLS token
48
+ logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}")
49
+
50
+ def forward(self, x):
51
+ if not self.training or self.prob == 0.:
52
+ return x
53
+
54
+ if self.exclude_first_token:
55
+ cls_tokens, x = x[:, :1], x[:, 1:]
56
+ else:
57
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
58
+
59
+ batch = x.size()[0]
60
+ num_tokens = x.size()[1]
61
+
62
+ batch_indices = torch.arange(batch)
63
+ batch_indices = batch_indices[..., None]
64
+
65
+ keep_prob = 1 - self.prob
66
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
67
+
68
+ rand = torch.randn(batch, num_tokens)
69
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
70
+
71
+ x = x[batch_indices, patch_indices_keep]
72
+
73
+ if self.exclude_first_token:
74
+ x = torch.cat((cls_tokens, x), dim=1)
75
+
76
+ if self.training and os.getenv('RoPE') == '1':
77
+ return x, patch_indices_keep
78
+
79
+ return x
80
+
81
+
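A stand-alone illustration (not part of the file) of the token-selection trick used in PatchDropout.forward above: random scores per token, top-k indices, then gathering with a broadcast batch index.

```python
import torch

# Arbitrary shapes for demonstration.
batch, num_tokens, dim = 2, 16, 8
x = torch.randn(batch, num_tokens, dim)

keep_prob = 0.5
num_keep = max(1, int(num_tokens * keep_prob))

scores = torch.randn(batch, num_tokens)
patch_indices_keep = scores.topk(num_keep, dim=-1).indices   # (batch, num_keep)
batch_indices = torch.arange(batch)[..., None]               # (batch, 1), broadcasts

kept = x[batch_indices, patch_indices_keep]                  # (batch, num_keep, dim)
print(kept.shape)  # torch.Size([2, 8, 8])
```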
82
+ class LayerNorm(nn.LayerNorm):
83
+ """Subclass torch's LayerNorm (with cast back to input dtype)."""
84
+
85
+ def forward(self, x: torch.Tensor):
86
+ orig_type = x.dtype
87
+ x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
88
+ return x.to(orig_type)
89
+
90
+
91
  class DropPath(nn.Module):
92
  """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
93
  """
 
136
  x = self.drop(x)
137
  return x
138
 
139
+
140
  class SwiGLU(nn.Module):
141
  def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
142
  norm_layer=nn.LayerNorm, subln=False):
 
162
  x = self.drop(x)
163
  return x
164
 
165
+
166
  class Attention(nn.Module):
167
  def __init__(
168
  self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
 
424
  return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
425
 
426
 
427
+ def broadcat(tensors, dim = -1):
428
+ num_tensors = len(tensors)
429
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
430
+ assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
431
+ shape_len = list(shape_lens)[0]
432
+ dim = (dim + shape_len) if dim < 0 else dim
433
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
434
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
435
+ assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
436
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
437
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
438
+ expanded_dims.insert(dim, (dim, dims[dim]))
439
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
440
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
441
+ return torch.cat(tensors, dim = dim)
442
+
443
+
444
+ def rotate_half(x):
445
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
446
+ x1, x2 = x.unbind(dim = -1)
447
+ x = torch.stack((-x2, x1), dim = -1)
448
+ return rearrange(x, '... d r -> ... (d r)')
449
+
450
+
451
+ class VisionRotaryEmbeddingFast(nn.Module):
452
+ def __init__(
453
+ self,
454
+ dim,
455
+ pt_seq_len,
456
+ ft_seq_len=None,
457
+ custom_freqs = None,
458
+ freqs_for = 'lang',
459
+ theta = 10000,
460
+ max_freq = 10,
461
+ num_freqs = 1,
462
+ patch_dropout = 0.
463
+ ):
464
+ super().__init__()
465
+ if custom_freqs:
466
+ freqs = custom_freqs
467
+ elif freqs_for == 'lang':
468
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
469
+ elif freqs_for == 'pixel':
470
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * math.pi
471
+ elif freqs_for == 'constant':
472
+ freqs = torch.ones(num_freqs).float()
473
+ else:
474
+ raise ValueError(f'unknown modality {freqs_for}')
475
+
476
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
477
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
478
+
479
+ freqs = torch.einsum('..., f -> ... f', t, freqs)
480
+ freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
481
+ freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)
482
+
483
+ freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
484
+ freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
485
+
486
+ self.patch_dropout = patch_dropout
487
+
488
+ self.register_buffer("freqs_cos", freqs_cos)
489
+ self.register_buffer("freqs_sin", freqs_sin)
490
+
491
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
492
+
493
+ def forward(self, t, patch_indices_keep=None):
494
+ if patch_indices_keep is not None:
495
+ batch = t.size()[0]
496
+ batch_indices = torch.arange(batch)
497
+ batch_indices = batch_indices[..., None]
498
+
499
+ freqs_cos = repeat(self.freqs_cos, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
500
+ freqs_sin = repeat(self.freqs_sin, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
501
+
502
+ freqs_cos = freqs_cos[batch_indices, patch_indices_keep]
503
+ freqs_cos = rearrange(freqs_cos, 'n i m j -> n m i j')
504
+ freqs_sin = freqs_sin[batch_indices, patch_indices_keep]
505
+ freqs_sin = rearrange(freqs_sin, 'n i m j -> n m i j')
506
+
507
+ return t * freqs_cos + rotate_half(t) * freqs_sin
508
+
509
+ return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
510
+
511
+
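A quick sanity check (illustrative, not from this file) of rotate_half as defined above: features are read as adjacent pairs (x1, x2) and each pair becomes (-x2, x1), which is what the cos/sin combination in VisionRotaryEmbeddingFast.forward relies on.

```python
import torch
from einops import rearrange

def rotate_half(x):
    # Same definition as above, repeated here so the snippet runs on its own.
    x = rearrange(x, '... (d r) -> ... d r', r=2)
    x1, x2 = x.unbind(dim=-1)
    x = torch.stack((-x2, x1), dim=-1)
    return rearrange(x, '... d r -> ... (d r)')

print(rotate_half(torch.tensor([1., 2., 3., 4.])))  # tensor([-2., 1., -4., 3.])
# The rotary embedding itself is then t * freqs_cos + rotate_half(t) * freqs_sin.
```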
512
  class EVAVisionTransformer(nn.Module):
513
  """ Vision Transformer with support for patch or hybrid CNN input stage
514
  """
 
528
  num_patches = self.patch_embed.num_patches
529
 
530
  self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
 
531
  if use_abs_pos_emb:
532
  self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
533
  else:
 
674
  x = self.forward_features(x)
675
  x = self.head(x)
676
  return x
677
+
678
+
679
+ @dataclass
680
+ class CLIPVisionCfg:
681
+ layers: Union[Tuple[int, int, int, int], int] = 12
682
+ width: int = 768
683
+ head_width: int = 64
684
+ mlp_ratio: float = 4.0
685
+ patch_size: int = 16
686
+ image_size: Union[Tuple[int, int], int] = 224
687
+ ls_init_value: Optional[float] = None # layer scale initial value
688
+ patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
689
+ global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
690
+ drop_path_rate: Optional[float] = None # drop path rate
691
+ timm_model_name: str = None # a valid model name overrides layers, width, patch_size
692
+ timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
693
+ timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
694
+ timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
695
+ timm_proj_bias: bool = False # enable bias final projection
696
+ eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size
697
+ qkv_bias: bool = True
698
+ fusedLN: bool = False
699
+ xattn: bool = False
700
+ postnorm: bool = False
701
+ rope: bool = False
702
+ pt_hw_seq_len: int = 16 # 224/14
703
+ intp_freq: bool = False
704
+ naiveswiglu: bool = False
705
+ subln: bool = False
706
+
707
+
708
+ def build_vision_tower(
709
+ model_name: str,
710
+ precision: str = 'bf16',
711
+ device: Union[str, torch.device] = 'cpu',
712
+ ):
713
+ if isinstance(device, str):
714
+ device = torch.device(device)
715
+
716
+ model_cfg = json.load(open(model_name + '.json'))
717
+ if 'rope' in model_cfg.get('vision_cfg', {}):
718
+ if model_cfg['vision_cfg']['rope']:
719
+ os.environ['RoPE'] = "1"
720
+ else:
721
+ os.environ['RoPE'] = "0"
722
+
723
+ vision_cfg = CLIPVisionCfg(**model_cfg['vision_cfg'])
724
+
725
+ if vision_cfg.fusedLN:
726
+ try:
727
+ from apex.normalization import FusedLayerNorm
728
+ except:
729
+ FusedLayerNorm = LayerNorm
730
+ print("Please 'pip install apex'")
731
+ norm_layer = partial(FusedLayerNorm, eps=1e-6)
732
+ else:
733
+ norm_layer = partial(LayerNorm, eps=1e-6)
734
+
735
+ vision_tower = EVAVisionTransformer(
736
+ img_size = vision_cfg.image_size,
737
+ patch_size = vision_cfg.patch_size,
738
+ num_classes = model_cfg['embed_dim'],
739
+ use_mean_pooling = vision_cfg.global_average_pool, #False
740
+ init_values = vision_cfg.ls_init_value,
741
+ patch_dropout = vision_cfg.patch_dropout,
742
+ embed_dim = vision_cfg.width,
743
+ depth = vision_cfg.layers,
744
+ num_heads = vision_cfg.width // vision_cfg.head_width,
745
+ mlp_ratio = vision_cfg.mlp_ratio,
746
+ qkv_bias = vision_cfg.qkv_bias,
747
+ drop_path_rate = vision_cfg.drop_path_rate,
748
+ norm_layer = norm_layer,
749
+ xattn = vision_cfg.xattn,
750
+ rope = vision_cfg.rope,
751
+ postnorm = vision_cfg.postnorm,
752
+ pt_hw_seq_len = vision_cfg.pt_hw_seq_len, # 224/14
753
+ intp_freq = vision_cfg.intp_freq,
754
+ naiveswiglu = vision_cfg.naiveswiglu,
755
+ subln = vision_cfg.subln
756
+ )
757
+
758
+ if "fp16" in precision or "bf16" in precision:
759
+ logging.info(f'convert precision to {precision}')
760
+ vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
761
+
762
+ vision_tower.to(device=device)
763
+
764
+ # set image / mean metadata from pretrained_cfg if available, or use default
765
+ vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
766
+ vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
767
+
768
+ return vision_tower
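A minimal sketch of calling the builder above on its own. It assumes a config file named "EVA02-CLIP-L-14-448.json" (with an "embed_dim" key and a "vision_cfg" section) sits in the working directory, since the json.load(open(model_name + '.json')) line resolves the path relative to it.

```python
import torch
from vision_tower_builder import build_vision_tower

# Builds the EVA ViT in bfloat16 on CPU; the JSON config drives every
# architectural choice (depth, width, patch size, RoPE, xattn, ...).
vision_tower = build_vision_tower("EVA02-CLIP-L-14-448", precision="bf16", device="cpu")

print(vision_tower.num_features)                         # feature width consumed by the projector
print(vision_tower.image_mean, vision_tower.image_std)   # CLIP normalization set above
```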