fix vit json
- EVA02-CLIP-L-14-448.json +0 -29
- modeling_kangaroo.py +1 -2
- vision_tower_builder.py +22 -35
EVA02-CLIP-L-14-448.json
DELETED
@@ -1,29 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 448,
-        "layers": 24,
-        "width": 1024,
-        "drop_path_rate": 0,
-        "head_width": 64,
-        "mlp_ratio": 2.6667,
-        "patch_size": 14,
-        "eva_model_name": "eva-clip-l-14-448",
-        "xattn": true,
-        "fusedLN": true,
-        "rope": true,
-        "pt_hw_seq_len": 16,
-        "intp_freq": true,
-        "naiveswiglu": true,
-        "subln": true
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12,
-        "xattn": false,
-        "fusedLN": true
-    }
-}
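The deleted values are not lost: the vision_cfg entries now live as defaults of the CLIPVisionCfg dataclass in vision_tower_builder.py (see the diff below), so the builder no longer reads a config file from disk. The text_cfg block has no counterpart and is simply dropped, and fusedLN is the one vision key whose dataclass default (False) differs from the JSON value (true). A minimal sketch of the correspondence, assuming CLIPVisionCfg can be imported straight from vision_tower_builder:

from vision_tower_builder import CLIPVisionCfg  # import path assumed

# Default construction reproduces the deleted vision_cfg (apart from fusedLN).
cfg = CLIPVisionCfg()
assert cfg.embed_dim == 768 and cfg.layers == 24 and cfg.width == 1024
assert cfg.image_size == 448 and cfg.patch_size == 14 and cfg.mlp_ratio == 2.6667
assert cfg.rope and cfg.xattn and cfg.intp_freq and cfg.naiveswiglu and cfg.subln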
modeling_kangaroo.py
CHANGED
@@ -1069,9 +1069,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = LlamaModel(config)
-        model_name = "EVA02-CLIP-L-14-448"
         self.vocab_size = config.vocab_size
-        self.vision_tower = build_vision_tower(
+        self.vision_tower = build_vision_tower()
         self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
 
         self.vocab_size = config.vocab_size
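With the JSON lookup gone, the constructor builds the vision tower from its baked-in defaults and sizes the projector from the tower's feature width. A rough sketch of that wiring; only build_vision_tower, build_vision_projector, num_features and config.hidden_size come from the diff, while the commented forward calls are assumptions about how the two modules are used downstream:

# Sketch of what KangarooForCausalLM.__init__ sets up after this change.
# `config` is the model config passed to __init__; the builder functions are
# assumed imported at module level as in modeling_kangaroo.py.
vision_tower = build_vision_tower()                # EVA02-CLIP-L-14-448 backbone from dataclass defaults
mm_projector = build_vision_projector(
    mm_hidden_size=vision_tower.num_features,      # width of the vision features
    hidden_size=config.hidden_size,                # LLaMA hidden size
    projector_type="mlp2x_gelu",
)

# Hypothetical forward path: map image features into the LM embedding space.
# image_features = vision_tower(pixel_values)
# image_embeds = mm_projector(image_features)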
vision_tower_builder.py
CHANGED
@@ -3,7 +3,6 @@
 # --------------------------------------------------------
 import math
 import os
-import json
 import logging
 
 import torch
@@ -669,56 +668,46 @@ class EVAVisionTransformer(nn.Module):
 
 @dataclass
 class CLIPVisionCfg:
-
-
+    embed_dim: int = 768
+    layers: Union[Tuple[int, int, int, int], int] = 24
+    width: int = 1024
     head_width: int = 64
-    mlp_ratio: float =
-    patch_size: int =
-    image_size: Union[Tuple[int, int], int] =
+    mlp_ratio: float = 2.6667
+    patch_size: int = 14
+    image_size: Union[Tuple[int, int], int] = 448
     ls_init_value: Optional[float] = None # layer scale initial value
     patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
     global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
-    drop_path_rate: Optional[float] =
+    drop_path_rate: Optional[float] = 0 # drop path rate
     timm_model_name: str = None # a valid model name overrides layers, width, patch_size
     timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
     timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
     timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
     timm_proj_bias: bool = False # enable bias final projection
-    eva_model_name: str =
+    eva_model_name: str = "eva-clip-l-14-448" # a valid eva model name overrides layers, width, patch_size
     qkv_bias: bool = True
     fusedLN: bool = False
-    xattn: bool =
+    xattn: bool = True
     postnorm: bool = False
-    rope: bool =
+    rope: bool = True
     pt_hw_seq_len: int = 16 # 224/14
-    intp_freq: bool =
-    naiveswiglu: bool =
-    subln: bool =
-
-
-def build_vision_tower(
-
-
-
-
-    if isinstance(device, str):
-        device = torch.device(device)
-
-    model_cfg = json.load(open(model_name + '.json'))
-    if 'rope' in model_cfg.get('vision_cfg', {}):
-        if model_cfg['vision_cfg']['rope']:
+    intp_freq: bool = True
+    naiveswiglu: bool = True
+    subln: bool = True
+
+
+def build_vision_tower(precision: str = 'bf16'):
+
+    vision_cfg = CLIPVisionCfg()
+
+    if vision_cfg.rope:
         os.environ['RoPE'] = "1"
     else:
         os.environ['RoPE'] = "0"
 
-    vision_cfg = CLIPVisionCfg(**model_cfg['vision_cfg'])
 
     if vision_cfg.fusedLN:
-        try:
-            from apex.normalization import FusedLayerNorm
-        except:
-            FusedLayerNorm = LayerNorm
-            print("Please 'pip install apex'")
+        from apex.normalization import FusedLayerNorm
         norm_layer = partial(FusedLayerNorm, eps=1e-6)
     else:
         norm_layer = partial(LayerNorm, eps=1e-6)
@@ -726,7 +715,7 @@ def build_vision_tower(
     vision_tower = EVAVisionTransformer(
         img_size = vision_cfg.image_size,
         patch_size = vision_cfg.patch_size,
-        num_classes =
+        num_classes = vision_cfg.embed_dim,
         use_mean_pooling = vision_cfg.global_average_pool,
         init_values = vision_cfg.ls_init_value,
         patch_dropout = vision_cfg.patch_dropout,
@@ -750,8 +739,6 @@ def build_vision_tower(
         logging.info(f'convert precision to {precision}')
         vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
 
-    vision_tower.to(device=device)
-
     vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
     vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
 
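For completeness, a minimal usage sketch of the rewritten builder. Everything it touches is visible in the new code above (the precision argument, the RoPE environment flag derived from the dataclass default, the bf16/fp16 cast, and the normalization constants); the import path is an assumption:

from vision_tower_builder import build_vision_tower  # import path assumed

tower = build_vision_tower(precision='bf16')

# Observable effects per the diff:
# - os.environ['RoPE'] == "1", since CLIPVisionCfg.rope now defaults to True
# - weights cast to torch.bfloat16 ('bf16'); the same line casts other precision strings to float16
print(tower.num_features)   # feature width consumed by build_vision_projector
print(tower.image_mean)     # (0.48145466, 0.4578275, 0.40821073)
print(tower.image_std)      # (0.26862954, 0.26130258, 0.27577711)

Since the vision_tower.to(device=device) call was dropped, device placement is now left to the caller, and with fusedLN=True the apex import is no longer wrapped in a try/except, so apex must actually be installed for that path.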