WEBing committed
Commit 127d3e1
Parent: 85d6546

fix vit json

EVA02-CLIP-L-14-448.json DELETED
@@ -1,29 +0,0 @@
-{
-    "embed_dim": 768,
-    "vision_cfg": {
-        "image_size": 448,
-        "layers": 24,
-        "width": 1024,
-        "drop_path_rate": 0,
-        "head_width": 64,
-        "mlp_ratio": 2.6667,
-        "patch_size": 14,
-        "eva_model_name": "eva-clip-l-14-448",
-        "xattn": true,
-        "fusedLN": true,
-        "rope": true,
-        "pt_hw_seq_len": 16,
-        "intp_freq": true,
-        "naiveswiglu": true,
-        "subln": true
-    },
-    "text_cfg": {
-        "context_length": 77,
-        "vocab_size": 49408,
-        "width": 768,
-        "heads": 12,
-        "layers": 12,
-        "xattn": false,
-        "fusedLN": true
-    }
-}
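For reference, the values from the deleted vision_cfg block now live as hardcoded defaults on CLIPVisionCfg in vision_tower_builder.py (see the diff of that file below). A minimal sanity-check sketch, assuming the updated module is importable from the repository root:

# Sketch only: field names and defaults are taken from the vision_tower_builder.py
# diff in this commit; nothing reads EVA02-CLIP-L-14-448.json at runtime anymore.
from vision_tower_builder import CLIPVisionCfg

cfg = CLIPVisionCfg()
assert cfg.embed_dim == 768 and cfg.image_size == 448 and cfg.patch_size == 14
assert cfg.layers == 24 and cfg.width == 1024 and cfg.mlp_ratio == 2.6667
assert cfg.rope and cfg.intp_freq and cfg.naiveswiglu and cfg.subln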
modeling_kangaroo.py CHANGED
@@ -1069,9 +1069,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = LlamaModel(config)
-        model_name = "EVA02-CLIP-L-14-448"
         self.vocab_size = config.vocab_size
-        self.vision_tower = build_vision_tower(model_name)
+        self.vision_tower = build_vision_tower()
         self.mm_projector = build_vision_projector(mm_hidden_size=self.vision_tower.num_features, hidden_size=config.hidden_size, projector_type="mlp2x_gelu")
 
         self.vocab_size = config.vocab_size
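With the hardcoded model name gone, build_vision_tower() no longer takes a model identifier. A minimal sketch of the new call path that KangarooForCausalLM.__init__ now uses, assuming vision_tower_builder.py is on the import path:

from vision_tower_builder import build_vision_tower

# The EVA02-CLIP-L-14-448 settings now come from CLIPVisionCfg defaults
# instead of a JSON file on disk.
vision_tower = build_vision_tower(precision='bf16')
print(vision_tower.num_features)  # feeds mm_hidden_size of the mm_projector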
vision_tower_builder.py CHANGED
@@ -3,7 +3,6 @@
 # --------------------------------------------------------
 import math
 import os
-import json
 import logging
 
 import torch
@@ -669,56 +668,46 @@ class EVAVisionTransformer(nn.Module):
 
 @dataclass
 class CLIPVisionCfg:
-    layers: Union[Tuple[int, int, int, int], int] = 12
-    width: int = 768
+    embed_dim: int = 768
+    layers: Union[Tuple[int, int, int, int], int] = 24
+    width: int = 1024
     head_width: int = 64
-    mlp_ratio: float = 4.0
-    patch_size: int = 16
-    image_size: Union[Tuple[int, int], int] = 224
+    mlp_ratio: float = 2.6667
+    patch_size: int = 14
+    image_size: Union[Tuple[int, int], int] = 448
     ls_init_value: Optional[float] = None  # layer scale initial value
     patch_dropout: float = 0.  # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
     global_average_pool: bool = False  # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
-    drop_path_rate: Optional[float] = None  # drop path rate
+    drop_path_rate: Optional[float] = 0  # drop path rate
     timm_model_name: str = None  # a valid model name overrides layers, width, patch_size
     timm_model_pretrained: bool = False  # use (imagenet) pretrained weights for named model
     timm_pool: str = 'avg'  # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
     timm_proj: str = 'linear'  # linear projection for timm model output ('linear', 'mlp', '')
     timm_proj_bias: bool = False  # enable bias final projection
-    eva_model_name: str = None  # a valid eva model name overrides layers, width, patch_size
+    eva_model_name: str = "eva-clip-l-14-448"  # a valid eva model name overrides layers, width, patch_size
     qkv_bias: bool = True
     fusedLN: bool = False
-    xattn: bool = False
+    xattn: bool = True
     postnorm: bool = False
-    rope: bool = False
+    rope: bool = True
     pt_hw_seq_len: int = 16  # 224/14
-    intp_freq: bool = False
-    naiveswiglu: bool = False
-    subln: bool = False
-
-
-def build_vision_tower(
-        model_name: str,
-        precision: str = 'bf16',
-        device: Union[str, torch.device] = 'cpu',
-):
-    if isinstance(device, str):
-        device = torch.device(device)
-
-    model_cfg = json.load(open(model_name + '.json'))
-    if 'rope' in model_cfg.get('vision_cfg', {}):
-        if model_cfg['vision_cfg']['rope']:
+    intp_freq: bool = True
+    naiveswiglu: bool = True
+    subln: bool = True
+
+
+def build_vision_tower(precision: str = 'bf16'):
+
+    vision_cfg = CLIPVisionCfg()
+
+    if vision_cfg.rope:
         os.environ['RoPE'] = "1"
     else:
         os.environ['RoPE'] = "0"
 
-    vision_cfg = CLIPVisionCfg(**model_cfg['vision_cfg'])
 
     if vision_cfg.fusedLN:
-        try:
-            from apex.normalization import FusedLayerNorm
-        except:
-            FusedLayerNorm = LayerNorm
-            print("Please 'pip install apex'")
+        from apex.normalization import FusedLayerNorm
         norm_layer = partial(FusedLayerNorm, eps=1e-6)
     else:
         norm_layer = partial(LayerNorm, eps=1e-6)
@@ -726,7 +715,7 @@ def build_vision_tower(
     vision_tower = EVAVisionTransformer(
         img_size = vision_cfg.image_size,
         patch_size = vision_cfg.patch_size,
-        num_classes = model_cfg['embed_dim'],
+        num_classes = vision_cfg.embed_dim,
         use_mean_pooling = vision_cfg.global_average_pool,
         init_values = vision_cfg.ls_init_value,
         patch_dropout = vision_cfg.patch_dropout,
@@ -750,8 +739,6 @@ def build_vision_tower(
     logging.info(f'convert precision to {precision}')
    vision_tower = vision_tower.to(torch.bfloat16) if 'bf16' in precision else vision_tower.to(torch.float16)
 
-    vision_tower.to(device=device)
-
     vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
     vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
 
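Note that the device argument and the trailing vision_tower.to(device=device) call were dropped along with the apex try/except fallback, so placement on an accelerator is now the caller's responsibility. A minimal usage sketch under that assumption:

import torch
from vision_tower_builder import build_vision_tower

# The tower is returned on CPU in bfloat16 (precision='bf16' default);
# apex is only imported if fusedLN is enabled in CLIPVisionCfg.
tower = build_vision_tower(precision='bf16')
tower = tower.to(torch.device('cuda:0'))  # explicit device placement by the caller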