Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,936 Bytes
85efb5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
from abc import ABC, abstractmethod
import torch
import torch.nn as nn
class ProcessorWrapper:
    """Thin image-processor facade around a single transform callable.

    Exposes ``crop_size``, ``image_mean`` and ``preprocess`` on top of an
    arbitrary ``transform``. NOTE(review): this presumably mirrors the
    Hugging Face image-processor interface so callers can use either one
    interchangeably -- confirm against the call sites.
    """

    def __init__(
        self,
        transform,
        height=378,
        width=378,
        image_mean=None,
    ):
        """Store the transform and the advertised crop size / channel means.

        Args:
            transform: Callable applied to each image in ``preprocess``.
            height: Value reported under ``crop_size["height"]``.
            width: Value reported under ``crop_size["width"]``.
            image_mean: Per-channel means exposed as ``self.image_mean``.
                ``None`` selects ``[0.48145466, 0.4578275, 0.40821073]``.
        """
        if image_mean is None:
            # Build a fresh list per instance: a list literal used directly
            # as the default would be shared (and mutable) across instances.
            image_mean = [0.48145466, 0.4578275, 0.40821073]

        self._crop_size = {
            "height": height,
            "width": width,
        }
        self._transforms = transform
        self.image_mean = image_mean

    @property
    def crop_size(self):
        """Return the ``{"height": ..., "width": ...}`` dict set at init."""
        return self._crop_size

    def preprocess(self, image, return_tensors="pt"):
        """Apply the stored transform to one image.

        Args:
            image: Passed to the transform unchanged; no type check or
                conversion is performed here.
            return_tensors: Accepted but not used by this implementation.

        Returns:
            A dict whose ``"pixel_values"`` entry is a one-element list
            holding the transformed image.
        """
        return {"pixel_values": [self._transforms(image)]}
class BaseVisionTower(nn.Module):
    """Shared base class for vision-tower wrappers.

    Subclasses are expected to implement ``load_model`` and ``_forward``.
    The ``abstractmethod`` markers are advisory only: this class does not
    use ``ABCMeta``, so an incomplete subclass can still be instantiated
    and the stubs raise ``NotImplementedError`` when called.

    The properties read several attributes that this class never assigns:
    ``vision_tower``, ``cfg_only``, ``_hidden_size``, ``_image_size``,
    ``_patch_size``, ``_interp_size``, ``_num_patches_per_side`` and
    ``_num_patches``. NOTE(review): presumably concrete subclasses set
    these (e.g. in ``load_model``) -- confirm against the subclasses.
    """

    def __init__(self, vision_tower_name, args, delay_load=False):
        """Record the tower name and the selection options found on ``args``.

        Args:
            vision_tower_name: Stored as ``self.vision_tower_name``.
            args: Must provide ``mm_vision_select_layer``; may provide
                ``mm_vision_select_feature`` (default ``"patch"``) and
                ``unfreeze_mm_vision_tower`` (default ``False``).
            delay_load: Stored as ``self.delay_load``; not acted on here.
        """
        super().__init__()

        self.is_loaded = False
        self.args = args

        self.vision_tower_name = vision_tower_name
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
        self.unfreeze_mm_vision_tower = getattr(
            args, "unfreeze_mm_vision_tower", False
        )
        self.delay_load = delay_load

    @abstractmethod
    def load_model(self, device_map=None):
        """Load the underlying model; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses must implement load_model")

    @abstractmethod
    def _forward(self, images):
        """Compute features for a batch; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses must implement _forward")

    def forward(self, images):
        """Run ``_forward`` on a batch, or on each element of a list.

        Args:
            images: Either a list of items, each of which gets
                ``unsqueeze(0)`` applied before ``_forward`` is called on
                it, or a single object passed to ``_forward`` unchanged.

        Returns:
            A list of per-item ``_forward`` results for list input,
            otherwise the single ``_forward`` result.
        """
        if isinstance(images, list):
            return [self._forward(image.unsqueeze(0)) for image in images]
        return self._forward(images)

    @property
    def dummy_feature(self):
        """Return a zero tensor of shape ``(1, hidden_size)`` on this tower's device/dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """Return ``vision_tower.dtype`` if present, else its first parameter's dtype.

        Falls back to ``torch.float32`` when the tower has no parameters.
        """
        tower = self.vision_tower
        if hasattr(tower, "dtype"):
            return tower.dtype
        # Only the first parameter is needed; avoid materialising them all.
        first_param = next(iter(tower.parameters()), None)
        return torch.float32 if first_param is None else first_param.dtype

    @property
    def device(self):
        """Return ``vision_tower.device`` if present, else its first parameter's device.

        Falls back to the CPU device when the tower has no parameters.
        """
        tower = self.vision_tower
        if hasattr(tower, "device"):
            return tower.device
        # Only the first parameter is needed; avoid materialising them all.
        first_param = next(iter(tower.parameters()), None)
        return torch.device("cpu") if first_param is None else first_param.device

    @property
    def config(self):
        """Return ``vision_tower.config`` once loaded, otherwise ``cfg_only``."""
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        """Return ``config.hidden_size``, falling back to ``_hidden_size``."""
        try:
            return self.config.hidden_size
        except Exception:
            # Best-effort fallback, but let KeyboardInterrupt/SystemExit through.
            return self._hidden_size

    @property
    def image_size(self):  # resolution
        """Return ``config.image_size``, falling back to ``_image_size``."""
        try:
            return self.config.image_size
        except Exception:
            return self._image_size

    @property
    def patch_size(self):
        """Return ``config.patch_size``, falling back to ``_patch_size``."""
        try:
            return self.config.patch_size
        except Exception:
            return self._patch_size

    @property
    def num_patches_per_side(self):
        """Return the number of patches along one side.

        Uses ``int(_interp_size ** 0.5)`` when ``_interp_size`` is set and
        not ``None``; otherwise ``image_size // patch_size``; otherwise
        the ``_num_patches_per_side`` attribute.
        """
        # A missing ``_interp_size`` is treated the same as ``None`` so the
        # config-based path still works for subclasses that never set it.
        interp_size = getattr(self, "_interp_size", None)
        if interp_size is not None:
            return int(interp_size**0.5)
        try:
            return self.image_size // self.patch_size
        except Exception:
            return self._num_patches_per_side

    @property
    def num_patches(self):
        """Return the total number of patches.

        Uses ``_interp_size`` when it is set and not ``None``; otherwise
        ``num_patches_per_side ** 2``; otherwise the ``_num_patches``
        attribute.
        """
        interp_size = getattr(self, "_interp_size", None)
        if interp_size is not None:
            return interp_size
        try:
            return self.num_patches_per_side**2
        except Exception:
            return self._num_patches
|