Leyo committed on
Commit
60a1dbc
1 Parent(s): 8d33426

make processing similar to transformers implementation

Browse files
Files changed (1) hide show
  1. image_processing_siglip.py +46 -50
image_processing_siglip.py CHANGED
@@ -1,5 +1,5 @@
1
  # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
@@ -14,17 +14,16 @@
14
  # limitations under the License.
15
  """Image processor class for SigLIP."""
16
 
17
- from typing import Dict, Optional, Union
18
-
19
- import numpy as np
20
 
21
  from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
22
  from transformers.image_transforms import (
23
- rescale,
24
  resize,
25
  to_channel_dimension_format,
26
  )
27
  from transformers.image_utils import (
 
 
28
  ChannelDimension,
29
  ImageInput,
30
  PILImageResampling,
@@ -54,7 +53,7 @@ class SiglipImageProcessor(BaseImageProcessor):
54
  `do_resize` in the `preprocess` method.
55
  size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
56
  Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
57
- resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
58
  Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
59
  do_rescale (`bool`, *optional*, defaults to `True`):
60
  Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
@@ -62,6 +61,16 @@ class SiglipImageProcessor(BaseImageProcessor):
62
  rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
63
  Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
64
  method.
 
 
 
 
 
 
 
 
 
 
65
  """
66
 
67
  model_input_names = ["pixel_values"]
@@ -70,60 +79,27 @@ class SiglipImageProcessor(BaseImageProcessor):
70
  self,
71
  do_resize: bool = True,
72
  size: Dict[str, int] = None,
73
- resample: PILImageResampling = PILImageResampling.BILINEAR,
74
  do_rescale: bool = True,
75
  rescale_factor: Union[int, float] = 1 / 255,
 
 
 
76
  **kwargs,
77
  ) -> None:
78
  super().__init__(**kwargs)
79
  size = size if size is not None else {"height": 224, "width": 224}
80
- size = get_size_dict(size, default_to_square=False)
 
81
 
82
  self.do_resize = do_resize
83
  self.size = size
84
  self.resample = resample
85
  self.do_rescale = do_rescale
86
  self.rescale_factor = rescale_factor
87
-
88
- def rescale(
89
- self,
90
- image: np.ndarray,
91
- rescale_factor: float,
92
- data_format: Optional[Union[str, ChannelDimension]] = None,
93
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
94
- **kwargs,
95
- ) -> np.ndarray:
96
- """
97
- Rescale an image by a scale factor. image = image * scale, after which image = image * 2 - 1.
98
-
99
- Args:
100
- image (`np.ndarray`):
101
- Image to rescale.
102
- scale (`float`):
103
- The scaling factor to rescale pixel values by.
104
- data_format (`str` or `ChannelDimension`, *optional*):
105
- The channel dimension format for the output image. If unset, the channel dimension format of the input
106
- image is used. Can be one of:
107
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
108
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
109
- input_data_format (`ChannelDimension` or `str`, *optional*):
110
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
111
- from the input image. Can be one of:
112
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
113
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
114
-
115
- Returns:
116
- `np.ndarray`: The rescaled image.
117
- """
118
- # first, rescale to 0->1
119
- rescaled_image = rescale(
120
- image, scale=rescale_factor, data_format=data_format, input_data_format=input_data_format, **kwargs
121
- )
122
-
123
- # next, rescale to -1->1
124
- rescaled_image = 2 * rescaled_image - 1
125
-
126
- return rescaled_image
127
 
128
  def preprocess(
129
  self,
@@ -133,6 +109,9 @@ class SiglipImageProcessor(BaseImageProcessor):
133
  resample: PILImageResampling = None,
134
  do_rescale: bool = None,
135
  rescale_factor: float = None,
 
 
 
136
  return_tensors: Optional[Union[str, TensorType]] = None,
137
  data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
138
  input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -156,6 +135,13 @@ class SiglipImageProcessor(BaseImageProcessor):
156
  Whether to rescale the image.
157
  rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
158
  Rescale factor to rescale the image by if `do_rescale` is set to `True`.
 
 
 
 
 
 
 
159
  return_tensors (`str` or `TensorType`, *optional*):
160
  The type of tensors to return. Can be one of:
161
  - Unset: Return a list of `np.ndarray`.
@@ -181,6 +167,9 @@ class SiglipImageProcessor(BaseImageProcessor):
181
  resample = resample if resample is not None else self.resample
182
  do_rescale = do_rescale if do_rescale is not None else self.do_rescale
183
  rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
 
 
 
184
 
185
  images = make_list_of_images(images)
186
 
@@ -210,14 +199,21 @@ class SiglipImageProcessor(BaseImageProcessor):
210
  input_data_format = infer_channel_dimension_format(images[0])
211
 
212
  if do_resize:
 
213
  images = [
214
- resize(image=image, size=(size["width"], size["height"]), resample=resample, input_data_format=input_data_format)
215
  for image in images
216
  ]
217
 
218
  if do_rescale:
219
  images = [
220
- self.rescale(image=image, rescale_factor=rescale_factor, input_data_format=input_data_format)
 
 
 
 
 
 
221
  for image in images
222
  ]
223
 
 
1
  # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
 
14
  # limitations under the License.
15
  """Image processor class for SigLIP."""
16
 
17
+ from typing import Dict, List, Optional, Union
 
 
18
 
19
  from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
20
  from transformers.image_transforms import (
 
21
  resize,
22
  to_channel_dimension_format,
23
  )
24
  from transformers.image_utils import (
25
+ IMAGENET_STANDARD_MEAN,
26
+ IMAGENET_STANDARD_STD,
27
  ChannelDimension,
28
  ImageInput,
29
  PILImageResampling,
 
53
  `do_resize` in the `preprocess` method.
54
  size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
55
  Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
56
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
57
  Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
58
  do_rescale (`bool`, *optional*, defaults to `True`):
59
  Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
 
61
  rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
62
  Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
63
  method.
64
+ do_normalize (`bool`, *optional*, defaults to `True`):
65
+ Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
66
+ `do_normalize` in the `preprocess` method.
67
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
68
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
69
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
70
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
71
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
72
  number of channels in the image.
73
+ Can be overridden by the `image_std` parameter in the `preprocess` method.
74
  """
75
 
76
  model_input_names = ["pixel_values"]
 
79
  self,
80
  do_resize: bool = True,
81
  size: Dict[str, int] = None,
82
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
83
  do_rescale: bool = True,
84
  rescale_factor: Union[int, float] = 1 / 255,
85
+ do_normalize: bool = True,
86
+ image_mean: Optional[Union[float, List[float]]] = None,
87
+ image_std: Optional[Union[float, List[float]]] = None,
88
  **kwargs,
89
  ) -> None:
90
  super().__init__(**kwargs)
91
  size = size if size is not None else {"height": 224, "width": 224}
92
+ image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
93
+ image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
94
 
95
  self.do_resize = do_resize
96
  self.size = size
97
  self.resample = resample
98
  self.do_rescale = do_rescale
99
  self.rescale_factor = rescale_factor
100
+ self.do_normalize = do_normalize
101
+ self.image_mean = image_mean
102
+ self.image_std = image_std
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  def preprocess(
105
  self,
 
109
  resample: PILImageResampling = None,
110
  do_rescale: bool = None,
111
  rescale_factor: float = None,
112
+ do_normalize: bool = None,
113
+ image_mean: Optional[Union[float, List[float]]] = None,
114
+ image_std: Optional[Union[float, List[float]]] = None,
115
  return_tensors: Optional[Union[str, TensorType]] = None,
116
  data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
117
  input_data_format: Optional[Union[str, ChannelDimension]] = None,
 
135
  Whether to rescale the image.
136
  rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
137
  Rescale factor to rescale the image by if `do_rescale` is set to `True`.
138
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
139
+ Whether to normalize the image.
140
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
141
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
142
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
143
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
144
+ `True`.
145
  return_tensors (`str` or `TensorType`, *optional*):
146
  The type of tensors to return. Can be one of:
147
  - Unset: Return a list of `np.ndarray`.
 
167
  resample = resample if resample is not None else self.resample
168
  do_rescale = do_rescale if do_rescale is not None else self.do_rescale
169
  rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
170
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
171
+ image_mean = image_mean if image_mean is not None else self.image_mean
172
+ image_std = image_std if image_std is not None else self.image_std
173
 
174
  images = make_list_of_images(images)
175
 
 
199
  input_data_format = infer_channel_dimension_format(images[0])
200
 
201
  if do_resize:
202
+ height, width = size["height"], size["width"]
203
  images = [
204
+ resize(image=image, size=(height, width), resample=resample, input_data_format=input_data_format)
205
  for image in images
206
  ]
207
 
208
  if do_rescale:
209
  images = [
210
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
211
+ for image in images
212
+ ]
213
+
214
+ if do_normalize:
215
+ images = [
216
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
217
  for image in images
218
  ]
219