Karroyan committed on
Commit
d5f94d4
·
verified ·
1 Parent(s): 48185c6

Upload image_processing_keye.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. image_processing_keye.py +568 -0
image_processing_keye.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Image processor class for Keye."""
15
+
16
+ import math
17
+ from typing import Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
22
+ from torchvision.transforms import functional as TF
23
+ from transformers.image_transforms import (
24
+ convert_to_rgb,
25
+ resize,
26
+ to_channel_dimension_format,
27
+ )
28
+ from transformers.image_utils import (
29
+ OPENAI_CLIP_MEAN,
30
+ OPENAI_CLIP_STD,
31
+ ChannelDimension,
32
+ PILImageResampling,
33
+ get_image_size,
34
+ infer_channel_dimension_format,
35
+ is_scaled_image,
36
+ is_valid_image,
37
+ make_list_of_images,
38
+ to_numpy_array,
39
+ valid_images,
40
+ validate_preprocess_arguments,
41
+ )
42
+ from transformers.utils import TensorType, is_vision_available, logging
43
+
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+
48
+ if is_vision_available():
49
+ from PIL import Image
50
+
51
# Inputs accepted as image(s): a single image or a flat list of images.
# PIL and torch types are written as strings so the alias can be built
# without importing those packages here.
ImageInput = Union[
    "PIL.Image.Image",
    np.ndarray,
    "torch.Tensor",
    List["PIL.Image.Image"],
    List[np.ndarray],
    List["torch.Tensor"],
]  # noqa


# Inputs accepted as video(s): a list of frames, a single array/tensor,
# or a list of such videos.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "torch.Tensor",
    List["np.ndarray"],
    List["torch.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],  # was misspelled "np.ndarrray"
    List[List["torch.Tensor"]],
]  # noqa
71
+
72
+
73
def make_batched_images(images) -> "List[ImageInput]":
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a list/tuple of images, or a list/tuple of lists/tuples of images.

    Returns:
        list: A flat list of images. A nested input is flattened, a flat list/tuple is returned
        unchanged, and a single image is wrapped in a one-element list.

    Raises:
        ValueError: If `images` is empty, starts with an empty sub-list, or does not contain
            valid images.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` lookups below so empty input reports the same ValueError
        # as any other unsupported input instead of a bare IndexError.
        if len(images) > 0:
            first = images[0]
            if isinstance(first, (list, tuple)):
                if len(first) > 0 and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
98
+
99
+
100
def adjust_size(size, patch_size):
    """Round `size` down to a multiple of `patch_size` that spans an even number of patches."""
    patch_count = size // patch_size
    # If the patch count is odd, subtract one so it becomes even.
    patch_count -= patch_count % 2
    return patch_count * patch_size
105
+
106
+
107
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize the supported video input layouts into a list of videos (each a sequence of frames).

    Args:
        videos: A nested list/tuple of frames, a flat list/tuple of PIL frames, a list/tuple of
            4-dimensional arrays/tensors, or a single 4-dimensional array/tensor.

    Returns:
        A list of videos. Nested input is returned unchanged; a flat list of PIL frames is
        wrapped as one video; 4-dimensional inputs are split along their first axis.

    Raises:
        ValueError: If `videos` matches none of the supported layouts.
    """
    is_sequence = isinstance(videos, (list, tuple))

    # Already a batch of videos, each a list/tuple of frames.
    if is_sequence and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos

    if is_sequence and is_valid_image(videos[0]):
        head = videos[0]
        # A flat list of PIL frames is one video.
        if isinstance(head, Image.Image):
            return [videos]
        # A list of 4-dimensional arrays/tensors: one video per entry.
        if len(head.shape) == 4:
            return [list(clip) for clip in videos]
    elif is_valid_image(videos) and len(videos.shape) == 4:
        # A single 4-dimensional array/tensor is one video.
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
125
+
126
+
127
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    A dimension smaller than `factor` is first scaled up to `factor` (the other dimension is
    scaled by the same ratio) and a warning is logged.

    Args:
        height (`int`): Original image height, must be positive.
        width (`int`): Original image width, must be positive.
        factor (`int`, *optional*, defaults to 28): Both returned dimensions are multiples of this.
        min_pixels (`int`, *optional*): Lower bound on `height * width` of the result.
        max_pixels (`int`, *optional*): Upper bound on `height * width` of the result.

    Returns:
        A `(resized_height, resized_width)` tuple; each value is at least `factor`.

    Raises:
        ValueError: If a dimension is not positive, or if the aspect ratio exceeds 200.
    """
    if height <= 0 or width <= 0:
        # Without this guard a zero dimension surfaces as ZeroDivisionError below.
        raise ValueError(f"height:{height} and width:{width} must be positive")

    if height < factor:
        logger.warning(
            "smart_resize: height=%s < factor=%s, reset height=factor", height, factor
        )
        width = round((width * factor) / height)
        height = factor

    if width < factor:
        logger.warning(
            "smart_resize: width=%s < factor=%s, reset width=factor", width, factor
        )
        height = round((height * factor) / width)
        width = factor

    if max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Clamp to `factor`: for elongated images the floor can otherwise reach 0,
        # which is not a usable resize target.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
173
+
174
+
175
class SiglipImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Siglip image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `28 * 28 * 130`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 1):
            The temporal patch size of the vision encoder. `_preprocess` only supports a value of 1.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        # Not used for resizing; `preprocess` only forwards it to `validate_preprocess_arguments`.
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def mvit_rescale(self, image: "Image.Image", merge_size: int = 2) -> "Image.Image":
        """
        Shrink a PIL image to a token budget, then pad or crop it onto the patch grid.

        The image is downscaled when `(w // patch_size) * (h // patch_size)` exceeds
        `self.in_token_limit`. Afterwards it is either padded on the right/bottom to a multiple of
        `merge_size * patch_size` (when `self.pad_input` is truthy) or center-cropped to an even
        number of patches per side. Finally, any side spanning 512 or more patches is
        center-cropped to at most 510 patches.

        Args:
            image (`PIL.Image.Image`): The image to rescale; must expose a `(width, height)` `size`.
            merge_size (`int`, *optional*, defaults to 2): Patch merge factor used for padding.

        Returns:
            `PIL.Image.Image`: The rescaled image.

        Raises:
            ValueError: If `image.size` cannot be unpacked into `(width, height)`.
        """
        # NOTE(review): `in_token_limit` and `pad_input` are not assigned in `__init__`;
        # presumably they are supplied through `**kwargs` / the saved config - confirm.
        try:
            w, h = image.size
        except (AttributeError, TypeError, ValueError) as err:
            raise ValueError(str((type(image), image))) from err
        patch_size = self.patch_size

        num_tokens = (w // patch_size) * (h // patch_size)
        if num_tokens > self.in_token_limit:
            scale = math.sqrt(self.in_token_limit / num_tokens)
            new_w, new_h = int(w * scale), int(h * scale)

            image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
        if self.pad_input:
            new_w, new_h = image.size
            pad_size_h = merge_size * patch_size
            pad_size_w = merge_size * patch_size

            # Amount needed to reach the next multiple (0 when already aligned).
            pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
            pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w

            image = TF.pad(image, (0, 0, pad_w, pad_h))
        else:
            new_w, new_h = image.size
            new_w = new_w - new_w % patch_size
            new_h = new_h - new_h % patch_size

            new_w = adjust_size(new_w, patch_size)
            new_h = adjust_size(new_h, patch_size)

            image = TF.center_crop(image, (new_h, new_w))

        w, h = image.size
        if w // patch_size >= 512 or h // patch_size >= 512:
            new_h = min(patch_size * 510, h)
            new_w = min(patch_size * 510, w)
            image = TF.center_crop(image, (new_h, new_w))
        return image

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess one image (or the frames of one video) into flattened patches.

        The resize target is derived from the first image only and applied to every image in
        `images`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Inferred from the first image when unset. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            A tuple `(flatten_patches, (grid_t, grid_h, grid_w))` where `flatten_patches` has shape
            `(grid_t * grid_h * grid_w, channel, patch_size, patch_size)`.

        Raises:
            ValueError: If `self.temporal_patch_size` is not 1.
        """
        # The final flattening assumes one frame per temporal patch. Raise explicitly
        # (instead of `assert`) so the check survives `python -O`.
        if self.temporal_patch_size != 1:
            raise ValueError(
                f"temporal_patch_size must be 1, got {self.temporal_patch_size}"
            )

        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        if do_resize:
            # Depends only on the first image's size, so compute it once outside the loop.
            resized_height, resized_width = smart_resize(
                height,
                width,
                factor=self.patch_size * self.merge_size,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
            )

        processed_images = []
        for image in images:
            if do_resize:
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    input_data_format=input_data_format,
                )

            if do_rescale:
                image = self.rescale(
                    image, scale=rescale_factor, input_data_format=input_data_format
                )

            if do_normalize:
                image = self.normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    input_data_format=input_data_format,
                )
            image = to_channel_dimension_format(
                image, data_format, input_channel_dim=input_data_format
            )
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # Patch extraction below works on (frames, channel, height, width).
            patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        # -> (grid_t, grid_h, grid_w, channel, temporal_patch_size, patch_size, patch_size)
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
        )
        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess images and/or videos into flattened patches plus their patch-grid sizes.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`. May be `None`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Only forwarded to `validate_preprocess_arguments`; the resize bounds come from
                `self.min_pixels` and `self.max_pixels`.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `BatchFeature`: Contains `pixel_values` and `image_grid_thw` when `images` is given, and
            `pixel_values_videos` and `video_grid_thw` when `videos` is given. Empty when both are `None`.

        Raises:
            ValueError: If `images` contains an invalid image type.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        # Accumulate into a single dict so that image outputs are not discarded when
        # videos are passed in the same call, and so `data` is always defined.
        data = {}

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            data.update(
                {
                    "pixel_values": np.array(pixel_values),
                    "image_grid_thw": np.array(vision_grid_thws),
                }
            )

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for frames in videos:
                patches, video_grid_thw = self._preprocess(
                    frames,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            data.update(
                {
                    "pixel_values_videos": np.array(pixel_values),
                    "video_grid_thw": np.array(vision_grid_thws),
                }
            )

        return BatchFeature(data=data, tensor_type=return_tensors)