import math
from typing import Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers.image_processing_base import BatchFeature
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import ImageInput, ImageType, get_image_type, make_list_of_images
from transformers.utils import TensorType


class NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(BaseImageProcessorFast):
    """
    Dynamic-resolution image processor for the V3 omni model.

    Each image is resized to a single tile whose patch grid `(h_patches, w_patches)` is chosen
    to land between `min_num_patches` and `max_num_patches` (on a 16×16-pixel grid) while
    respecting the aspect ratio. This matches the algorithm in vLLM's
    `DynamicResolutionImageTiler` (`vllm/model_executor/models/nano_nemotron_vl.py`), so HF and
    vLLM inference see identical pixel inputs.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        norm_mean=None,
        norm_std=None,
        patch_size=16,
        downsample_ratio=0.5,
        min_num_patches=1024,
        max_num_patches=13312,
        max_model_len=16384,
        video_target_num_patches=1024,
        video_maintain_aspect_ratio=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Per-channel normalization stats; these normally come from the checkpoint's
        # preprocessor config and must be set before `_preprocess` is called.
        self.norm_mean = norm_mean
        self.norm_std = norm_std
        self.patch_size = patch_size
        self.downsample_ratio = downsample_ratio
        # Pixel-shuffle factor, e.g. 2 for downsample_ratio=0.5: each factor×factor
        # block of patches collapses into a single vision token.
        self._downsample_factor = int(round(1.0 / downsample_ratio))
        self.min_num_patches = min_num_patches
        self.max_num_patches = max_num_patches
        self.max_model_len = max_model_len
        # Video frames use a fixed per-frame patch budget instead of the dynamic image rule.
        self.video_target_num_patches = video_target_num_patches
        self.video_maintain_aspect_ratio = video_maintain_aspect_ratio

    def _process_image(self, image: ImageInput, **kwargs):
        # Only PIL inputs need mode handling; other input types pass through unchanged.
        if get_image_type(image) == ImageType.PIL and image.mode != "RGB":
            image = image.convert("RGB")
        return image

    # Public alias for the private helper.
    process_image = _process_image

    # Flipped to True by `processing.py` before the video call; see `_preprocess`.
    _is_video_mode: bool = False

    def _preprocess(
        self,
        images,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """Port of vLLM's `DynamicResolutionImageTiler._images_to_pixel_values_lst`.

        When `self._is_video_mode` is True (flipped by `processing.py` before the video call),
        each input is resized using the **video** target-size rule (`video_target_num_patches`,
        aspect ratio preserved) instead of the dynamic-resolution image rule. This matches
        vLLM's split between `video_to_pixel_values` (video path) and
        `DynamicResolutionImageTiler` (image path).
        """
        is_video = self._is_video_mode
        images = make_list_of_images(images)

        # First pass: choose a target patch grid (w_patches, h_patches) for every input.
        target_sizes = []
        if is_video:
            for img in images:
                target_w_patches, target_h_patches = self._compute_target_patches_video(img)
                target_sizes.append((target_w_patches, target_h_patches))
        else:
            # Patch budget derived from the context length: a few positions are reserved
            # for special tokens, and each vision token covers `_downsample_factor ** 2`
            # patches after pixel shuffle.
            num_tokens_available = self.max_model_len - 4
            budget = num_tokens_available * (self._downsample_factor ** 2)
            budget = max(budget, self.min_num_patches * len(images))
            max_budget = self.max_num_patches if (self.max_num_patches and self.max_num_patches > 0) else float("inf")
            per_image_budget = [max(min(budget, max_budget), self.min_num_patches) for _ in images]

            for img, tokens_for_media in zip(images, per_image_budget):
                target_w_patches, target_h_patches = self._compute_target_patches(img, tokens_for_media)
                target_sizes.append((target_w_patches, target_h_patches))

        if self.norm_mean is None or self.norm_std is None:
            raise ValueError("`norm_mean` and `norm_std` must be set before preprocessing.")
        norm_mean = torch.tensor(self.norm_mean).view(1, 3, 1, 1)
        norm_std = torch.tensor(self.norm_std).view(1, 3, 1, 1)

        # Second pass: resize, rescale to [0, 1], and normalize each image.
        pixel_values_list = []
        num_tokens_per_image = []
        imgs_sizes = []
        for img, (wp, hp) in zip(images, target_sizes):
            target_w = wp * self.patch_size
            target_h = hp * self.patch_size

            # HWC uint8 -> NCHW float32 so torch's antialiased bicubic resize can be used.
            arr = np.asarray(img, dtype=np.uint8)
            t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(dtype=torch.float32)
            if t.shape[-2] != target_h or t.shape[-1] != target_w:
                t = torch.nn.functional.interpolate(
                    t, size=(target_h, target_w), mode="bicubic", align_corners=False, antialias=True
                )
            t = (t / 255.0 - norm_mean) / norm_std
            pixel_values_list.append(t.squeeze(0))
            num_tokens_per_image.append((wp * hp) // (self._downsample_factor ** 2))
            imgs_sizes.append((target_h, target_w))

        # Stack into a single tensor only when every image resolved to the same tile
        # size; otherwise keep a list of per-image tensors.
        all_same_shape = all(t.shape == pixel_values_list[0].shape for t in pixel_values_list)
        if all_same_shape:
            pixel_values = torch.stack(pixel_values_list, dim=0)
        else:
            pixel_values = pixel_values_list

        return BatchFeature(
            data={
                "pixel_values": pixel_values,
                # One tile per image: this processor never splits an image into sub-tiles.
                "num_patches": [1] * len(images),
                "num_tokens": num_tokens_per_image,
                "imgs_sizes": imgs_sizes,
            },
            tensor_type=(return_tensors if all_same_shape else None),
        )
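
    # For reference: with equal-sized inputs `pixel_values` is a stacked
    # (N, 3, H, W) float32 tensor; with mixed sizes it stays a Python list of
    # (3, H, W) tensors and `return_tensors` is ignored.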

    def _compute_target_patches(self, img: Image.Image, tokens_available: int):
        """Port of `DynamicResolutionImageTiler.process_media` (image-only, no thumbnail)."""
        orig_w, orig_h = img.width, img.height
        # `round(x + 0.5)` is the port's ceiling idiom: the smallest patch grid that
        # fully covers the image at `patch_size` granularity.
        closest_patch_h = round(orig_h / self.patch_size + 0.5)
        closest_patch_w = round(orig_w / self.patch_size + 0.5)
        patches = closest_patch_h * closest_patch_w

        # Scale the grid down (never up) to fit the patch budget. Clamp each dim to at
        # least 1 so extreme aspect ratios cannot collapse it to zero (which would
        # divide by zero in the upscale step below).
        factor = min(math.sqrt(tokens_available / patches), 1.0)
        target_h = max(math.floor(factor * closest_patch_h), 1)
        target_w = max(math.floor(factor * closest_patch_w), 1)

        # If flooring pushed the grid below the minimum, scale it back up.
        if (
            tokens_available > self.min_num_patches
            and target_h * target_w < self.min_num_patches
        ):
            up = math.sqrt(self.min_num_patches / (target_h * target_w))
            target_h = math.ceil(up * target_h)
            target_w = math.ceil(up * target_w)

        # Snap each dimension to a multiple of the pixel-shuffle factor: round up when
        # the budget allows, otherwise round down (never below one factor).
        divisor = self._downsample_factor
        rem_h = target_h % divisor
        if rem_h:
            inc_h = divisor - rem_h
            if (target_h + inc_h) * target_w <= tokens_available:
                target_h += inc_h
            else:
                target_h = max(divisor, target_h - rem_h)
        rem_w = target_w % divisor
        if rem_w:
            inc_w = divisor - rem_w
            if target_h * (target_w + inc_w) <= tokens_available:
                target_w += inc_w
            else:
                target_w = max(divisor, target_w - rem_w)

        return target_w, target_h
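
    # Worked example (illustrative numbers): a 1920x1080 image covers a 120x68 patch
    # grid (8160 patches). With a 13312-patch budget the scale factor caps at 1.0 and
    # both dims are already even (divisor 2 for downsample_ratio=0.5), so the tile is
    # 1920x1088 px and yields 8160 / 4 = 2040 vision tokens.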

    def _compute_target_patches_video(self, img: Image.Image):
        """Port of vLLM's `_compute_aspect_preserving_size` for video frames.

        Each frame is resized to roughly `video_target_num_patches` (default 1024) patches on
        the 16×16 grid, with aspect ratio preserved and both dims snapped to a multiple of the
        pixel-shuffle factor. With `video_maintain_aspect_ratio=False`, it falls back to a
        square of side sqrt(target) patches.
        """
        orig_w, orig_h = img.width, img.height
        target = self.video_target_num_patches
        divisor = self._downsample_factor
        if self.video_maintain_aspect_ratio:
            # Solve ph * pw ≈ target subject to pw / ph ≈ orig_w / orig_h.
            aspect_wh = orig_w / max(orig_h, 1)
            ph = max(round(math.sqrt(target / aspect_wh)), 1)
            pw = max(round(math.sqrt(target * aspect_wh)), 1)
            if divisor > 1:
                rem_h = ph % divisor
                rem_w = pw % divisor
                ph_up = ph + (divisor - rem_h if rem_h else 0)
                ph_down = ph - rem_h
                pw_up = pw + (divisor - rem_w if rem_w else 0)
                pw_down = pw - rem_w
                # Prefer rounding up; fall back to rounding down when that would
                # exceed the frame's patch budget.
                if ph_up * pw_up <= target:
                    ph, pw = ph_up, pw_up
                else:
                    ph = max(divisor, ph_down)
                    pw = max(divisor, pw_down)
        else:
            side = int(math.sqrt(target))
            side = max(divisor, (side // divisor) * divisor)
            ph = pw = side
        return pw, ph
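
    # Worked example (illustrative numbers): a 1920x1080 frame with target 1024 gives
    # ph = round(sqrt(1024 / 1.778)) = 24 and pw = round(sqrt(1024 * 1.778)) = 43.
    # Rounding both up to multiples of 2 (24 x 44 = 1056) would exceed the target, so
    # the grid rounds down to 24 x 42 = 1008 patches, i.e. a 672x384 px frame.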
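

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the vLLM port). The normalization
    # stats below are the standard ImageNet values and are an assumption here;
    # real checkpoints ship theirs in preprocessor_config.json.
    processor = NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(
        norm_mean=[0.485, 0.456, 0.406],
        norm_std=[0.229, 0.224, 0.225],
    )
    demo = Image.new("RGB", (1920, 1080), color=(127, 127, 127))
    out = processor._preprocess([demo])
    print(out["pixel_values"].shape)  # torch.Size([1, 3, 1088, 1920])
    print(out["num_tokens"], out["imgs_sizes"])  # [2040] [(1088, 1920)]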