# 76_Nvidia_Reasoning_30B / image_processing.py
import math
from typing import Optional, Union

import numpy as np
import torch
from PIL import Image

from transformers.image_processing_base import BatchFeature
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import make_list_of_images, get_image_type, ImageInput, ImageType
from transformers.utils import TensorType
class NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(BaseImageProcessorFast):
"""
Dynamic-resolution image processor for the V3 omni model.
Each image is resized to a single tile whose patch-grid `(h_patches, w_patches)` is chosen to
land between `min_num_patches` and `max_num_patches` (on a 16×16-pixel grid), respecting
aspect ratio. This matches the algorithm in vLLM's `DynamicResolutionImageTiler`
(`vllm/model_executor/models/nano_nemotron_vl.py`) so HF and vLLM inference see identical pixel
inputs.
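
    A minimal usage sketch (hedged: it calls the internal `_preprocess` hook directly, and
    `norm_mean`/`norm_std` are normally loaded from `preprocessor_config.json`; the ImageNet
    statistics below are illustrative only):

        proc = NemotronH_Nano_Omni_Reasoning_V3ImageProcessor(
            norm_mean=[0.485, 0.456, 0.406], norm_std=[0.229, 0.224, 0.225],
        )
        out = proc._preprocess([Image.new("RGB", (1920, 1080))], return_tensors="pt")
        out["pixel_values"].shape  # torch.Size([1, 3, 1088, 1920]); dims are 16-multiples
        out["num_tokens"]          # 8160 patches // 4 -> [2040] tokens after pixel_shuffle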
"""
model_input_names = ["pixel_values"]
def __init__(
self,
norm_mean=None,
norm_std=None,
patch_size=16,
downsample_ratio=0.5,
min_num_patches=1024,
max_num_patches=13312,
max_model_len=16384,
video_target_num_patches=1024,
video_maintain_aspect_ratio=True,
**kwargs,
):
super().__init__(**kwargs)
self.norm_mean = norm_mean
self.norm_std = norm_std
self.patch_size = patch_size
self.downsample_ratio = downsample_ratio
# Integer reduction factor for pixel_shuffle (downsample_ratio = 0.5 → factor 2).
self._downsample_factor = int(round(1.0 / downsample_ratio))
# Per-image patch-grid bounds (on the pre-pixel-shuffle 16×16 grid).
self.min_num_patches = min_num_patches
self.max_num_patches = max_num_patches
self.max_model_len = max_model_len
# Video frames use a separate (fixed) target-patch budget with aspect-ratio preserved.
# Matches vLLM's `_compute_aspect_preserving_size` in `nano_nemotron_vl.py`.
self.video_target_num_patches = video_target_num_patches
self.video_maintain_aspect_ratio = video_maintain_aspect_ratio
    # Pass the PIL image through to `_preprocess` unresized; `_preprocess` converts it to a
    # tensor and resizes with torch's antialiased bicubic interpolation itself (see the
    # comment there), rather than letting the base class resize with different kernels that
    # would break bit-exact agreement with vLLM.
def _process_image(self, image: ImageInput, **kwargs):
if get_image_type(image) == ImageType.PIL:
if image.mode != "RGB":
image = image.convert("RGB")
return image
# transformers 5.6 renamed this hook from `_process_image` to `process_image`; alias both.
process_image = _process_image
# Toggled by `processing.py` around video calls (the strict `ImagesKwargs` validator won't let
# us thread a new kwarg down, so we use a flag on the instance instead).
_is_video_mode: bool = False
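    # A hedged sketch of the expected toggle pattern (the real call sites live in
    # `processing.py`, which is not part of this file):
    #
    #     image_processor._is_video_mode = True
    #     try:
    #         video_batch = image_processor._preprocess(frames, return_tensors="pt")
    #     finally:
    #         image_processor._is_video_mode = False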
def _preprocess(
self,
images,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
"""Port of vLLM's `DynamicResolutionImageTiler._images_to_pixel_values_lst`.
When `self._is_video_mode=True` (flipped by `processing.py` before the video call), each
input is resized using the **video** target-size rule (`video_target_num_patches`,
aspect-ratio preserved) instead of the image dynamic-res rule. This matches vLLM's split
between `video_to_pixel_values` (video path) and `DynamicResolutionImageTiler` (image
path).
"""
is_video = self._is_video_mode
images = make_list_of_images(images)
target_sizes = []
if is_video:
for img in images:
target_w_patches, target_h_patches = self._compute_target_patches_video(img)
target_sizes.append((target_w_patches, target_h_patches))
else:
# Image path: per-image budget bounded by [min_num_patches, max_num_patches], with a
# global cap derived from `max_model_len` × pixel-shuffle factor².
num_tokens_available = self.max_model_len - 4 # match vLLM's reserve
budget = num_tokens_available * (self._downsample_factor ** 2)
budget = max(budget, self.min_num_patches * len(images))
max_budget = self.max_num_patches if (self.max_num_patches and self.max_num_patches > 0) else float("inf")
per_image_budget = [max(min(budget, max_budget), self.min_num_patches) for _ in images]
# Single-pass — vLLM has an iterative scale-down for the batch, but it rarely binds in
# single-image / small-batch inference.
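            # Worked example with the defaults: num_tokens_available = 16384 - 4 = 16380,
            # budget = 16380 * 2**2 = 65520 patches, capped at max_num_patches = 13312, so
            # every image in the batch gets a 13312-patch budget.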
for img, tokens_for_media in zip(images, per_image_budget):
target_w_patches, target_h_patches = self._compute_target_patches(img, tokens_for_media)
target_sizes.append((target_w_patches, target_h_patches))
        if self.norm_mean is None or self.norm_std is None:
            raise ValueError(
                "norm_mean and norm_std must be set (normally via preprocessor_config.json) "
                "before preprocessing."
            )
        norm_mean = torch.tensor(self.norm_mean).view(1, 3, 1, 1)
        norm_std = torch.tensor(self.norm_std).view(1, 3, 1, 1)
pixel_values_list = []
num_tokens_per_image = []
imgs_sizes = []
for img, (wp, hp) in zip(images, target_sizes):
target_w = wp * self.patch_size
target_h = hp * self.patch_size
# Use torch's antialiased bicubic interpolation to match vLLM's
# `_bicubic_resize_and_normalize` (`torch.nn.functional.interpolate`, `antialias=True`).
# PIL's bicubic uses a different kernel (and no antialiasing), producing visibly different
# pixel values that amplify through the 52-layer ViT / mamba stack and cause HF/vLLM
# outputs to diverge past the first few tokens.
arr = np.asarray(img, dtype=np.uint8) # (H, W, 3)
t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(dtype=torch.float32) # (1, 3, H, W)
if t.shape[-2] != target_h or t.shape[-1] != target_w:
t = torch.nn.functional.interpolate(
t, size=(target_h, target_w), mode="bicubic", align_corners=False, antialias=True
)
t = (t / 255.0 - norm_mean) / norm_std
pixel_values_list.append(t.squeeze(0)) # (3, H, W)
num_tokens_per_image.append((wp * hp) // (self._downsample_factor ** 2))
imgs_sizes.append((target_h, target_w))
# Stack if all images have the same target size (common for same-aspect-ratio batches);
# otherwise keep as a list of (3, H_i, W_i) tensors. The outer model's `extract_feature`
# handles both.
all_same_shape = all(t.shape == pixel_values_list[0].shape for t in pixel_values_list)
if all_same_shape:
pixel_values = torch.stack(pixel_values_list, dim=0)
else:
pixel_values = pixel_values_list
return BatchFeature(
data={
"pixel_values": pixel_values,
# One tile per image in dynamic mode — `num_tokens` is what the text-side
# placeholder expansion should use.
"num_patches": [1] * len(images),
"num_tokens": num_tokens_per_image,
"imgs_sizes": imgs_sizes,
},
tensor_type=(return_tensors if all_same_shape else None),
)
def _compute_target_patches(self, img: Image.Image, tokens_available: int):
"""Port of `DynamicResolutionImageTiler.process_media` (image-only, no thumbnail)."""
orig_w, orig_h = img.width, img.height
        # Ceil-ish: `round(x + 0.5)` == `floor(x) + 1` for non-integer x; for integer x,
        # Python's banker's rounding gives `x` when x is even and `x + 1` when x is odd.
        # Kept as-is to stay bit-exact with the vLLM port.
        closest_patch_h = round(orig_h / self.patch_size + 0.5)
        closest_patch_w = round(orig_w / self.patch_size + 0.5)
patches = closest_patch_h * closest_patch_w
# Downscale to fit the token budget.
factor = min(math.sqrt(tokens_available / patches), 1.0)
        # `max(..., 1)` guards the degenerate case where an extreme aspect ratio would floor
        # a dimension to 0 and divide by zero in the scale-up step below.
        target_h = max(math.floor(factor * closest_patch_h), 1)
        target_w = max(math.floor(factor * closest_patch_w), 1)
# Scale up if below the per-image minimum.
if (
tokens_available > self.min_num_patches
and target_h * target_w < self.min_num_patches
):
up = math.sqrt(self.min_num_patches / (target_h * target_w))
target_h = math.ceil(up * target_h)
target_w = math.ceil(up * target_w)
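            # e.g. a 64x64 input: grid 4x4 = 16 patches < min_num_patches = 1024, so
            # up = sqrt(1024 / 16) = 8 and both dims become ceil(8 * 4) = 32 (32 * 32 = 1024).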
# Round each dim to a multiple of the pixel_shuffle factor so tokens divide evenly.
divisor = self._downsample_factor
rem_h = target_h % divisor
if rem_h:
inc_h = divisor - rem_h
if (target_h + inc_h) * target_w <= tokens_available:
target_h += inc_h
else:
target_h = max(divisor, target_h - rem_h)
rem_w = target_w % divisor
if rem_w:
inc_w = divisor - rem_w
if target_h * (target_w + inc_w) <= tokens_available:
target_w += inc_w
else:
target_w = max(divisor, target_w - rem_w)
return target_w, target_h
def _compute_target_patches_video(self, img: Image.Image):
"""Port of vLLM's `_compute_aspect_preserving_size` for video frames.
Each frame is resized to roughly `video_target_num_patches` (default 1024) on the 16×16
grid, with aspect ratio preserved and dims snapped to a multiple of the pixel_shuffle
factor. For `maintain_aspect_ratio=False`, it falls back to a square of sqrt(target)
patches.
"""
orig_w, orig_h = img.width, img.height
target = self.video_target_num_patches
divisor = self._downsample_factor # 2 for pixel_shuffle
if self.video_maintain_aspect_ratio:
aspect_wh = orig_w / max(orig_h, 1)
ph = max(round(math.sqrt(target / aspect_wh)), 1)
pw = max(round(math.sqrt(target * aspect_wh)), 1)
if divisor > 1:
rem_h = ph % divisor
rem_w = pw % divisor
ph_up = ph + (divisor - rem_h if rem_h else 0)
ph_down = ph - rem_h
pw_up = pw + (divisor - rem_w if rem_w else 0)
pw_down = pw - rem_w
# Prefer rounding up when the up-rounded patch count still fits the target;
# otherwise round down (mirrors vLLM's logic exactly).
if ph_up * pw_up <= target:
ph, pw = ph_up, pw_up
else:
ph = max(divisor, ph_down)
pw = max(divisor, pw_down)
else:
side = int(math.sqrt(target))
side = max(divisor, (side // divisor) * divisor)
ph = pw = side
return pw, ph