Spaces:

rxtan
/

Koala-video-llm

Sleeping

Koala-video-llm / global_local /processors /functional_video.py

Reuben Tan

initial commit

b2afdba 8 months ago

3.92 kB

	"""
	Copyright (c) 2022, salesforce.com, inc.
	All rights reserved.
	SPDX-License-Identifier: BSD-3-Clause
	For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
	"""

	import warnings

	import torch


	def _is_tensor_video_clip(clip):
	if not torch.is_tensor(clip):
	raise TypeError("clip should be Tensor. Got %s" % type(clip))

	if not clip.ndimension() == 4:
	raise ValueError("clip should be 4D. Got %dD" % clip.dim())

	return True


	def crop(clip, i, j, h, w):
	"""
	Args:
	clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
	"""
	if len(clip.size()) != 4:
	raise ValueError("clip should be a 4D tensor")
	return clip[..., i : i + h, j : j + w]


	def resize(clip, target_size, interpolation_mode):
	if len(target_size) != 2:
	raise ValueError(
	f"target size should be tuple (height, width), instead got {target_size}"
	)
	return torch.nn.functional.interpolate(
	clip, size=target_size, mode=interpolation_mode, align_corners=False
	)


	def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
	"""
	Do spatial cropping and resizing to the video clip
	Args:
	clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
	i (int): i in (i,j) i.e coordinates of the upper left corner.
	j (int): j in (i,j) i.e coordinates of the upper left corner.
	h (int): Height of the cropped region.
	w (int): Width of the cropped region.
	size (tuple(int, int)): height and width of resized clip
	Returns:
	clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W)
	"""
	if not _is_tensor_video_clip(clip):
	raise ValueError("clip should be a 4D torch.tensor")
	clip = crop(clip, i, j, h, w)
	clip = resize(clip, size, interpolation_mode)
	return clip


	def center_crop(clip, crop_size):
	if not _is_tensor_video_clip(clip):
	raise ValueError("clip should be a 4D torch.tensor")
	h, w = clip.size(-2), clip.size(-1)
	th, tw = crop_size
	if h < th or w < tw:
	raise ValueError("height and width must be no smaller than crop_size")

	i = int(round((h - th) / 2.0))
	j = int(round((w - tw) / 2.0))
	return crop(clip, i, j, th, tw)


	def to_tensor(clip):
	"""
	Convert tensor data type from uint8 to float, divide value by 255.0 and
	permute the dimensions of clip tensor
	Args:
	clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
	Return:
	clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
	"""
	_is_tensor_video_clip(clip)
	if not clip.dtype == torch.uint8:
	raise TypeError(
	"clip tensor should have data type uint8. Got %s" % str(clip.dtype)
	)
	return clip.float().permute(3, 0, 1, 2) / 255.0


	def normalize(clip, mean, std, inplace=False):
	"""
	Args:
	clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
	mean (tuple): pixel RGB mean. Size is (3)
	std (tuple): pixel standard deviation. Size is (3)
	Returns:
	normalized clip (torch.tensor): Size is (C, T, H, W)
	"""
	if not _is_tensor_video_clip(clip):
	raise ValueError("clip should be a 4D torch.tensor")
	if not inplace:
	clip = clip.clone()
	mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
	std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
	clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
	return clip


	def hflip(clip):
	"""
	Args:
	clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W)
	Returns:
	flipped clip (torch.tensor): Size is (C, T, H, W)
	"""
	if not _is_tensor_video_clip(clip):
	raise ValueError("clip should be a 4D torch.tensor")
	return clip.flip(-1)