Spaces:

Koi953215
/

DiffIR2VR

Running on Zero

App Files Files Community

DiffIR2VR / utils /helpers.py

jimmycv07

first commit

1de8821 4 months ago

raw

history blame

19.3 kB

	from typing import overload, Tuple, Optional

	import os
	import cv2
	import torch
	from torch import nn
	import torch.nn.functional as F
	import torchvision.transforms as T

	import numpy as np
	from glob import glob
	from PIL import Image
	from einops import rearrange

	from model.cldm import ControlLDM
	from model.gaussian_diffusion import Diffusion
	from model.bsrnet import RRDBNet
	from model.swinir import SwinIR
	from model.scunet import SCUNet
	from utils.sampler import SpacedSampler
	from utils.cond_fn import Guidance
	from utils.video_visualizer import VideoVisualizer
	from utils.common import wavelet_decomposition, wavelet_reconstruction, count_vram_usage

	import vidtome
	from GMFlow.gmflow.gmflow import GMFlow
	from utils.flow_utils import get_warped_and_mask

	def save_video(input_folder, out_path, output_name, fps=25):
	video_visualizer = VideoVisualizer(path=os.path.join(out_path, output_name),
	frame_size=None,
	fps=fps)
	input_folder = os.path.join(out_path, input_folder)
	imgs = sorted([filename for filename in os.listdir(input_folder) if filename.endswith(('.png', '.jpg'))], key=lambda x: int(x.split('.')[0]))
	for img in imgs:
	img_pth = os.path.join(input_folder, img)
	image = cv2.imread(img_pth)
	image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	video_visualizer.add(image)
	video_visualizer.save()

	def batch_bicubic_resize(img: np.ndarray, scale: float) -> np.ndarray:

	if scale != 1:
	for i in range(img.shape[0]):
	img[i] = bicubic_resize(img[i], scale)
	# pil = Image.fromarray(img)
	# res = pil.resize(tuple(int(x * scale) for x in pil.size), Image.BICUBIC)
	return img

	def bicubic_resize(img: np.ndarray, scale: float) -> np.ndarray:

	if scale != 1:
	pil = Image.fromarray(img)
	res = pil.resize(tuple(int(x * scale) for x in pil.size), Image.BICUBIC)
	return np.array(res)


	def resize_short_edge_to(imgs: torch.Tensor, size: int) -> torch.Tensor:
	_, _, h, w = imgs.size()
	if h == w:
	new_h, new_w = size, size
	elif h < w:
	new_h, new_w = size, int(w * (size / h))
	else:
	new_h, new_w = int(h * (size / w)), size
	return F.interpolate(imgs, size=(new_h, new_w), mode="bicubic", antialias=True)


	def pad_to_multiples_of(imgs: torch.Tensor, multiple: int) -> torch.Tensor:
	_, _, h, w = imgs.size()
	if h % multiple == 0 and w % multiple == 0:
	return imgs.clone()
	# get_pad = lambda x: (x // multiple + 1) * multiple - x
	get_pad = lambda x: (x // multiple + int(x % multiple != 0)) * multiple - x
	ph, pw = get_pad(h), get_pad(w)
	return F.pad(imgs, pad=(0, pw, 0, ph), mode="constant", value=0)


	class Pipeline:

	def __init__(self, stage1_model: nn.Module, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str) -> None:
	self.stage1_model = stage1_model
	self.cldm = cldm
	self.diffusion = diffusion
	self.cond_fn = cond_fn
	self.device = device
	self.final_size: Tuple[int] = None

	def set_final_size(self, lq: torch.Tensor) -> None:
	h, w = lq.shape[2:]
	self.final_size = (h, w)

	@overload
	def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
	...

	@count_vram_usage
	def run_stage2(
	self,
	clean: torch.Tensor,
	steps: int,
	strength: float,
	tiled: bool,
	tile_size: int,
	tile_stride: int,
	pos_prompt: str,
	neg_prompt: str,
	cfg_scale: float,
	better_start: float,
	index: int = 0,
	input: str = None
	) -> torch.Tensor:
	### preprocess
	bs, _, ori_h, ori_w = clean.shape
	# pad: ensure that height & width are multiples of 64
	pad_clean = pad_to_multiples_of(clean, multiple=64)
	h, w = pad_clean.shape[2:]
	if self.cldm.controller is not None:
	self.cldm.controller.cldm = self.cldm
	self.cldm.controller.non_pad_ratio = (ori_h / h, ori_w / w)
	self.cldm.vae.decoder.controller = self.cldm.controller
	# prepare conditon
	if not tiled:
	cond = self.cldm.prepare_condition(pad_clean, [pos_prompt] * bs)
	uncond = self.cldm.prepare_condition(pad_clean, [neg_prompt] * bs)
	else:
	cond = self.cldm.prepare_condition_tiled(pad_clean, [pos_prompt] * bs, tile_size, tile_stride)
	uncond = self.cldm.prepare_condition_tiled(pad_clean, [neg_prompt] * bs, tile_size, tile_stride)
	if self.cond_fn:
	self.cond_fn.load_target(pad_clean * 2 - 1)
	old_control_scales = self.cldm.control_scales
	self.cldm.control_scales = [strength] * 13
	if better_start:
	# using noised low frequency part of condition as a better start point of
	# reverse sampling, which can prevent our model from generating noise in
	# image background.
	_, low_freq = wavelet_decomposition(pad_clean)
	# low_freq = pad_clean
	if not tiled:
	x_0 = self.cldm.vae_encode(low_freq, batch_size=5)
	else:
	x_0 = self.cldm.vae_encode_tiled(low_freq, tile_size, tile_stride)
	x_T = self.diffusion.q_sample(
	x_0,
	torch.full((bs, ), self.diffusion.num_timesteps - 1, dtype=torch.long, device=self.device),
	torch.randn(x_0.shape, dtype=torch.float32, device=self.device)
	)
	# print(f"diffusion sqrt_alphas_cumprod: {self.diffusion.sqrt_alphas_cumprod[-1]}")
	else:
	if self.cldm.latent_control:
	print(f"[INFO] random initialize {bs} same latents")
	x_T = 1 * torch.randn((1, 4, h // 8, w // 8), dtype=torch.float32, device=self.device)
	x_T = x_T.repeat(bs, 1, 1, 1)
	else:
	print(f"[INFO] random initialize {bs} latents")
	x_T = torch.randn((bs, 4, h // 8, w // 8), dtype=torch.float32, device=self.device)
	''' loaded latents '''
	# t = 981
	# latent_fname = f'noisy_latents_{t}.pt'
	# # model_key = config.model_key.split('/')[-1]
	# model_key = "stable-diffusion-2-1-base"
	# inversion_path = os.path.join("latents", os.path.basename(input), "latents")
	# # outputs/bear_4_BD/latents/stable-diffusion-v1-5/noisy_latents_981.pt
	# lp = os.path.join(inversion_path, model_key, latent_fname)
	# latents = torch.load(lp)

	# # init_noise = latents.to(dtype).to(args.device)
	# x_T = latents[index][None].to(torch.float32).to(self.device)
	# print(f"[INFO] loaded latents[{index}]")
	''' loaded latent ended '''
	### run sampler
	sampler = SpacedSampler(self.diffusion.betas)
	z = sampler.sample(
	model=self.cldm, device=self.device, steps=steps, batch_size=bs, x_size=(4, h // 8, w // 8),
	cond=cond, uncond=uncond, cfg_scale=cfg_scale, x_T=x_T, progress=True,
	progress_leave=True, cond_fn=self.cond_fn, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride,
	non_pad_ratio=(ori_h / h, ori_w / w)
	)
	if not tiled:
	if ori_w > 1500:
	x = self.cldm.vae_decode(z, batch_size=2)
	else:
	x = self.cldm.vae_decode(z, batch_size=5)
	else:
	x = self.cldm.vae_decode_tiled(z, tile_size // 8, tile_stride // 8)
	### postprocess
	self.cldm.control_scales = old_control_scales
	sample = x[:, :, :ori_h, :ori_w]
	return sample

	@torch.no_grad()
	def run(
	self,
	lq: np.ndarray,
	steps: int,
	strength: float,
	tiled: bool,
	tile_size: int,
	tile_stride: int,
	pos_prompt: str,
	neg_prompt: str,
	cfg_scale: float,
	better_start: bool,
	index: int = 0,
	input: str = None,
	final_size: Tuple[int] = None,
	flow_model: GMFlow = None,
	hq: np.ndarray = None
	) -> np.ndarray:
	# image to tensor
	lq = torch.tensor((lq / 255.).clip(0, 1), dtype=torch.float32, device=self.device)
	lq = rearrange(lq, "n h w c -> n c h w").contiguous()
	# set pipeline output size
	if final_size is None:
	self.set_final_size(lq)
	else:
	self.final_size = final_size

	clean = self.run_stage1(lq)
	print(f"[INFO] {clean.shape}")
	# import ipdb; ipdb.set_trace()
	# clean = F.interpolate(lq, size=clean.shape[-2:], mode='bicubic', align_corners=False)
	''' hq flow & occlusion mask '''
	# hq = torch.tensor((hq / 255.).clip(0, 1), dtype=torch.float32, device=self.device)
	# hq = rearrange(hq, "n h w c -> n c h w").contiguous()
	# hq = resize_short_edge_to(hq, size=512)
	# pre_keyframe_lq = None

	# if self.cldm.controller is not None and \
	# self.cldm.controller.step_store["pre_keyframe_lq"] is not None:

	# pre_keyframe_lq = self.cldm.controller.step_store["pre_keyframe_lq"]
	# pre_keyframe_lq = torch.tensor((pre_keyframe_lq / 255.).clip(0, 1), dtype=torch.float32, device=self.device)
	# pre_keyframe_lq = rearrange(pre_keyframe_lq, "n h w c -> n c h w").contiguous()
	# pre_keyframe_lq = resize_short_edge_to(pre_keyframe_lq, size=512)
	# pre_keyframe_clean = pre_keyframe_lq[0]
	# # pre_keyframe_clean = self.run_stage1(pre_keyframe_lq)[0]

	# flows, masks, confids = [], [], []
	# mid = lq.shape[0] // 2
	# for k in range(lq.shape[0]):
	# if k == mid:
	# if pre_keyframe_lq is not None:
	# tar_img = (torch.clamp(hq[mid], 0 ,1) * 255).float().to(self.device)
	# src_img = (torch.clamp(pre_keyframe_clean, 0 ,1) * 255).float().to(self.device)
	# else:
	# flows.append(None)
	# masks.append(None)
	# confids.append(None)
	# continue
	# else:
	# tar_img = (torch.clamp(hq[k], 0 ,1) * 255).float().to(self.device)
	# src_img = (torch.clamp(hq[mid], 0 ,1) * 255).float().to(self.device)
	# # tar_img = stage1_x[0].float().to(args.device)
	# _, bwd_occ, bwd_flow, bwd_confid = get_warped_and_mask(
	# flow_model, src_img, tar_img, image3=None, pixel_consistency=False, return_confidence=True)
	# blend_mask = T.GaussianBlur(kernel_size=(9, 9), sigma=(18, 18))(
	# F.max_pool2d(bwd_occ, kernel_size=9, stride=1, padding=4))

	# blend_mask = torch.clamp(blend_mask + bwd_occ, 0, 1)
	# blend_mask = 1 - F.max_pool2d(blend_mask, kernel_size=8)

	# bwd_confid = F.max_pool2d(bwd_confid, kernel_size=8)

	# bwd_flow = F.interpolate(bwd_flow / 8.0, scale_factor=1. / 8, mode='bilinear')

	# # _, _, h, w = bwd_flow.shape
	# # bwd_flow = pad_to_multiples_of(bwd_flow, 8)
	# # padding_ratio = w / bwd_flow.shape[3]
	# blend_mask = pad_to_multiples_of(blend_mask[None], 8)[0]
	# # bwd_confid = pad_to_multiples_of(bwd_confid[None], 8)[0]
	# flows.append(bwd_flow)
	# masks.append(blend_mask)
	# confids.append(bwd_confid)

	# if self.cldm.controller is not None:
	# self.cldm.controller.set_warp(flows, masks, flow_confids=confids)

	''' flow & occlusion mask '''
	pre_keyframe_lq = None

	if self.cldm.controller is not None and \
	self.cldm.controller.step_store["pre_keyframe_lq"] is not None:

	pre_keyframe_lq = self.cldm.controller.step_store["pre_keyframe_lq"]
	pre_keyframe_lq = torch.tensor((pre_keyframe_lq / 255.).clip(0, 1), dtype=torch.float32, device=self.device)
	pre_keyframe_lq = rearrange(pre_keyframe_lq, "n h w c -> n c h w").contiguous()
	pre_keyframe_clean = self.run_stage1(pre_keyframe_lq)[0]

	flows, masks, confids = [], [], []
	flows2, confids2 = [], []
	mid = lq.shape[0] // 2
	for k in range(lq.shape[0]):
	if k == mid:
	if pre_keyframe_lq is not None:
	tar_img = (torch.clamp(clean[mid], 0 ,1) * 255).float().to(self.device)
	src_img = (torch.clamp(pre_keyframe_clean, 0 ,1) * 255).float().to(self.device)
	else:
	flows.append(None)
	masks.append(None)
	confids.append(None)
	continue
	else:
	tar_img = (torch.clamp(clean[k], 0 ,1) * 255).float().to(self.device)
	src_img = (torch.clamp(clean[mid], 0 ,1) * 255).float().to(self.device)
	# tar_img = stage1_x[0].float().to(args.device)
	_, bwd_occ, bwd_flow, bwd_confid = get_warped_and_mask(
	flow_model, src_img, tar_img, image3=None, pixel_consistency=False, return_confidence=True)
	blend_mask = T.GaussianBlur(kernel_size=(9, 9), sigma=(18, 18))(
	F.max_pool2d(bwd_occ, kernel_size=9, stride=1, padding=4))
	blend_mask = torch.clamp(blend_mask + bwd_occ, 0, 1)
	blend_mask = 1 - F.max_pool2d(blend_mask, kernel_size=8)

	blend_mask = 1 - F.max_pool2d(bwd_occ, kernel_size=8)

	bwd_confid2 = F.max_pool2d(bwd_confid, kernel_size=16)
	bwd_flow2 = F.interpolate(bwd_flow / 16.0, scale_factor=1. / 16, mode='bilinear')


	bwd_confid = F.max_pool2d(bwd_confid, kernel_size=8)
	bwd_flow = F.interpolate(bwd_flow / 8.0, scale_factor=1. / 8, mode='bilinear')

	# _, _, h, w = bwd_flow.shape
	# bwd_flow = pad_to_multiples_of(bwd_flow, 8)
	# padding_ratio = w / bwd_flow.shape[3]
	blend_mask = pad_to_multiples_of(blend_mask[None], 8)[0]
	# bwd_confid = pad_to_multiples_of(bwd_confid[None], 8)[0]
	flows.append(bwd_flow)
	masks.append(blend_mask)
	confids.append(bwd_confid)

	flows2.append(bwd_flow2)
	confids2.append(bwd_confid2)

	if self.cldm.controller is not None:
	self.cldm.controller.set_warp(flows, masks, flow_confids=confids)
	# import ipdb; ipdb.set_trace()
	_, H, W = confids[0].shape
	self.cldm.controller.set_flow_correspondence(lq.shape[0], H, W, lq.shape[0] // 2, confids, flows)
	_, H, W = confids2[0].shape
	self.cldm.controller.set_flow_correspondence(lq.shape[0], H, W, lq.shape[0] // 2, confids2, flows2)
	for j, flow in enumerate(self.cldm.controller.step_store["flows"]):
	if flow is not None:
	self.cldm.controller.step_store["flows"][j] = pad_to_multiples_of(self.cldm.controller.step_store["flows"][j], 8)
	# self.cldm.controller.set_warp2(flows2, confids2)
	''' flow & occlusion mask ended '''


	sample = self.run_stage2(
	clean, steps, strength, tiled, tile_size, tile_stride,
	pos_prompt, neg_prompt, cfg_scale, better_start,
	index=index, input=input
	)

	if self.cldm.controller is not None:
	print(f"[INFO] clearing controller correspondence scores ... ")
	self.cldm.controller.step_store["corres_scores"] = None
	# colorfix (borrowed from StableSR, thanks for their work)
	sample = (sample + 1) / 2
	sample = wavelet_reconstruction(sample, clean)
	# resize to desired output size
	sample = F.interpolate(sample, size=self.final_size, mode="bicubic", antialias=True)
	clean = F.interpolate(clean, size=self.final_size, mode="bilinear", antialias=True)
	# tensor to image
	sample = rearrange(sample * 255., "n c h w -> n h w c")
	sample = sample.contiguous().clamp(0, 255).to(torch.uint8).cpu().numpy()
	clean = rearrange(clean * 255., "n c h w -> n h w c")
	clean = clean.contiguous().clamp(0, 255).to(torch.uint8).cpu().numpy()
	return sample, clean


	class BSRNetPipeline(Pipeline):

	def __init__(self, bsrnet: RRDBNet, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str, upscale: float) -> None:
	super().__init__(bsrnet, cldm, diffusion, cond_fn, device)
	self.upscale = upscale

	def set_final_size(self, lq: torch.Tensor) -> None:
	h, w = lq.shape[2:]
	self.final_size = (int(h * self.upscale), int(w * self.upscale))

	@count_vram_usage
	def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
	# NOTE: upscale is always set to 4 in our experiments
	if lq.shape[-2] > 1000:
	clean = []
	for i in range(lq.shape[0]):
	torch.cuda.empty_cache()
	clean.append(self.stage1_model(lq[i:i+1]))
	clean = torch.cat(clean, dim=0)
	else:
	clean = self.stage1_model(lq)
	# if self.final_size[0] < 512 and self.final_size[1] < 512:
	if min(self.final_size) < 512:
	clean = resize_short_edge_to(clean, size=512)
	else:
	clean = F.interpolate(clean, size=self.final_size, mode="bicubic", antialias=True)
	return clean


	class SwinIRPipeline(Pipeline):

	def __init__(self, swinir: SwinIR, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str) -> None:
	super().__init__(swinir, cldm, diffusion, cond_fn, device)

	@count_vram_usage
	def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
	# NOTE: lq size is always equal to 512 in our experiments
	# resize: ensure the input lq size is as least 512, since SwinIR is trained on 512 resolution
	if min(lq.shape[2:]) < 512:
	lq = resize_short_edge_to(lq, size=512)
	ori_h, ori_w = lq.shape[2:]
	# pad: ensure that height & width are multiples of 64
	pad_lq = pad_to_multiples_of(lq, multiple=64)
	# run
	clean = self.stage1_model(pad_lq)
	# remove padding
	clean = clean[:, :, :ori_h, :ori_w]
	return clean


	class SCUNetPipeline(Pipeline):

	def __init__(self, scunet: SCUNet, cldm: ControlLDM, diffusion: Diffusion, cond_fn: Optional[Guidance], device: str) -> None:
	super().__init__(scunet, cldm, diffusion, cond_fn, device)

	@count_vram_usage
	def run_stage1(self, lq: torch.Tensor) -> torch.Tensor:
	if lq.shape[-1] > 1500:
	clean = []
	batch_lq = lq.split(2, dim=0)
	for lq_ in batch_lq:
	clean.append(self.stage1_model(lq_))
	torch.cuda.empty_cache()
	clean = torch.cat(clean)
	else:
	clean = self.stage1_model(lq)
	if min(clean.shape[2:]) < 512:
	clean = resize_short_edge_to(clean, size=512)
	# import ipdb; ipdb.set_trace()
	return clean