Spaces:

stepfun-ai
/

Step1X-3D

Running on Zero

Step1X-3D / step1x3d_texture /systems /ig2mv_sdxl.py

2ac1c2d 3 days ago

18.8 kB

	import os
	import random
	from dataclasses import dataclass, field

	import torch
	import torch.nn.functional as F
	from diffusers import DDPMScheduler, UNet2DConditionModel
	from diffusers.models import AutoencoderKL
	from diffusers.training_utils import compute_snr
	from einops import rearrange
	from omegaconf import OmegaConf
	from PIL import Image

	from ..pipelines.ig2mv_sdxl_pipeline import IG2MVSDXLPipeline
	from ..schedulers.scheduling_shift_snr import ShiftSNRScheduler
	from ..utils.core import find
	from ..utils.typing import *
	from .base import BaseSystem
	from .utils import encode_prompt, vae_encode


	def compute_embeddings(
	prompt_batch,
	empty_prompt_indices,
	text_encoders,
	tokenizers,
	is_train=True,
	**kwargs,
	):
	original_size = kwargs["original_size"]
	target_size = kwargs["target_size"]
	crops_coords_top_left = kwargs["crops_coords_top_left"]

	for i in range(empty_prompt_indices.shape[0]):
	if empty_prompt_indices[i]:
	prompt_batch[i] = ""

	prompt_embeds, pooled_prompt_embeds = encode_prompt(
	prompt_batch, text_encoders, tokenizers, 0, is_train
	)
	add_text_embeds = pooled_prompt_embeds.to(
	device=prompt_embeds.device, dtype=prompt_embeds.dtype
	)

	# Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
	add_time_ids = list(original_size + crops_coords_top_left + target_size)
	add_time_ids = torch.tensor([add_time_ids])
	add_time_ids = add_time_ids.repeat(len(prompt_batch), 1)
	add_time_ids = add_time_ids.to(
	device=prompt_embeds.device, dtype=prompt_embeds.dtype
	)

	unet_added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}

	return {"prompt_embeds": prompt_embeds, **unet_added_cond_kwargs}


	class IG2MVSDXLSystem(BaseSystem):
	@dataclass
	class Config(BaseSystem.Config):

	# Model / Adapter
	pretrained_model_name_or_path: str = "stabilityai/stable-diffusion-xl-base-1.0"
	pretrained_vae_name_or_path: Optional[str] = "madebyollin/sdxl-vae-fp16-fix"
	pretrained_adapter_name_or_path: Optional[str] = None
	pretrained_unet_name_or_path: Optional[str] = None
	init_adapter_kwargs: Dict[str, Any] = field(default_factory=dict)

	use_fp16_vae: bool = True
	use_fp16_clip: bool = True

	# Training
	trainable_modules: List[str] = field(default_factory=list)
	train_cond_encoder: bool = True
	prompt_drop_prob: float = 0.0
	image_drop_prob: float = 0.0
	cond_drop_prob: float = 0.0

	gradient_checkpointing: bool = False

	# Noise sampler
	noise_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)
	noise_offset: float = 0.0
	input_perturbation: float = 0.0
	snr_gamma: Optional[float] = 5.0
	prediction_type: Optional[str] = None
	shift_noise: bool = False
	shift_noise_mode: str = "interpolated"
	shift_noise_scale: float = 1.0

	# Evaluation
	eval_seed: int = 0
	eval_num_inference_steps: int = 30
	eval_guidance_scale: float = 1.0
	eval_height: int = 512
	eval_width: int = 512

	cfg: Config

	def configure(self):
	super().configure()

	# Prepare pipeline
	pipeline_kwargs = {}
	if self.cfg.pretrained_vae_name_or_path is not None:
	pipeline_kwargs["vae"] = AutoencoderKL.from_pretrained(
	self.cfg.pretrained_vae_name_or_path
	)
	if self.cfg.pretrained_unet_name_or_path is not None:
	pipeline_kwargs["unet"] = UNet2DConditionModel.from_pretrained(
	self.cfg.pretrained_unet_name_or_path
	)

	pipeline: IG2MVSDXLPipeline
	pipeline = IG2MVSDXLPipeline.from_pretrained(
	self.cfg.pretrained_model_name_or_path, **pipeline_kwargs
	)

	init_adapter_kwargs = OmegaConf.to_container(self.cfg.init_adapter_kwargs)
	if "self_attn_processor" in init_adapter_kwargs:
	self_attn_processor = init_adapter_kwargs["self_attn_processor"]
	if self_attn_processor is not None and isinstance(self_attn_processor, str):
	self_attn_processor = find(self_attn_processor)
	init_adapter_kwargs["self_attn_processor"] = self_attn_processor
	pipeline.init_custom_adapter(**init_adapter_kwargs)

	if self.cfg.pretrained_adapter_name_or_path:
	pretrained_path = os.path.dirname(self.cfg.pretrained_adapter_name_or_path)
	adapter_name = os.path.basename(self.cfg.pretrained_adapter_name_or_path)
	pipeline.load_custom_adapter(pretrained_path, weight_name=adapter_name)

	noise_scheduler = DDPMScheduler.from_config(
	pipeline.scheduler.config, **self.cfg.noise_scheduler_kwargs
	)
	if self.cfg.shift_noise:
	noise_scheduler = ShiftSNRScheduler.from_scheduler(
	noise_scheduler,
	shift_mode=self.cfg.shift_noise_mode,
	shift_scale=self.cfg.shift_noise_scale,
	scheduler_class=DDPMScheduler,
	)
	pipeline.scheduler = noise_scheduler

	# Prepare models
	self.pipeline: IG2MVSDXLPipeline = pipeline
	self.vae = self.pipeline.vae.to(
	dtype=torch.float16 if self.cfg.use_fp16_vae else torch.float32
	)
	self.tokenizer = self.pipeline.tokenizer
	self.tokenizer_2 = self.pipeline.tokenizer_2
	self.text_encoder = self.pipeline.text_encoder.to(
	dtype=torch.float16 if self.cfg.use_fp16_clip else torch.float32
	)
	self.text_encoder_2 = self.pipeline.text_encoder_2.to(
	dtype=torch.float16 if self.cfg.use_fp16_clip else torch.float32
	)
	self.feature_extractor = self.pipeline.feature_extractor

	self.cond_encoder = self.pipeline.cond_encoder
	self.unet = self.pipeline.unet
	self.noise_scheduler = self.pipeline.scheduler
	self.inference_scheduler = DDPMScheduler.from_config(
	self.noise_scheduler.config
	)
	self.pipeline.scheduler = self.inference_scheduler
	if self.cfg.prediction_type is not None:
	self.noise_scheduler.register_to_config(
	prediction_type=self.cfg.prediction_type
	)

	# Prepare trainable / non-trainable modules
	trainable_modules = self.cfg.trainable_modules
	if trainable_modules and len(trainable_modules) > 0:
	self.unet.requires_grad_(False)
	for name, module in self.unet.named_modules():
	for trainable_module in trainable_modules:
	if trainable_module in name:
	module.requires_grad_(True)
	else:
	self.unet.requires_grad_(True)
	self.cond_encoder.requires_grad_(self.cfg.train_cond_encoder)

	self.vae.requires_grad_(False)
	self.text_encoder.requires_grad_(False)
	self.text_encoder_2.requires_grad_(False)

	# Others
	# Prepare gradient checkpointing
	if self.cfg.gradient_checkpointing:
	self.unet.enable_gradient_checkpointing()

	def forward(
	self,
	noisy_latents: Tensor,
	conditioning_pixel_values: Tensor,
	timesteps: Tensor,
	ref_latents: Tensor,
	prompts: List[str],
	num_views: int,
	**kwargs,
	) -> Dict[str, Any]:
	bsz = noisy_latents.shape[0]
	b_samples = bsz // num_views
	num_batch_images = num_views

	prompt_drop_mask = (
	torch.rand(b_samples, device=noisy_latents.device)
	< self.cfg.prompt_drop_prob
	)
	image_drop_mask = (
	torch.rand(b_samples, device=noisy_latents.device)
	< self.cfg.image_drop_prob
	)
	cond_drop_mask = (
	torch.rand(b_samples, device=noisy_latents.device) < self.cfg.cond_drop_prob
	)
	prompt_drop_mask = prompt_drop_mask \| cond_drop_mask
	image_drop_mask = image_drop_mask \| cond_drop_mask

	with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
	# Here, we compute not just the text embeddings but also the additional embeddings
	# needed for the SD XL UNet to operate.
	additional_embeds = compute_embeddings(
	prompts,
	prompt_drop_mask,
	[self.text_encoder, self.text_encoder_2],
	[self.tokenizer, self.tokenizer_2],
	**kwargs,
	)

	# Process reference latents to obtain reference features
	with torch.no_grad():
	ref_timesteps = torch.zeros_like(timesteps[:b_samples])
	ref_hidden_states = {}
	self.unet(
	ref_latents,
	ref_timesteps,
	encoder_hidden_states=additional_embeds["prompt_embeds"],
	added_cond_kwargs={
	"text_embeds": additional_embeds["text_embeds"],
	"time_ids": additional_embeds["time_ids"],
	},
	cross_attention_kwargs={
	"cache_hidden_states": ref_hidden_states,
	"use_mv": False,
	"use_ref": False,
	},
	return_dict=False,
	)
	for k, v in ref_hidden_states.items():
	v_ = v
	v_[image_drop_mask] = 0.0
	ref_hidden_states[k] = v_.repeat_interleave(num_batch_images, dim=0)

	# Repeat additional embeddings for each image in the batch
	for key, value in additional_embeds.items():
	kwargs[key] = value.repeat_interleave(num_batch_images, dim=0)

	conditioning_features = self.cond_encoder(conditioning_pixel_values)

	added_cond_kwargs = {
	"text_embeds": kwargs["text_embeds"],
	"time_ids": kwargs["time_ids"],
	}

	noise_pred = self.unet(
	noisy_latents,
	timesteps,
	encoder_hidden_states=kwargs["prompt_embeds"],
	added_cond_kwargs=added_cond_kwargs,
	down_intrablock_additional_residuals=conditioning_features,
	cross_attention_kwargs={
	"ref_hidden_states": ref_hidden_states,
	"num_views": num_views,
	},
	).sample

	return {"noise_pred": noise_pred}

	def training_step(self, batch, batch_idx):
	num_views = batch["num_views"]

	vae_max_slice = 8
	with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
	latents = []
	for i in range(0, batch["rgb"].shape[0], vae_max_slice):
	latents.append(
	vae_encode(
	self.vae,
	batch["rgb"][i : i + vae_max_slice].to(self.vae.dtype) * 2 - 1,
	sample=True,
	apply_scale=True,
	).float()
	)
	latents = torch.cat(latents, dim=0)

	with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
	ref_latents = vae_encode(
	self.vae,
	batch["reference_rgb"].to(self.vae.dtype) * 2 - 1,
	sample=True,
	apply_scale=True,
	).float()

	bsz = latents.shape[0]
	b_samples = bsz // num_views

	noise = torch.randn_like(latents)
	if self.cfg.noise_offset is not None:
	# # https://www.crosslabs.org//blog/diffusion-with-offset-noise
	noise += self.cfg.noise_offset * torch.randn(
	(latents.shape[0], latents.shape[1], 1, 1), device=latents.device
	)

	noise_mask = (
	batch["noise_mask"]
	if "noise_mask" in batch
	else torch.ones((bsz,), dtype=torch.bool, device=latents.device)
	)
	timesteps = torch.randint(
	0,
	self.noise_scheduler.config.num_train_timesteps,
	(b_samples,),
	device=latents.device,
	dtype=torch.long,
	)
	timesteps = timesteps.repeat_interleave(num_views)
	timesteps[~noise_mask] = 0

	if self.cfg.input_perturbation is not None:
	new_noise = noise + self.cfg.input_perturbation * torch.randn_like(noise)
	noisy_latents = self.noise_scheduler.add_noise(
	latents, new_noise, timesteps
	)
	else:
	noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)

	noisy_latents[~noise_mask] = latents[~noise_mask]

	if self.noise_scheduler.config.prediction_type == "epsilon":
	target = noise
	elif self.noise_scheduler.config.prediction_type == "v_prediction":
	target = self.noise_scheduler.get_velocity(latents, noise, timesteps)
	else:
	raise ValueError(
	f"Unsupported prediction type {self.noise_scheduler.config.prediction_type}"
	)

	model_pred = self(
	noisy_latents, batch["source_rgb"], timesteps, ref_latents, **batch
	)["noise_pred"]

	model_pred = model_pred[noise_mask]
	target = target[noise_mask]

	if self.cfg.snr_gamma is None:
	loss = F.mse_loss(model_pred, target, reduction="mean")
	else:
	# Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
	# Since we predict the noise instead of x_0, the original formulation is slightly changed.
	# This is discussed in Section 4.2 of the same paper.
	snr = compute_snr(self.noise_scheduler, timesteps)
	if self.noise_scheduler.config.prediction_type == "v_prediction":
	# Velocity objective requires that we add one to SNR values before we divide by them.
	snr = snr + 1
	mse_loss_weights = (
	torch.stack(
	[snr, self.cfg.snr_gamma * torch.ones_like(timesteps)], dim=1
	).min(dim=1)[0]
	/ snr
	)

	loss = F.mse_loss(model_pred, target, reduction="none")
	loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
	loss = loss.mean()

	self.log("train/loss", loss, prog_bar=True)

	# will execute self.on_check_train every self.cfg.check_train_every_n_steps steps
	self.check_train(batch)

	return {"loss": loss}

	def on_train_batch_end(self, outputs, batch, batch_idx):
	pass

	def get_input_visualizations(self, batch):
	return [
	{
	"type": "rgb",
	"img": rearrange(
	batch["source_rgb"],
	"(B N) C H W -> (B H) (N W) C",
	N=batch["num_views"],
	),
	"kwargs": {"data_format": "HWC"},
	},
	{
	"type": "rgb",
	"img": rearrange(batch["reference_rgb"], "B C H W -> (B H) W C"),
	"kwargs": {"data_format": "HWC"},
	},
	{
	"type": "rgb",
	"img": rearrange(
	batch["rgb"], "(B N) C H W -> (B H) (N W) C", N=batch["num_views"]
	),
	"kwargs": {"data_format": "HWC"},
	},
	]

	def get_output_visualizations(self, batch, outputs):
	images = [
	{
	"type": "rgb",
	"img": rearrange(
	batch["source_rgb"],
	"(B N) C H W -> (B H) (N W) C",
	N=batch["num_views"],
	),
	"kwargs": {"data_format": "HWC"},
	},
	{
	"type": "rgb",
	"img": rearrange(
	batch["rgb"], "(B N) C H W -> (B H) (N W) C", N=batch["num_views"]
	),
	"kwargs": {"data_format": "HWC"},
	},
	{
	"type": "rgb",
	"img": rearrange(batch["reference_rgb"], "B C H W -> (B H) W C"),
	"kwargs": {"data_format": "HWC"},
	},
	{
	"type": "rgb",
	"img": rearrange(
	outputs, "(B N) C H W -> (B H) (N W) C", N=batch["num_views"]
	),
	"kwargs": {"data_format": "HWC"},
	},
	]
	return images

	def generate_images(self, batch, **kwargs):
	return self.pipeline(
	prompt=batch["prompts"],
	control_image=batch["source_rgb"],
	num_images_per_prompt=batch["num_views"],
	generator=torch.Generator(device=self.device).manual_seed(
	self.cfg.eval_seed
	),
	num_inference_steps=self.cfg.eval_num_inference_steps,
	guidance_scale=self.cfg.eval_guidance_scale,
	height=self.cfg.eval_height,
	width=self.cfg.eval_width,
	reference_image=batch["reference_rgb"],
	output_type="pt",
	).images

	def on_save_checkpoint(self, checkpoint):
	if self.global_rank == 0:
	self.pipeline.save_custom_adapter(
	os.path.dirname(self.get_save_dir()),
	"step1x-3d-ig2v.safetensors",
	safe_serialization=True,
	include_keys=self.cfg.trainable_modules,
	)

	def on_check_train(self, batch):
	self.save_image_grid(
	f"it{self.true_global_step}-train.jpg",
	self.get_input_visualizations(batch),
	name="train_step_input",
	step=self.true_global_step,
	)

	def validation_step(self, batch, batch_idx):
	out = self.generate_images(batch)

	if (
	self.cfg.check_val_limit_rank > 0
	and self.global_rank < self.cfg.check_val_limit_rank
	):
	self.save_image_grid(
	f"it{self.true_global_step}-validation-{self.global_rank}_{batch_idx}.jpg",
	self.get_output_visualizations(batch, out),
	name=f"validation_step_output_{self.global_rank}_{batch_idx}",
	step=self.true_global_step,
	)

	def on_validation_epoch_end(self):
	pass

	def test_step(self, batch, batch_idx):
	out = self.generate_images(batch)

	self.save_image_grid(
	f"it{self.true_global_step}-test-{self.global_rank}_{batch_idx}.jpg",
	self.get_output_visualizations(batch, out),
	name=f"test_step_output_{self.global_rank}_{batch_idx}",
	step=self.true_global_step,
	)

	def on_test_end(self):
	pass