Jingya
/

torch-neuron-test-samples

Model card Files Files and versions

torch-neuron-test-samples / torch_compile /flux /test_vae_decoder.py

Ubuntu

tests

5ee43e9 about 2 months ago

history blame contribute delete

3.11 kB

	#!/usr/bin/env python3
	"""
	Flux VAE decoder (16-ch latent → RGB image) on Neuron.
	Checkpoint: black-forest-labs/FLUX.1-dev/vae
	"""
	import argparse
	import logging
	import time
	from pathlib import Path

	import torch
	from diffusers import AutoencoderKL
	import torch_neuronx # noqa: F401 guarantees Neuron backend
	from PIL import Image

	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)


	def main() -> None:
	    """Decode a random latent with the Flux VAE via ``torch.compile`` on Neuron.

	    Steps: parse the CLI options, load the VAE from the ``vae`` subfolder of
	    ``--model``, run one eager decode, compile ``vae.decode`` with the
	    ``neuron`` backend, time a warmup call (which includes compilation) and a
	    second call, then save the decoded tensor as an image at ``--output``.
	    """
	    parser = argparse.ArgumentParser(
	        description="Flux VAE decoder (latent → image) with torch.compile on Neuron"
	    )
	    parser.add_argument(
	        "--model",
	        type=str,
	        # The 'vae' subfolder is appended by from_pretrained below, so pass the
	        # checkpoint root here (e.g. a Hub repo id such as
	        # black-forest-labs/FLUX.1-dev), not the vae directory itself.
	        default="/workspace/flux_weight/",
	        help=(
	            "Flux checkpoint: local directory or Hugging Face Hub repo id "
	            "(weights are read from its 'vae' subfolder)"
	        ),
	    )
	    parser.add_argument("--latent-ch", type=int, default=16, help="Latent channels (Flux=16)")
	    parser.add_argument("--scale", type=int, default=32, help="Latent spatial size (256 px / 8)")
	    parser.add_argument("--output", type=str, default="flux_vae_out.png", help="Output image path")
	    args = parser.parse_args()

	    torch.set_default_dtype(torch.float32)
	    torch.manual_seed(42)

	    # Load the Flux VAE in float32 and switch it to eval mode.
	    vae = AutoencoderKL.from_pretrained(args.model, subfolder="vae", torch_dtype=torch.float32).eval()

	    # Dummy latent drawn from N(0, 1), float32, shape [1, latent_ch, scale, scale].
	    latent = torch.randn(1, args.latent_ch, args.scale, args.scale, dtype=torch.float32)

	    # Run once eagerly before compiling (the original intent was to fix the
	    # shapes ahead of compilation).
	    with torch.no_grad():
	        _ = vae.decode(latent).sample

	    # fullgraph=True forbids graph breaks: torch.compile raises instead of
	    # splitting the decoder into several graphs.
	    decode_fn = torch.compile(vae.decode, backend="neuron", fullgraph=True)

	    # Warmup: the first call triggers compilation, so it is timed separately.
	    # perf_counter() is monotonic, unlike time.time(), so long measurements
	    # are not distorted by system clock adjustments.
	    warmup_start = time.perf_counter()
	    with torch.no_grad():
	        _ = decode_fn(latent)
	    warmup_time = time.perf_counter() - warmup_start

	    # Actual run with the already-compiled function.
	    run_start = time.perf_counter()
	    with torch.no_grad():
	        image = decode_fn(latent).sample
	    run_time = time.perf_counter() - run_start

	    logger.info("Warmup: %.2f s, Run: %.4f s", warmup_time, run_time)
	    logger.info("VAE output shape: %s", image.shape)  # [1, 3, H, W]

	    # Map the decoder output from [-1, 1] to [0, 1], then to uint8 HWC pixels.
	    image = (image / 2 + 0.5).clamp(0, 1)
	    pixels = image[0].permute(1, 2, 0).cpu().float().numpy()
	    # Round before casting: a bare astype("uint8") truncates and biases every
	    # pixel downward (e.g. 254.7 would become 254 instead of 255).
	    pixels = (pixels * 255).round().astype("uint8")

	    # Create the destination directory so a missing folder cannot discard the
	    # result of a very long compilation.
	    out_path = Path(args.output)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    Image.fromarray(pixels).save(out_path)
	    logger.info("Saved decoded image to %s", out_path.resolve())


	# Run the benchmark only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()

	"""
	Sample output from one run. The compilation process took more than 2 hours.
	/usr/local/lib/python3.10/site-packages/torch_mlir/dialects/stablehlo/__init__.py:24: UserWarning: Could not import StableHLO C++ extension: libStablehloUnifiedPythonCAPI.so.22.0git: cannot open shared object file: No such file or directory
	warnings.warn(f"Could not import StableHLO C++ extension: {e}")
	INFO:__main__:Warmup: 4010.52 s, Run: 22.5420 s
	INFO:__main__:VAE output shape: torch.Size([1, 3, 256, 256])
	INFO:__main__:Saved decoded image to /workspace/torch_neuron_samples/torch-neuron-samples/scripts/torch_compile/flux/flux_vae_out.png
	"""