darknoon committed on
Commit
aa212ba
1 Parent(s): 392a374

Upload from local env

Files changed (1)
  1. app.py +245 -0
app.py ADDED
@@ -0,0 +1,245 @@
+ from typing import Literal
+ import gradio as gr
+ import torch
+ import numpy as np
+ import colorsys
+
+ from diffusers import VQModel
+ from diffusers.image_processor import VaeImageProcessor
+ from diffusers.pipelines.wuerstchen.modeling_paella_vq_model import PaellaVQModel
+ from abc import ABC, abstractmethod
+ import torch.backends
+ import torch.mps
+ from PIL import Image
+
+
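+ # Pick the best available accelerator: CUDA, then Apple MPS, then CPU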
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+ elif torch.backends.mps.is_available():
+     device = torch.device("mps")
+ else:
+     device = torch.device("cpu")
+
+
+ # Abstract base class: each pipeline round-trips an image through its encoder-decoder pair
+ class ImageRoundtripPipeline(ABC):
+     @abstractmethod
+     def roundtrip_image(self, image, output_type="pil"): ...
+
+
+ class VQImageRoundtripPipeline(ImageRoundtripPipeline):
+     vqvae: VQModel
+     vae_scale_factor: int
+     vqvae_processor: VaeImageProcessor
+
+     def __init__(self):
+         self.vqvae = VQModel.from_pretrained("amused/amused-512", subfolder="vqvae")
+         self.vqvae.eval()
+         self.vqvae.to(device)
+         self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
+         self.vqvae_processor = VaeImageProcessor(
+             vae_scale_factor=self.vae_scale_factor, do_normalize=False
+         )
+         print("VQ-GAN model loaded", self.vqvae)
+
+     def roundtrip_image(self, image, output_type="pil"):
+         image = self.vqvae_processor.preprocess(image)
+         device = self.vqvae.device
+         needs_upcasting = (
+             self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast
+         )
+
+         batch_size, im_channels, height, width = image.shape
+
+         if needs_upcasting:
+             self.vqvae.float()
+
+         latents = self.vqvae.encode(
+             image.to(dtype=self.vqvae.dtype, device=device)
+         ).latents
+         latents_batch_size, latent_channels, latents_height, latents_width = (
+             latents.shape
+         )
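+         # quantize() returns (z_q, loss, (perplexity, min_encodings, min_encoding_indices));
+         # [2][2] picks out the per-position codebook indices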
+         latents = self.vqvae.quantize(latents)[2][2].reshape(
+             batch_size, latents_height, latents_width
+         )
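+         # With force_not_quantize and an explicit shape, VQModel looks the token indices
+         # up in its codebook (lookup_from_codebook) before decoding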
+         output = self.vqvae.decode(
+             latents,
+             force_not_quantize=True,
+             shape=(
+                 batch_size,
+                 height // self.vae_scale_factor,
+                 width // self.vae_scale_factor,
+                 self.vqvae.config.latent_channels,
+             ),
+         ).sample.clip(0, 1)
+         output = self.vqvae_processor.postprocess(output, output_type)
+
+         if needs_upcasting:
+             self.vqvae.half()
+
+         return output[0], latents.cpu().numpy(), self.vqvae.config.num_vq_embeddings
+
+
+ class PaellaImageRoundtripPipeline(ImageRoundtripPipeline):
+     vqgan: PaellaVQModel
+     vae_scale_factor: int
+     vqvae_processor: VaeImageProcessor
+
+     def __init__(self):
+         self.vqgan = PaellaVQModel.from_pretrained(
+             "warp-ai/wuerstchen", subfolder="vqgan"
+         )
+         self.vqgan.eval()
+         self.vqgan.to(device)
+         self.vae_scale_factor = 4
+         self.vqvae_processor = VaeImageProcessor(
+             vae_scale_factor=self.vae_scale_factor, do_normalize=False
+         )
+         print("Paella VQ-GAN model loaded", self.vqgan)
+
+     def roundtrip_image(self, image, output_type="pil"):
+         image = self.vqvae_processor.preprocess(image)
+         device = self.vqgan.device
+
+         batch_size, im_channels, height, width = image.shape
+
+         latents = self.vqgan.encode(
+             image.to(dtype=self.vqgan.dtype, device=device)
+         ).latents
+         latents_batch_size, latent_channels, latents_height, latents_width = (
+             latents.shape
+         )
+         # latents = latents * self.vqgan.config.scale_factor
+         # Manually quantize so we can inspect the token indices
+         latents_q = self.vqgan.vquantizer(latents)[2][2].reshape(
+             batch_size, latents_height, latents_width
+         )
+         print("latents after quantize", (latents_q.shape, latents_q.dtype))
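+         # Note: decode() below is fed the continuous latents; latents_q is only used for the token visualization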
+         images = self.vqgan.decode(latents).sample.clamp(0, 1)
+         output = self.vqvae_processor.postprocess(images, output_type)
+
+         # if needs_upcasting:
+         #     self.vqgan.half()
+
+         return output[0], latents_q.cpu().numpy(), self.vqgan.config.num_vq_embeddings
+
+
+ pipeline_paella = PaellaImageRoundtripPipeline()
+ pipeline_vq = VQImageRoundtripPipeline()
+
+
+ # Generate a visually distinct RGB color for each codebook entry
+ def generate_unique_colors_hsl(n):
+     colors = []
+     for i in range(n):
+         hue = (i * 4 / n) % 1.0  # Cycle the hue wheel 4 times across the codebook
+         lightness = 0.8 - (i / n) * 0.6  # Decrease lightness from 0.8 to 0.2
+         saturation = 1.0
+         rgb = colorsys.hls_to_rgb(hue, lightness, saturation)
+         rgb = tuple(int(255 * x) for x in rgb)
+         colors.append(rgb)
+     return colors
+
+
+ # Function to create the image from VQGAN tokens
+ def vqgan_tokens_to_image(tokens, codebook_size, downscale_factor):
+     # Generate unique colors for each token in the codebook
+     colors = generate_unique_colors_hsl(codebook_size)
+
+     # Create a lookup table
+     lookup_table = np.array(colors, dtype=np.uint8)
+
+     # Extract the token array (remove the batch dimension)
+     token_array = tokens[0]
+
+     # Map tokens to their RGB colors using the lookup table
+     color_image = lookup_table[token_array]
+
+     # Create a PIL image from the numpy array
+     img = Image.fromarray(color_image, "RGB")
+
+     # Upscale the image using nearest neighbor interpolation
+     img = img.resize(
+         (
+             color_image.shape[1] * downscale_factor,
+             color_image.shape[0] * downscale_factor,
+         ),
+         Image.NEAREST,
+     )
+
+     return img
+
+
+ # This is a Gradio space that lets you round-trip an image through various encoder-decoder pairs, e.g. a VQ-GAN or SDXL's VAE, and check the image quality
+
+
+ # def image_grid_to_string(image_grid):
+ #     """Convert a latent vq index "image" grid to a string, input shape is (1, height, width)"""
+ #     return "\n".join(
+ #         [" ".join([str(int(x)) for x in row]) for row in image_grid.squeeze()]
+ #     )
+
+
+ def describe_shape(shape):
+     return f"Shape: {shape}, num elements: {np.prod(shape)}"
+
+
+ # @spaces.GPU
+ @torch.no_grad()
+ def roundtrip_image(
+     image,
+     model: Literal["vqgan", "paella"],
+     size: Literal["256x256", "512x512", "1024x1024"],
+     output_type="pil",
+ ):
+     if size == "256x256":
+         image = image.resize((256, 256))
+     elif size == "512x512":
+         image = image.resize((512, 512))
+     elif size == "1024x1024":
+         image = image.resize((1024, 1024))
+     else:
+         raise ValueError(f"Unknown size {size}")
+
+     if model == "vqgan":
+         image, latents, codebook_size = pipeline_vq.roundtrip_image(image, output_type)
+         return (
+             image,
+             vqgan_tokens_to_image(
+                 latents, codebook_size, downscale_factor=pipeline_vq.vae_scale_factor
+             ),
+             describe_shape(latents.shape),
+         )
+     elif model == "paella":
+         image, latents, codebook_size = pipeline_paella.roundtrip_image(
+             image, output_type
+         )
+         return (
+             image,
+             vqgan_tokens_to_image(
+                 latents, codebook_size, downscale_factor=pipeline_paella.vae_scale_factor
+             ),
+             describe_shape(latents.shape),
+         )
+     else:
+         raise ValueError(f"Unknown model {model}")
+
+
+ demo = gr.Interface(
+     fn=roundtrip_image,
+     inputs=[
+         gr.Image(type="pil"),
+         gr.Dropdown(["vqgan", "paella"], label="Model", value="vqgan"),
+         gr.Dropdown(["256x256", "512x512", "1024x1024"], label="Size", value="512x512"),
+     ],
+     outputs=[
+         gr.Image(label="Reconstructed"),
+         gr.Image(label="Tokens"),
+         gr.Text(label="VQ Shape"),
+     ],
+     title="Image Tokenizer Playground",
+     description="Round-trip an image through an encoder-decoder pair (e.g. a VQ-GAN) to see how much quality the image tokenizer loses for image generation.",
+ )
+
+ demo.launch()