make the thing work
- app.py +146 -0
- examples/img-01.png +3 -0
- examples/img-02.png +3 -0
- examples/img-03.jpg +0 -0
- tagger/__init__.py +0 -0
- tagger/common.py +180 -0
- tagger/model.py +208 -0
app.py
ADDED
@@ -0,0 +1,146 @@
from os import getenv
from pathlib import Path

import gradio as gr
from PIL import Image
from rich.traceback import install as traceback_install

from tagger.common import Heatmap, ImageLabels, LabelData, load_labels_hf, preprocess_image
from tagger.model import load_model_and_transform, process_heatmap

TITLE = "WD Tagger Heatmap"
DESCRIPTION = """WD Tagger v3 Heatmap Generator."""
# get HF token
HF_TOKEN = getenv("HF_TOKEN", None)

# model repo and cache
MODEL_REPO = "SmilingWolf/wd-vit-tagger-v3"
# get the repo root (or the current working directory if running in ipython)
WORK_DIR = Path(__file__).parent.resolve() if "__file__" in globals() else Path().resolve()
# allowed extensions
IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"]

_ = traceback_install(show_locals=True, locals_max_length=0)

# get the example images
example_images = sorted(
    [
        str(x.relative_to(WORK_DIR))
        for x in WORK_DIR.joinpath("examples").iterdir()
        if x.is_file() and x.suffix.lower() in IMAGE_EXTENSIONS
    ]
)


def predict(
    image: Image.Image,
    threshold: float = 0.5,
):
    # load model and transform (cached per repo id)
    model, transform = load_model_and_transform(MODEL_REPO)
    # load labels
    labels: LabelData = load_labels_hf(MODEL_REPO)
    # preprocess image
    image = preprocess_image(image, (448, 448))
    image = transform(image).unsqueeze(0)

    # get the model output
    heatmaps: list[Heatmap]
    image_labels: ImageLabels
    heatmaps, heatmap_grid, image_labels = process_heatmap(model, image, labels, threshold)

    heatmap_images = [(x.image, x.label) for x in heatmaps]

    return (
        heatmap_images,
        heatmap_grid,
        image_labels.caption,
        image_labels.booru,
        image_labels.rating,
        image_labels.character,
        image_labels.general,
    )


css = """
#use_mcut, #char_mcut {
    padding-top: var(--scale-3);
}
#threshold.dimmed {
    filter: brightness(75%);
}
"""

with gr.Blocks(theme="NoCrypt/miku", analytics_enabled=False, title=TITLE, css=css) as demo:
    with gr.Row(equal_height=False):
        with gr.Column(min_width=720):
            with gr.Group():
                img_input = gr.Image(
                    label="Input",
                    type="pil",
                    image_mode="RGB",
                    sources=["upload", "clipboard"],
                )
            with gr.Group():
                with gr.Row():
                    threshold = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.35,
                        step=0.01,
                        label="Tag Threshold",
                        scale=5,
                        elem_id="threshold",
                    )
                with gr.Row():
                    clear = gr.ClearButton(
                        components=[],
                        variant="secondary",
                        size="lg",
                    )
                    submit = gr.Button(value="Submit", variant="primary", size="lg")

        with gr.Column(min_width=720):
            with gr.Tab(label="Heatmaps"):
                heatmap_gallery = gr.Gallery(columns=3, show_label=False)
            with gr.Tab(label="Grid"):
                heatmap_grid = gr.Image(show_label=False)
            with gr.Tab(label="Tags"):
                with gr.Group():
                    rating = gr.Label(label="Rating")
                with gr.Group():
                    character = gr.Label(label="Character")
                with gr.Group():
                    general = gr.Label(label="General")

            with gr.Group():
                caption = gr.Textbox(label="Caption", show_copy_button=True)
                tags = gr.Textbox(label="Tags", show_copy_button=True)

    with gr.Row():
        examples = [[imgpath, 0.35] for imgpath in example_images]
        examples = gr.Examples(
            examples=examples,
            inputs=[img_input, threshold],
        )

    # tell clear button which components to clear
    clear.add([img_input, heatmap_gallery, heatmap_grid, caption, tags, rating, character, general])

    submit.click(
        predict,
        inputs=[img_input, threshold],
        outputs=[heatmap_gallery, heatmap_grid, caption, tags, rating, character, general],
        api_name="predict",
    )

if __name__ == "__main__":
    demo.queue(max_size=10)
    if getenv("SPACE_ID", None) is not None:
        demo.launch()
    else:
        demo.launch(
            server_name="0.0.0.0",
            server_port=7871,
            debug=True,
        )
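For quick testing, a minimal sketch of calling the `predict` endpoint registered above (api_name="predict") from Python. The local URL and port mirror the `__main__` launch block; the example image path and the use of `gradio_client.handle_file` are assumptions (older gradio_client releases accept a plain file path instead).

# Hypothetical client-side usage; assumes the app is running locally on port 7871.
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7871/")
result = client.predict(
    handle_file("examples/img-01.png"),  # image input (placeholder path)
    0.35,                                # tag threshold
    api_name="/predict",
)
# result is a tuple ordered like predict()'s return value:
# (heatmap gallery, heatmap grid, caption, booru tags, rating, character, general)
caption = result[2]
print(caption)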
examples/img-01.png
ADDED
Git LFS Details
examples/img-02.png
ADDED
Git LFS Details
examples/img-03.jpg
ADDED
tagger/__init__.py
ADDED
File without changes
tagger/common.py
ADDED
@@ -0,0 +1,180 @@
import math
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import torch
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import HfHubHTTPError
from PIL import Image
from torch import Tensor, nn


@dataclass
class Heatmap:
    label: str
    score: float
    image: Image.Image


@dataclass
class LabelData:
    names: list[str]
    rating: list[np.int64]
    general: list[np.int64]
    character: list[np.int64]


@dataclass
class ImageLabels:
    caption: str
    booru: str
    rating: dict[str, float]
    general: dict[str, float]
    character: dict[str, float]


@lru_cache(maxsize=5)
def load_labels_hf(
    repo_id: str,
    revision: Optional[str] = None,
    token: Optional[str] = None,
) -> LabelData:
    try:
        csv_path = hf_hub_download(
            repo_id=repo_id, filename="selected_tags.csv", revision=revision, token=token
        )
        csv_path = Path(csv_path).resolve()
    except HfHubHTTPError as e:
        raise FileNotFoundError(f"selected_tags.csv failed to download from {repo_id}") from e

    df: pd.DataFrame = pd.read_csv(csv_path, usecols=["name", "category"])
    tag_data = LabelData(
        names=df["name"].tolist(),
        rating=list(np.where(df["category"] == 9)[0]),
        general=list(np.where(df["category"] == 0)[0]),
        character=list(np.where(df["category"] == 4)[0]),
    )

    return tag_data


def mcut_threshold(probs: np.ndarray) -> float:
    """
    Maximum Cut Thresholding (MCut)
    Largeron, C., Moulin, C., & Gery, M. (2012). MCut: A Thresholding Strategy
    for Multi-label Classification. In 11th International Symposium, IDA 2012
    (pp. 172-183).
    """
    probs = probs[probs.argsort()[::-1]]
    diffs = probs[:-1] - probs[1:]
    idx = diffs.argmax()
    thresh = (probs[idx] + probs[idx + 1]) / 2
    return float(thresh)


def pil_ensure_rgb(image: Image.Image) -> Image.Image:
    # convert to RGB/RGBA if not already (deals with palette images etc.)
    if image.mode not in ["RGB", "RGBA"]:
        image = image.convert("RGBA") if "transparency" in image.info else image.convert("RGB")
    # convert RGBA to RGB with white background
    if image.mode == "RGBA":
        canvas = Image.new("RGBA", image.size, (255, 255, 255))
        canvas.alpha_composite(image)
        image = canvas.convert("RGB")
    return image


def pil_pad_square(
    image: Image.Image,
    fill: tuple[int, int, int] = (255, 255, 255),
) -> Image.Image:
    w, h = image.size
    # get the largest dimension so we can pad to a square
    px = max(image.size)
    # pad to square with white background
    canvas = Image.new("RGB", (px, px), fill)
    canvas.paste(image, ((px - w) // 2, (px - h) // 2))
    return canvas


def preprocess_image(
    image: Image.Image,
    size_px: int | tuple[int, int],
    upscale: bool = True,
) -> Image.Image:
    """
    Preprocess an image to be square and centered on a white background.
    """
    if isinstance(size_px, int):
        size_px = (size_px, size_px)

    # ensure RGB and pad to square
    image = pil_ensure_rgb(image)
    image = pil_pad_square(image)

    # resize to target size
    if image.size[0] < size_px[0] or image.size[1] < size_px[1]:
        if upscale is False:
            raise ValueError("Image is smaller than target size, and upscaling is disabled")
        image = image.resize(size_px, Image.LANCZOS)
    if image.size[0] > size_px[0] or image.size[1] > size_px[1]:
        image.thumbnail(size_px, Image.BICUBIC)

    return image


def pil_make_grid(
    images: list[Image.Image],
    max_cols: int = 8,
    padding: int = 4,
    bg_color: tuple[int, int, int] = (40, 42, 54),  # dracula background color
    partial_rows: bool = True,
) -> Image.Image:
    n_cols = min(math.floor(math.sqrt(len(images))), max_cols)
    n_rows = math.ceil(len(images) / n_cols)

    # if the final row is not full and partial_rows is False, remove a row
    if n_cols * n_rows > len(images) and not partial_rows:
        n_rows -= 1

    # assumes all images are same size
    image_width, image_height = images[0].size

    canvas_width = ((image_width + padding) * n_cols) + padding
    canvas_height = ((image_height + padding) * n_rows) + padding

    canvas = Image.new("RGB", (canvas_width, canvas_height), bg_color)
    for i, img in enumerate(images):
        x = (i % n_cols) * (image_width + padding) + padding
        y = (i // n_cols) * (image_height + padding) + padding
        canvas.paste(img, (x, y))

    return canvas


# https://github.com/toriato/stable-diffusion-webui-wd14-tagger/blob/a9eacb1eff904552d3012babfa28b57e1d3e295c/tagger/ui.py#L368
kaomojis = [
    "0_0",
    "(o)_(o)",
    "+_+",
    "+_-",
    "._.",
    "<o>_<o>",
    "<|>_<|>",
    "=_=",
    ">_<",
    "3_3",
    "6_9",
    ">_o",
    "@_@",
    "^_^",
    "o_o",
    "u_u",
    "x_x",
    "|_|",
    "||_||",
]
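As a reference for these helpers, a small standalone sketch follows; the example image path is a placeholder and the probability values are made up purely to illustrate how mcut_threshold picks a cutoff at the largest gap between sorted scores.

# Hypothetical standalone usage of the helpers above; "examples/img-01.png" is a placeholder.
import numpy as np
from PIL import Image

from tagger.common import load_labels_hf, mcut_threshold, preprocess_image

labels = load_labels_hf("SmilingWolf/wd-vit-tagger-v3")
print(len(labels.names), "tag names,", len(labels.rating), "rating indices")

img = Image.open("examples/img-01.png")
img = preprocess_image(img, 448)  # pad to square on white, resize to 448x448
print(img.size)  # (448, 448)

# mcut_threshold returns the midpoint of the largest drop between sorted probabilities
fake_probs = np.array([0.92, 0.88, 0.41, 0.12, 0.05])
print(mcut_threshold(fake_probs))  # 0.645, halfway between 0.88 and 0.41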
tagger/model.py
ADDED
@@ -0,0 +1,208 @@
import json
import math
from dataclasses import dataclass, field
from os import PathLike, cpu_count
from pathlib import Path
from typing import Any, Optional, TypeAlias

import colorcet as cc
import cv2
import numpy as np
import pandas as pd
import timm
import torch
from matplotlib.colors import LinearSegmentedColormap
from PIL import Image
from timm.data import create_transform, resolve_data_config
from timm.models import VisionTransformer
from torch import Tensor, nn
from torch.nn import functional as F
from torchvision import transforms as T

from .common import Heatmap, ImageLabels, LabelData, load_labels_hf, pil_ensure_rgb, pil_make_grid

# working dir, either file parent dir or cwd if interactive
work_dir = (Path(__file__).parent if "__file__" in locals() else Path.cwd()).resolve()
temp_dir = work_dir.joinpath("temp")
temp_dir.mkdir(exist_ok=True, parents=True)

# model cache
model_cache: dict[str, VisionTransformer] = {}
transform_cache: dict[str, T.Compose] = {}

# device to use
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class RGBtoBGR(nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        if x.ndim == 4:
            return x[:, [2, 1, 0], :, :]
        return x[[2, 1, 0], :, :]


def model_device(model: nn.Module) -> torch.device:
    return next(model.parameters()).device


def load_model(repo_id: str) -> VisionTransformer:
    global model_cache

    if model_cache.get(repo_id, None) is None:
        # save model to cache
        model_cache[repo_id] = timm.create_model("hf-hub:" + repo_id, pretrained=True).eval().to(torch_device)

    return model_cache[repo_id]


def load_model_and_transform(repo_id: str) -> tuple[VisionTransformer, T.Compose]:
    global transform_cache
    global model_cache

    if model_cache.get(repo_id, None) is None:
        # save model to cache
        model_cache[repo_id] = timm.create_model("hf-hub:" + repo_id, pretrained=True).eval()
    model = model_cache[repo_id]

    if transform_cache.get(repo_id, None) is None:
        transforms = create_transform(**resolve_data_config(model.pretrained_cfg, model=model))
        # hack in the RGBtoBGR transform, save to cache
        transform_cache[repo_id] = T.Compose(transforms.transforms + [RGBtoBGR()])
    transform = transform_cache[repo_id]

    return model, transform


def get_tags(
    probs: Tensor,
    labels: LabelData,
    gen_threshold: float,
    char_threshold: float,
):
    # Convert indices+probs to labels
    probs = list(zip(labels.names, probs.numpy()))

    # First 4 labels are actually ratings
    rating_labels = dict([probs[i] for i in labels.rating])

    # General labels, pick any where prediction confidence > threshold
    gen_labels = [probs[i] for i in labels.general]
    gen_labels = dict([x for x in gen_labels if x[1] > gen_threshold])
    gen_labels = dict(sorted(gen_labels.items(), key=lambda item: item[1], reverse=True))

    # Character labels, pick any where prediction confidence > threshold
    char_labels = [probs[i] for i in labels.character]
    char_labels = dict([x for x in char_labels if x[1] > char_threshold])
    char_labels = dict(sorted(char_labels.items(), key=lambda item: item[1], reverse=True))

    # Combine general and character labels, sort by confidence
    combined_names = [x for x in gen_labels]
    combined_names.extend([x for x in char_labels])

    # Convert to a string suitable for use as a training caption
    caption = ", ".join(combined_names).replace("(", "\\(").replace(")", "\\)")
    booru = caption.replace("_", " ")

    return caption, booru, rating_labels, char_labels, gen_labels


@torch.no_grad()
def render_heatmap(
    image: Tensor,
    gradients: Tensor,
    image_feats: Tensor,
    image_probs: Tensor,
    image_labels: list[str],
    cmap: LinearSegmentedColormap = cc.m_linear_bmy_10_95_c71,
    pos_embed_dim: int = 784,
    image_size: tuple[int, int] = (448, 448),
    font_args: dict = {
        "fontFace": cv2.FONT_HERSHEY_SIMPLEX,
        "fontScale": 1,
        "color": (255, 255, 255),
        "thickness": 2,
        "lineType": cv2.LINE_AA,
    },
    partial_rows: bool = True,
) -> tuple[list[Heatmap], Image.Image]:
    hmap_dim = int(math.sqrt(pos_embed_dim))

    image_hmaps = gradients.mean(2, keepdim=True).mul(image_feats.unsqueeze(0)).squeeze()
    image_hmaps = image_hmaps.mean(-1).reshape(len(image_labels), hmap_dim, hmap_dim)
    image_hmaps = image_hmaps.max(torch.zeros_like(image_hmaps))

    image_hmaps /= image_hmaps.reshape(image_hmaps.shape[0], -1).max(-1)[0].unsqueeze(-1).unsqueeze(-1)
    # normalize to 0-1
    image_hmaps = torch.stack([(x - x.min()) / (x.max() - x.min()) for x in image_hmaps]).unsqueeze(1)
    # interpolate to input image size
    image_hmaps = F.interpolate(image_hmaps, size=image_size, mode="bilinear").squeeze(1)

    hmap_imgs: list[Heatmap] = []
    for tag, hmap, score in zip(image_labels, image_hmaps, image_probs.cpu()):
        image_pixels = image.add(1).mul(127.5).squeeze().permute(1, 2, 0).cpu().numpy().astype(np.uint8)
        hmap_pixels = cmap(hmap.cpu().numpy(), bytes=True)[:, :, :3]

        hmap_cv2 = cv2.cvtColor(hmap_pixels, cv2.COLOR_RGB2BGR)
        hmap_image = cv2.addWeighted(image_pixels, 0.5, hmap_cv2, 0.5, 0)
        if tag is not None:
            cv2.putText(hmap_image, tag, (10, 30), **font_args)
            cv2.putText(hmap_image, f"{score:.3f}", org=(10, 60), **font_args)

        hmap_pil = Image.fromarray(cv2.cvtColor(hmap_image, cv2.COLOR_BGR2RGB))
        hmap_imgs.append(Heatmap(tag, score.item(), hmap_pil))

    hmap_imgs = sorted(hmap_imgs, key=lambda x: x.score, reverse=True)
    hmap_grid = pil_make_grid([x.image for x in hmap_imgs], partial_rows=partial_rows)

    return hmap_imgs, hmap_grid


def process_heatmap(
    model: VisionTransformer,
    image: Tensor,
    labels: LabelData,
    threshold: float = 0.5,
    partial_rows: bool = True,
) -> tuple[list[Heatmap], Image.Image, ImageLabels]:
    torch_device = model_device(model)

    with torch.set_grad_enabled(True):
        features = model.forward_features(image.to(torch_device))
        probs = model.forward_head(features)
        probs = F.sigmoid(probs).squeeze(0)

        probs_mask = probs > threshold
        heatmap_probs = probs[probs_mask]

        label_indices = torch.nonzero(probs_mask, as_tuple=False).squeeze(1)
        image_labels = [labels.names[label_indices[i]] for i in range(len(label_indices))]

        eye = torch.eye(heatmap_probs.shape[0], device=torch_device)
        grads = torch.autograd.grad(
            outputs=heatmap_probs,
            inputs=features,
            grad_outputs=eye,
            is_grads_batched=True,
            retain_graph=True,
        )
        grads = grads[0].detach().requires_grad_(False)[:, 0, :, :].unsqueeze(1)

    with torch.set_grad_enabled(False):
        hmap_imgs, hmap_grid = render_heatmap(
            image=image,
            gradients=grads,
            image_feats=features,
            image_probs=heatmap_probs,
            image_labels=image_labels,
            partial_rows=partial_rows,
        )

        caption, booru, ratings, character, general = get_tags(
            probs=probs.cpu(),
            labels=labels,
            gen_threshold=threshold,
            char_threshold=threshold,
        )
        labels = ImageLabels(caption, booru, ratings, general, character)

    return hmap_imgs, hmap_grid, labels
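To show how this module fits together outside the Gradio UI, a minimal script-style sketch follows, mirroring what predict() in app.py does; the image path and output filename are placeholder assumptions.

# Hypothetical script-style usage of tagger.model; mirrors app.predict() without Gradio.
from PIL import Image

from tagger.common import load_labels_hf, preprocess_image
from tagger.model import load_model_and_transform, process_heatmap

repo_id = "SmilingWolf/wd-vit-tagger-v3"
model, transform = load_model_and_transform(repo_id)
labels = load_labels_hf(repo_id)

img = Image.open("examples/img-01.png")   # placeholder path
img = preprocess_image(img, (448, 448))
batch = transform(img).unsqueeze(0)       # 1x3x448x448; RGB->BGR is applied by the transform

heatmaps, grid, image_labels = process_heatmap(model, batch, labels, threshold=0.35)
print(image_labels.caption)
grid.save("heatmap_grid.png")             # assumed output filename
for hm in heatmaps:
    print(f"{hm.label}: {hm.score:.3f}")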