|
import os |
|
import cv2 |
|
import numpy as np |
|
import torch |
|
import math |
|
from dataclasses import dataclass |
|
from transformers.models.clip.modeling_clip import CLIPVisionModelOutput |
|
|
|
from annotator.util import HWC3 |
|
from typing import Callable, Tuple, Union |
|
|
|
from modules.safe import Extra |
|
from modules import devices |
|
from scripts.logging import logger |
|
|
|
|
|
def torch_handler(module: str, name: str): |
|
""" Allow all torch access. Bypass A1111 safety whitelist. """ |
|
if module == 'torch': |
|
return getattr(torch, name) |
|
    if module == 'torch._tensor':

        # depth_anything's checkpoint touches torch._tensor globals when unpickling.

        return getattr(torch._tensor, name)
|
|
|
|
|
def pad64(x): |
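    """Number of pixels needed to pad x up to the next multiple of 64."""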
|
return int(np.ceil(float(x) / 64.0) * 64 - x) |
|
|
|
|
|
def safer_memory(x): |
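    # Fix many MAC/AMD problems: return a fresh C-contiguous copy so nothing aliases the caller's buffer.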
|
|
|
return np.ascontiguousarray(x.copy()).copy() |
|
|
|
|
|
def resize_image_with_pad(input_image, resolution, skip_hwc3=False): |
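    """Resize so the shorter side equals `resolution`, then edge-pad height and
    width up to multiples of 64. Returns the padded image and a `remove_pad`
    closure that crops a result back to the unpadded target size."""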
|
if skip_hwc3: |
|
img = input_image |
|
else: |
|
img = HWC3(input_image) |
|
H_raw, W_raw, _ = img.shape |
|
k = float(resolution) / float(min(H_raw, W_raw)) |
|
interpolation = cv2.INTER_CUBIC if k > 1 else cv2.INTER_AREA |
|
H_target = int(np.round(float(H_raw) * k)) |
|
W_target = int(np.round(float(W_raw) * k)) |
|
img = cv2.resize(img, (W_target, H_target), interpolation=interpolation) |
|
H_pad, W_pad = pad64(H_target), pad64(W_target) |
|
img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode='edge') |
|
|
|
def remove_pad(x): |
|
return safer_memory(x[:H_target, :W_target]) |
|
|
|
return safer_memory(img_padded), remove_pad |
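
# Typical flow (a minimal sketch; `some_detector` stands in for any annotator below):
#
#   img_r, remove_pad = resize_image_with_pad(raw_img, 512)
#   result = remove_pad(some_detector(img_r))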
|
|
|
|
|
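# Each annotator below is imported lazily on first use and cached in a
# module-level global, so unused detectors never load.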
model_canny = None |
|
|
|
|
|
def canny(img, res=512, thr_a=100, thr_b=200, **kwargs): |
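    """Canny edge detection; thr_a and thr_b are the low and high hysteresis thresholds."""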
|
l, h = thr_a, thr_b |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_canny |
|
if model_canny is None: |
|
from annotator.canny import apply_canny |
|
model_canny = apply_canny |
|
result = model_canny(img, l, h) |
|
return remove_pad(result), True |
|
|
|
|
|
def scribble_thr(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
result = np.zeros_like(img, dtype=np.uint8) |
|
result[np.min(img, axis=2) < 127] = 255 |
|
return remove_pad(result), True |
|
|
|
|
|
def scribble_xdog(img, res=512, thr_a=32, **kwargs): |
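    """XDoG-style sketch: threshold the per-pixel difference of two Gaussian blurs (sigma 0.5 and 5.0)."""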
|
img, remove_pad = resize_image_with_pad(img, res) |
|
g1 = cv2.GaussianBlur(img.astype(np.float32), (0, 0), 0.5) |
|
g2 = cv2.GaussianBlur(img.astype(np.float32), (0, 0), 5.0) |
|
dog = (255 - np.min(g2 - g1, axis=2)).clip(0, 255).astype(np.uint8) |
|
result = np.zeros_like(img, dtype=np.uint8) |
|
result[2 * (255 - dog) > thr_a] = 255 |
|
return remove_pad(result), True |
|
|
|
|
|
def tile_resample(img, res=512, thr_a=1.0, **kwargs): |
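    """Down-sample by factor thr_a; rates below 1.1 are treated as a no-op."""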
|
img = HWC3(img) |
|
if thr_a < 1.1: |
|
return img, True |
|
H, W, C = img.shape |
|
H = int(float(H) / float(thr_a)) |
|
W = int(float(W) / float(thr_a)) |
|
img = cv2.resize(img, (W, H), interpolation=cv2.INTER_AREA) |
|
return img, True |
|
|
|
|
|
def threshold(img, res=512, thr_a=127, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
result = np.zeros_like(img, dtype=np.uint8) |
|
result[np.min(img, axis=2) > thr_a] = 255 |
|
return remove_pad(result), True |
|
|
|
|
|
def identity(img, **kwargs): |
|
return img, True |
|
|
|
|
|
def invert(img, res=512, **kwargs): |
|
return 255 - HWC3(img), True |
|
|
|
|
|
model_hed = None |
|
|
|
|
|
def hed(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_hed |
|
if model_hed is None: |
|
from annotator.hed import apply_hed |
|
model_hed = apply_hed |
|
result = model_hed(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def hed_safe(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_hed |
|
if model_hed is None: |
|
from annotator.hed import apply_hed |
|
model_hed = apply_hed |
|
result = model_hed(img, is_safe=True) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_hed(): |
|
global model_hed |
|
if model_hed is not None: |
|
from annotator.hed import unload_hed_model |
|
unload_hed_model() |
|
|
|
|
|
def scribble_hed(img, res=512, **kwargs): |
|
result, _ = hed(img, res) |
|
from annotator.util import nms |
|
result = nms(result, 127, 3.0) |
|
result = cv2.GaussianBlur(result, (0, 0), 3.0) |
|
result[result > 4] = 255 |
|
result[result < 255] = 0 |
|
return result, True |
|
|
|
|
|
model_mediapipe_face = None |
|
|
|
|
|
def mediapipe_face(img, res=512, thr_a: int = 10, thr_b: float = 0.5, **kwargs): |
|
max_faces = int(thr_a) |
|
min_confidence = thr_b |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_mediapipe_face |
|
if model_mediapipe_face is None: |
|
from annotator.mediapipe_face import apply_mediapipe_face |
|
model_mediapipe_face = apply_mediapipe_face |
|
result = model_mediapipe_face(img, max_faces=max_faces, min_confidence=min_confidence) |
|
return remove_pad(result), True |
|
|
|
|
|
model_mlsd = None |
|
|
|
|
|
def mlsd(img, res=512, thr_a=0.1, thr_b=0.1, **kwargs): |
|
thr_v, thr_d = thr_a, thr_b |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_mlsd |
|
if model_mlsd is None: |
|
from annotator.mlsd import apply_mlsd |
|
model_mlsd = apply_mlsd |
|
result = model_mlsd(img, thr_v, thr_d) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_mlsd(): |
|
global model_mlsd |
|
if model_mlsd is not None: |
|
from annotator.mlsd import unload_mlsd_model |
|
unload_mlsd_model() |
|
|
|
|
|
model_depth_anything = None |
|
|
|
|
|
def depth_anything(img, res: int = 512, colored: bool = True, **kwargs):
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_depth_anything |
|
if model_depth_anything is None: |
|
with Extra(torch_handler): |
|
from annotator.depth_anything import DepthAnythingDetector |
|
device = devices.get_device_for("controlnet") |
|
model_depth_anything = DepthAnythingDetector(device) |
|
return remove_pad(model_depth_anything(img, colored=colored)), True |
|
|
|
|
|
def unload_depth_anything(): |
|
if model_depth_anything is not None: |
|
model_depth_anything.unload_model() |
|
|
|
|
|
model_midas = None |
|
|
|
|
|
def midas(img, res=512, a=np.pi * 2.0, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_midas |
|
if model_midas is None: |
|
from annotator.midas import apply_midas |
|
model_midas = apply_midas |
|
result, _ = model_midas(img, a) |
|
return remove_pad(result), True |
|
|
|
|
|
def midas_normal(img, res=512, a=np.pi * 2.0, thr_a=0.4, **kwargs): |
|
bg_th = thr_a |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_midas |
|
if model_midas is None: |
|
from annotator.midas import apply_midas |
|
model_midas = apply_midas |
|
_, result = model_midas(img, a, bg_th) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_midas(): |
|
global model_midas |
|
if model_midas is not None: |
|
from annotator.midas import unload_midas_model |
|
unload_midas_model() |
|
|
|
|
|
model_leres = None |
|
|
|
|
|
def leres(img, res=512, a=np.pi * 2.0, thr_a=0, thr_b=0, boost=False, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_leres |
|
if model_leres is None: |
|
from annotator.leres import apply_leres |
|
model_leres = apply_leres |
|
result = model_leres(img, thr_a, thr_b, boost=boost) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_leres(): |
|
global model_leres |
|
if model_leres is not None: |
|
from annotator.leres import unload_leres_model |
|
unload_leres_model() |
|
|
|
|
|
class OpenposeModel(object): |
|
def __init__(self) -> None: |
|
self.model_openpose = None |
|
|
|
def run_model( |
|
self, |
|
img: np.ndarray, |
|
include_body: bool, |
|
include_hand: bool, |
|
include_face: bool, |
|
use_dw_pose: bool = False, |
|
use_animal_pose: bool = False, |
|
json_pose_callback: Callable[[str], None] = None, |
|
res: int = 512, |
|
**kwargs |
|
) -> Tuple[np.ndarray, bool]: |
|
"""Run the openpose model. Returns a tuple of |
|
- result image |
|
- is_image flag |
|
|
|
The JSON format pose string is passed to `json_pose_callback`. |
|
""" |
|
if json_pose_callback is None: |
|
json_pose_callback = lambda x: None |
|
|
|
img, remove_pad = resize_image_with_pad(img, res) |
|
|
|
if self.model_openpose is None: |
|
from annotator.openpose import OpenposeDetector |
|
self.model_openpose = OpenposeDetector() |
|
|
|
return remove_pad(self.model_openpose( |
|
img, |
|
include_body=include_body, |
|
include_hand=include_hand, |
|
include_face=include_face, |
|
use_dw_pose=use_dw_pose, |
|
use_animal_pose=use_animal_pose, |
|
json_pose_callback=json_pose_callback |
|
)), True |
|
|
|
def unload(self): |
|
if self.model_openpose is not None: |
|
self.model_openpose.unload_model() |
|
|
|
|
|
g_openpose_model = OpenposeModel() |
|
|
|
model_uniformer = None |
|
|
|
|
|
def uniformer(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_uniformer |
|
if model_uniformer is None: |
|
from annotator.uniformer import apply_uniformer |
|
model_uniformer = apply_uniformer |
|
result = model_uniformer(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_uniformer(): |
|
global model_uniformer |
|
if model_uniformer is not None: |
|
from annotator.uniformer import unload_uniformer_model |
|
unload_uniformer_model() |
|
|
|
|
|
model_pidinet = None |
|
|
|
|
|
def pidinet(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_pidinet |
|
if model_pidinet is None: |
|
from annotator.pidinet import apply_pidinet |
|
model_pidinet = apply_pidinet |
|
result = model_pidinet(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def pidinet_ts(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_pidinet |
|
if model_pidinet is None: |
|
from annotator.pidinet import apply_pidinet |
|
model_pidinet = apply_pidinet |
|
    result = model_pidinet(img, apply_fliter=True)  # 'apply_fliter' (sic) matches the parameter's spelling in annotator.pidinet
|
return remove_pad(result), True |
|
|
|
|
|
def pidinet_safe(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_pidinet |
|
if model_pidinet is None: |
|
from annotator.pidinet import apply_pidinet |
|
model_pidinet = apply_pidinet |
|
result = model_pidinet(img, is_safe=True) |
|
return remove_pad(result), True |
|
|
|
|
|
def scribble_pidinet(img, res=512, **kwargs): |
|
result, _ = pidinet(img, res) |
|
from annotator.util import nms |
|
result = nms(result, 127, 3.0) |
|
result = cv2.GaussianBlur(result, (0, 0), 3.0) |
|
result[result > 4] = 255 |
|
result[result < 255] = 0 |
|
return result, True |
|
|
|
|
|
def unload_pidinet(): |
|
global model_pidinet |
|
if model_pidinet is not None: |
|
from annotator.pidinet import unload_pid_model |
|
unload_pid_model() |
|
|
|
|
|
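# CLIP vision encoders, created lazily per config name.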
clip_encoder = { |
|
'clip_g': None, |
|
'clip_h': None, |
|
'clip_vitl': None, |
|
} |
|
|
|
|
|
def clip(img, res=512, config='clip_vitl', low_vram=False, **kwargs): |
|
img = HWC3(img) |
|
global clip_encoder |
|
if clip_encoder[config] is None: |
|
from annotator.clipvision import ClipVisionDetector |
|
if low_vram: |
|
logger.info("Loading CLIP model on CPU.") |
|
clip_encoder[config] = ClipVisionDetector(config, low_vram) |
|
result = clip_encoder[config](img) |
|
return result, False |
|
|
|
|
|
def unload_clip(config='clip_vitl'): |
|
global clip_encoder |
|
if clip_encoder[config] is not None: |
|
clip_encoder[config].unload_model() |
|
clip_encoder[config] = None |
|
|
|
|
|
model_color = None |
|
|
|
|
|
def color(img, res=512, **kwargs): |
|
img = HWC3(img) |
|
global model_color |
|
if model_color is None: |
|
from annotator.color import apply_color |
|
model_color = apply_color |
|
result = model_color(img, res=res) |
|
return result, True |
|
|
|
|
|
def lineart_standard(img, res=512, **kwargs): |
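    """Line extraction for white-background/black-line input: a blur-minus-image
    (DoG-like) response, normalized by the median of clearly responding pixels."""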
|
img, remove_pad = resize_image_with_pad(img, res) |
|
x = img.astype(np.float32) |
|
g = cv2.GaussianBlur(x, (0, 0), 6.0) |
|
intensity = np.min(g - x, axis=2).clip(0, 255) |
|
intensity /= max(16, np.median(intensity[intensity > 8])) |
|
intensity *= 127 |
|
result = intensity.clip(0, 255).astype(np.uint8) |
|
return remove_pad(result), True |
|
|
|
|
|
model_lineart = None |
|
|
|
|
|
def lineart(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_lineart |
|
if model_lineart is None: |
|
from annotator.lineart import LineartDetector |
|
model_lineart = LineartDetector(LineartDetector.model_default) |
|
|
|
|
|
result = 255 - model_lineart(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_lineart(): |
|
global model_lineart |
|
if model_lineart is not None: |
|
model_lineart.unload_model() |
|
|
|
|
|
model_lineart_coarse = None |
|
|
|
|
|
def lineart_coarse(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_lineart_coarse |
|
if model_lineart_coarse is None: |
|
from annotator.lineart import LineartDetector |
|
model_lineart_coarse = LineartDetector(LineartDetector.model_coarse) |
|
|
|
|
|
result = 255 - model_lineart_coarse(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_lineart_coarse(): |
|
global model_lineart_coarse |
|
if model_lineart_coarse is not None: |
|
model_lineart_coarse.unload_model() |
|
|
|
|
|
model_lineart_anime = None |
|
|
|
|
|
def lineart_anime(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_lineart_anime |
|
if model_lineart_anime is None: |
|
from annotator.lineart_anime import LineartAnimeDetector |
|
model_lineart_anime = LineartAnimeDetector() |
|
|
|
|
|
result = 255 - model_lineart_anime(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_lineart_anime(): |
|
global model_lineart_anime |
|
if model_lineart_anime is not None: |
|
model_lineart_anime.unload_model() |
|
|
|
|
|
model_manga_line = None |
|
|
|
|
|
def lineart_anime_denoise(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_manga_line |
|
if model_manga_line is None: |
|
from annotator.manga_line import MangaLineExtration |
|
model_manga_line = MangaLineExtration() |
|
|
|
|
|
result = model_manga_line(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_lineart_anime_denoise(): |
|
global model_manga_line |
|
if model_manga_line is not None: |
|
model_manga_line.unload_model() |
|
|
|
|
|
model_lama = None |
|
|
|
|
|
def lama_inpaint(img, res=512, **kwargs): |
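    """Inpaint with LaMa. `img` is RGBA: the alpha channel is the inpaint mask.
    The prediction is alpha-blended over the original colors and returned with
    the mask re-attached as the fourth channel."""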
|
H, W, C = img.shape |
|
raw_color = img[:, :, 0:3].copy() |
|
raw_mask = img[:, :, 3:4].copy() |
|
|
|
    res = 256  # LaMa runs at a fixed 256 resolution here, regardless of the requested res.
|
|
|
img_res, remove_pad = resize_image_with_pad(img, res, skip_hwc3=True) |
|
|
|
global model_lama |
|
if model_lama is None: |
|
from annotator.lama import LamaInpainting |
|
model_lama = LamaInpainting() |
|
|
|
|
|
prd_color = model_lama(img_res) |
|
prd_color = remove_pad(prd_color) |
|
prd_color = cv2.resize(prd_color, (W, H)) |
|
|
|
alpha = raw_mask.astype(np.float32) / 255.0 |
|
fin_color = prd_color.astype(np.float32) * alpha + raw_color.astype(np.float32) * (1 - alpha) |
|
fin_color = fin_color.clip(0, 255).astype(np.uint8) |
|
|
|
result = np.concatenate([fin_color, raw_mask], axis=2) |
|
|
|
return result, True |
|
|
|
|
|
def unload_lama_inpaint(): |
|
global model_lama |
|
if model_lama is not None: |
|
model_lama.unload_model() |
|
|
|
|
|
model_zoe_depth = None |
|
|
|
|
|
def zoe_depth(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_zoe_depth |
|
if model_zoe_depth is None: |
|
from annotator.zoe import ZoeDetector |
|
model_zoe_depth = ZoeDetector() |
|
result = model_zoe_depth(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_zoe_depth(): |
|
global model_zoe_depth |
|
if model_zoe_depth is not None: |
|
model_zoe_depth.unload_model() |
|
|
|
|
|
model_normal_bae = None |
|
|
|
|
|
def normal_bae(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_normal_bae |
|
if model_normal_bae is None: |
|
from annotator.normalbae import NormalBaeDetector |
|
model_normal_bae = NormalBaeDetector() |
|
result = model_normal_bae(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_normal_bae(): |
|
global model_normal_bae |
|
if model_normal_bae is not None: |
|
model_normal_bae.unload_model() |
|
|
|
|
|
model_oneformer_coco = None |
|
|
|
|
|
def oneformer_coco(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_oneformer_coco |
|
if model_oneformer_coco is None: |
|
from annotator.oneformer import OneformerDetector |
|
model_oneformer_coco = OneformerDetector(OneformerDetector.configs["coco"]) |
|
result = model_oneformer_coco(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_oneformer_coco(): |
|
global model_oneformer_coco |
|
if model_oneformer_coco is not None: |
|
model_oneformer_coco.unload_model() |
|
|
|
|
|
model_oneformer_ade20k = None |
|
|
|
|
|
def oneformer_ade20k(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_oneformer_ade20k |
|
if model_oneformer_ade20k is None: |
|
from annotator.oneformer import OneformerDetector |
|
model_oneformer_ade20k = OneformerDetector(OneformerDetector.configs["ade20k"]) |
|
result = model_oneformer_ade20k(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_oneformer_ade20k(): |
|
global model_oneformer_ade20k |
|
if model_oneformer_ade20k is not None: |
|
model_oneformer_ade20k.unload_model() |
|
|
|
|
|
model_shuffle = None |
|
|
|
|
|
def shuffle(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
img = remove_pad(img) |
|
global model_shuffle |
|
if model_shuffle is None: |
|
from annotator.shuffle import ContentShuffleDetector |
|
model_shuffle = ContentShuffleDetector() |
|
result = model_shuffle(img) |
|
return result, True |
|
|
|
|
|
def recolor_luminance(img, res=512, thr_a=1.0, **kwargs): |
|
result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2LAB) |
|
result = result[:, :, 0].astype(np.float32) / 255.0 |
|
result = result ** thr_a |
|
result = (result * 255.0).clip(0, 255).astype(np.uint8) |
|
result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB) |
|
return result, True |
|
|
|
|
|
def recolor_intensity(img, res=512, thr_a=1.0, **kwargs): |
|
result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2HSV) |
|
result = result[:, :, 2].astype(np.float32) / 255.0 |
|
result = result ** thr_a |
|
result = (result * 255.0).clip(0, 255).astype(np.uint8) |
|
result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB) |
|
return result, True |
|
|
|
|
|
def blur_gaussian(img, res=512, thr_a=1.0, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
img = remove_pad(img) |
|
result = cv2.GaussianBlur(img, (0, 0), float(thr_a)) |
|
return result, True |
|
|
|
|
|
model_anime_face_segment = None |
|
|
|
|
|
def anime_face_segment(img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_anime_face_segment |
|
if model_anime_face_segment is None: |
|
from annotator.anime_face_segment import AnimeFaceSegment |
|
model_anime_face_segment = AnimeFaceSegment() |
|
|
|
result = model_anime_face_segment(img) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_anime_face_segment(): |
|
global model_anime_face_segment |
|
if model_anime_face_segment is not None: |
|
model_anime_face_segment.unload_model() |
|
|
|
|
|
|
|
def densepose(img, res=512, cmap="viridis", **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
from annotator.densepose import apply_densepose |
|
result = apply_densepose(img, cmap=cmap) |
|
return remove_pad(result), True |
|
|
|
|
|
def unload_densepose(): |
|
from annotator.densepose import unload_model |
|
unload_model() |
|
|
|
model_te_hed = None |
|
|
|
def te_hed(img, res=512, thr_a=2, **kwargs): |
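    """Soft-edge detection with the TEED detector ('TEEDDector' is the annotator class's own spelling); thr_a sets the safe steps."""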
|
img, remove_pad = resize_image_with_pad(img, res) |
|
global model_te_hed |
|
if model_te_hed is None: |
|
from annotator.teed import TEEDDector |
|
model_te_hed = TEEDDector() |
|
result = model_te_hed(img, safe_steps=int(thr_a)) |
|
return remove_pad(result), True |
|
|
|
def unload_te_hed(): |
|
if model_te_hed is not None: |
|
model_te_hed.unload_model() |
|
|
|
class InsightFaceModel: |
|
def __init__(self, face_analysis_model_name: str = "buffalo_l"): |
|
self.model = None |
|
self.face_analysis_model_name = face_analysis_model_name |
|
self.antelopev2_installed = False |
|
|
|
def install_antelopev2(self): |
|
"""insightface's github release on antelopev2 model is down. Downloading |
|
from huggingface mirror.""" |
|
from basicsr.utils.download_util import load_file_from_url |
|
from annotator.annotator_path import models_path |
|
model_root = os.path.join(models_path, "insightface", "models", "antelopev2") |
|
        if not os.path.exists(model_root):
|
os.makedirs(model_root, exist_ok=True) |
|
for local_file, url in ( |
|
("1k3d68.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/1k3d68.onnx"), |
|
("2d106det.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/2d106det.onnx"), |
|
("genderage.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/genderage.onnx"), |
|
("glintr100.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/glintr100.onnx"), |
|
("scrfd_10g_bnkps.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/scrfd_10g_bnkps.onnx"), |
|
): |
|
local_path = os.path.join(model_root, local_file) |
|
if not os.path.exists(local_path): |
|
load_file_from_url(url, model_root) |
|
self.antelopev2_installed = True |
|
|
|
def load_model(self): |
|
if self.model is None: |
|
from insightface.app import FaceAnalysis |
|
from annotator.annotator_path import models_path |
|
self.model = FaceAnalysis( |
|
name=self.face_analysis_model_name, |
|
providers=['CUDAExecutionProvider', 'CPUExecutionProvider'], |
|
root=os.path.join(models_path, "insightface"), |
|
) |
|
self.model.prepare(ctx_id=0, det_size=(640, 640)) |
|
|
|
def run_model(self, img: np.ndarray, **kwargs) -> Tuple[torch.Tensor, bool]: |
|
self.load_model() |
|
img = HWC3(img) |
|
faces = self.model.get(img) |
|
if not faces: |
|
raise Exception(f"Insightface: No face found in image.") |
|
if len(faces) > 1: |
|
logger.warn("Insightface: More than one face is detected in the image. " |
|
f"Only the first one will be used.") |
|
return torch.from_numpy(faces[0].normed_embedding).unsqueeze(0), False |
|
|
|
def run_model_instant_id( |
|
self, |
|
img: np.ndarray, |
|
res: int = 512, |
|
return_keypoints: bool = False, |
|
**kwargs |
|
) -> Tuple[Union[np.ndarray, torch.Tensor], bool]: |
|
"""Run the insightface model for instant_id. |
|
Arguments: |
|
- img: Input image in any size. |
|
- res: Resolution used to resize image. |
|
- return_keypoints: Whether to return keypoints image or face embedding. |
|
""" |
|
def draw_kps(img: np.ndarray, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)]): |
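            """Render the five face keypoints: limb ellipses connecting each point to point 2, then color-coded dots."""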
|
stickwidth = 4 |
|
limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]]) |
|
kps = np.array(kps) |
|
|
|
h, w, _ = img.shape |
|
out_img = np.zeros([h, w, 3]) |
|
|
|
for i in range(len(limbSeq)): |
|
index = limbSeq[i] |
|
color = color_list[index[0]] |
|
|
|
x = kps[index][:, 0] |
|
y = kps[index][:, 1] |
|
length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5 |
|
angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1])) |
|
polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1) |
|
out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color) |
|
out_img = (out_img * 0.6).astype(np.uint8) |
|
|
|
for idx_kp, kp in enumerate(kps): |
|
color = color_list[idx_kp] |
|
x, y = kp |
|
out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1) |
|
|
|
return out_img.astype(np.uint8) |
|
|
|
if not self.antelopev2_installed: |
|
self.install_antelopev2() |
|
self.load_model() |
|
|
|
img, remove_pad = resize_image_with_pad(img, res) |
|
face_info = self.model.get(img) |
|
if not face_info: |
|
raise Exception(f"Insightface: No face found in image.") |
|
if len(face_info) > 1: |
|
logger.warn("Insightface: More than one face is detected in the image. " |
|
f"Only the biggest one will be used.") |
|
|
|
        face_info = sorted(face_info, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]  # largest face by bbox area
|
if return_keypoints: |
|
return remove_pad(draw_kps(img, face_info['kps'])), True |
|
else: |
|
return torch.from_numpy(face_info['embedding']), False |
|
|
|
|
|
g_insight_face_model = InsightFaceModel() |
|
g_insight_face_instant_id_model = InsightFaceModel(face_analysis_model_name="antelopev2") |
|
|
|
|
|
@dataclass |
|
class FaceIdPlusInput: |
|
face_embed: torch.Tensor |
|
clip_embed: CLIPVisionModelOutput |
|
|
|
|
|
def face_id_plus(img, low_vram=False, **kwargs): |
|
""" FaceID plus uses both face_embeding from insightface and clip_embeding from clip. """ |
|
face_embed, _ = g_insight_face_model.run_model(img) |
|
clip_embed, _ = clip(img, config='clip_h', low_vram=low_vram) |
|
return FaceIdPlusInput(face_embed, clip_embed), False |
|
|
|
|
|
class HandRefinerModel: |
|
def __init__(self): |
|
self.model = None |
|
self.device = devices.get_device_for("controlnet") |
|
|
|
def load_model(self): |
|
if self.model is None: |
|
from annotator.annotator_path import models_path |
|
from hand_refiner import MeshGraphormerDetector |
|
with Extra(torch_handler): |
|
self.model = MeshGraphormerDetector.from_pretrained( |
|
"hr16/ControlNet-HandRefiner-pruned", |
|
cache_dir=os.path.join(models_path, "hand_refiner"), |
|
device=self.device, |
|
) |
|
else: |
|
self.model.to(self.device) |
|
|
|
def unload(self): |
|
if self.model is not None: |
|
self.model.to("cpu") |
|
|
|
def run_model(self, img, res=512, **kwargs): |
|
img, remove_pad = resize_image_with_pad(img, res) |
|
self.load_model() |
|
with Extra(torch_handler): |
|
depth_map, mask, info = self.model( |
|
img, output_type="np", |
|
detect_resolution=res, |
|
mask_bbox_padding=30, |
|
) |
|
return remove_pad(depth_map), True |
|
|
|
|
|
g_hand_refiner_model = HandRefinerModel() |
|
|
|
|
|
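# Preprocessors that do not require loading a ControlNet model.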
model_free_preprocessors = [ |
|
"reference_only", |
|
"reference_adain", |
|
"reference_adain+attn", |
|
"revision_clipvision", |
|
"revision_ignore_prompt" |
|
] |
|
|
|
no_control_mode_preprocessors = [ |
|
"revision_clipvision", |
|
"revision_ignore_prompt", |
|
"clip_vision", |
|
"ip-adapter_clip_sd15", |
|
"ip-adapter_clip_sdxl", |
|
"ip-adapter_clip_sdxl_plus_vith", |
|
"t2ia_style_clipvision", |
|
"ip-adapter_face_id", |
|
"ip-adapter_face_id_plus", |
|
] |
|
|
|
flag_preprocessor_resolution = "Preprocessor Resolution" |
|
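# Slider specs map positionally onto each preprocessor's arguments: the first
# entry is the resolution, the second thr_a, the third thr_b; a None entry
# leaves that slot without a slider.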
preprocessor_sliders_config = { |
|
"none": [], |
|
"inpaint": [], |
|
"inpaint_only": [], |
|
"revision_clipvision": [ |
|
None, |
|
{ |
|
"name": "Noise Augmentation", |
|
"value": 0.0, |
|
"min": 0.0, |
|
"max": 1.0 |
|
}, |
|
], |
|
"revision_ignore_prompt": [ |
|
None, |
|
{ |
|
"name": "Noise Augmentation", |
|
"value": 0.0, |
|
"min": 0.0, |
|
"max": 1.0 |
|
}, |
|
], |
|
"canny": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
}, |
|
{ |
|
"name": "Canny Low Threshold", |
|
"value": 100, |
|
"min": 1, |
|
"max": 255 |
|
}, |
|
{ |
|
"name": "Canny High Threshold", |
|
"value": 200, |
|
"min": 1, |
|
"max": 255 |
|
}, |
|
], |
|
"mlsd": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
}, |
|
{ |
|
"name": "MLSD Value Threshold", |
|
"min": 0.01, |
|
"max": 2.0, |
|
"value": 0.1, |
|
"step": 0.01 |
|
}, |
|
{ |
|
"name": "MLSD Distance Threshold", |
|
"min": 0.01, |
|
"max": 20.0, |
|
"value": 0.1, |
|
"step": 0.01 |
|
} |
|
], |
|
"hed": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"scribble_hed": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"hed_safe": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"openpose": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"openpose_full": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"dw_openpose_full": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"animal_openpose": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"segmentation": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"depth": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"depth_leres": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
}, |
|
{ |
|
"name": "Remove Near %", |
|
"min": 0, |
|
"max": 100, |
|
"value": 0, |
|
"step": 0.1, |
|
}, |
|
{ |
|
"name": "Remove Background %", |
|
"min": 0, |
|
"max": 100, |
|
"value": 0, |
|
"step": 0.1, |
|
} |
|
], |
|
"depth_leres++": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
}, |
|
{ |
|
"name": "Remove Near %", |
|
"min": 0, |
|
"max": 100, |
|
"value": 0, |
|
"step": 0.1, |
|
}, |
|
{ |
|
"name": "Remove Background %", |
|
"min": 0, |
|
"max": 100, |
|
"value": 0, |
|
"step": 0.1, |
|
} |
|
], |
|
"normal_map": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
}, |
|
{ |
|
"name": "Normal Background Threshold", |
|
"min": 0.0, |
|
"max": 1.0, |
|
"value": 0.4, |
|
"step": 0.01 |
|
} |
|
], |
|
"threshold": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
}, |
|
{ |
|
"name": "Binarization Threshold", |
|
"min": 0, |
|
"max": 255, |
|
"value": 127 |
|
} |
|
], |
|
|
|
"scribble_xdog": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
}, |
|
{ |
|
"name": "XDoG Threshold", |
|
"min": 1, |
|
"max": 64, |
|
"value": 32, |
|
} |
|
], |
|
"blur_gaussian": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
}, |
|
{ |
|
"name": "Sigma", |
|
"min": 0.01, |
|
"max": 64.0, |
|
"value": 9.0, |
|
} |
|
], |
|
"tile_resample": [ |
|
None, |
|
{ |
|
"name": "Down Sampling Rate", |
|
"value": 1.0, |
|
"min": 1.0, |
|
"max": 8.0, |
|
"step": 0.01 |
|
} |
|
], |
|
"tile_colorfix": [ |
|
None, |
|
{ |
|
"name": "Variation", |
|
"value": 8.0, |
|
"min": 3.0, |
|
"max": 32.0, |
|
"step": 1.0 |
|
} |
|
], |
|
"tile_colorfix+sharp": [ |
|
None, |
|
{ |
|
"name": "Variation", |
|
"value": 8.0, |
|
"min": 3.0, |
|
"max": 32.0, |
|
"step": 1.0 |
|
}, |
|
{ |
|
"name": "Sharpness", |
|
"value": 1.0, |
|
"min": 0.0, |
|
"max": 2.0, |
|
"step": 0.01 |
|
} |
|
], |
|
"reference_only": [ |
|
None, |
|
{ |
|
"name": r'Style Fidelity (only for "Balanced" mode)', |
|
"value": 0.5, |
|
"min": 0.0, |
|
"max": 1.0, |
|
"step": 0.01 |
|
} |
|
], |
|
"reference_adain": [ |
|
None, |
|
{ |
|
"name": r'Style Fidelity (only for "Balanced" mode)', |
|
"value": 0.5, |
|
"min": 0.0, |
|
"max": 1.0, |
|
"step": 0.01 |
|
} |
|
], |
|
"reference_adain+attn": [ |
|
None, |
|
{ |
|
"name": r'Style Fidelity (only for "Balanced" mode)', |
|
"value": 0.5, |
|
"min": 0.0, |
|
"max": 1.0, |
|
"step": 0.01 |
|
} |
|
], |
|
"inpaint_only+lama": [], |
|
"color": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048, |
|
} |
|
], |
|
"mediapipe_face": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048, |
|
}, |
|
{ |
|
"name": "Max Faces", |
|
"value": 1, |
|
"min": 1, |
|
"max": 10, |
|
"step": 1 |
|
}, |
|
{ |
|
"name": "Min Face Confidence", |
|
"value": 0.5, |
|
"min": 0.01, |
|
"max": 1.0, |
|
"step": 0.01 |
|
} |
|
], |
|
"recolor_luminance": [ |
|
None, |
|
{ |
|
"name": "Gamma Correction", |
|
"value": 1.0, |
|
"min": 0.1, |
|
"max": 2.0, |
|
"step": 0.001 |
|
} |
|
], |
|
"recolor_intensity": [ |
|
None, |
|
{ |
|
"name": "Gamma Correction", |
|
"value": 1.0, |
|
"min": 0.1, |
|
"max": 2.0, |
|
"step": 0.001 |
|
} |
|
], |
|
"anime_face_segment": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
} |
|
], |
|
"densepose": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"densepose_parula": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"min": 64, |
|
"max": 2048, |
|
"value": 512 |
|
} |
|
], |
|
"depth_hand_refiner": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
} |
|
], |
|
"te_hed": [ |
|
{ |
|
"name": flag_preprocessor_resolution, |
|
"value": 512, |
|
"min": 64, |
|
"max": 2048 |
|
}, |
|
{ |
|
"name": "Safe Steps", |
|
"min": 0, |
|
"max": 10, |
|
"value": 2, |
|
"step": 1, |
|
}, |
|
], |
|
} |
|
|
|
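# Maps each UI filter category to its default preprocessor.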
preprocessor_filters = { |
|
"All": "none", |
|
"Canny": "canny", |
|
"Depth": "depth_midas", |
|
"NormalMap": "normal_bae", |
|
"OpenPose": "openpose_full", |
|
"MLSD": "mlsd", |
|
"Lineart": "lineart_standard (from white bg & black line)", |
|
"SoftEdge": "softedge_pidinet", |
|
"Scribble/Sketch": "scribble_pidinet", |
|
"Segmentation": "seg_ofade20k", |
|
"Shuffle": "shuffle", |
|
"Tile/Blur": "tile_resample", |
|
"Inpaint": "inpaint_only", |
|
"InstructP2P": "none", |
|
"Reference": "reference_only", |
|
"Recolor": "recolor_luminance", |
|
"Revision": "revision_clipvision", |
|
"T2I-Adapter": "none", |
|
"IP-Adapter": "ip-adapter_clip_sd15", |
|
"Instant_ID": "instant_id", |
|
} |
|
|
|
preprocessor_filters_aliases = { |
|
'instructp2p': ['ip2p'], |
|
'segmentation': ['seg'], |
|
'normalmap': ['normal'], |
|
't2i-adapter': ['t2i_adapter', 't2iadapter', 't2ia'], |
|
'ip-adapter': ['ip_adapter', 'ipadapter'], |
|
'scribble/sketch': ['scribble', 'sketch'], |
|
'tile/blur': ['tile', 'blur'], |
|
    'openpose': ['openpose', 'densepose'],
|
} |
|
|