Upload 6 files
- app.py +157 -0
- config.py +47 -0
- generator.py +156 -0
- model.py +166 -0
- requirements.txt +14 -0
- utils.py +77 -0
app.py
ADDED
@@ -0,0 +1,157 @@
import gradio as gr
import spaces
import torch
from model import ModelHandler
from generator import Generator
from config import Config

# 1. Initialize Models Globally
print("Initializing Application...")
handler = ModelHandler()
handler.load_models()
gen = Generator(handler)

# 2. Define GPU-enabled Inference Function
@spaces.GPU(duration=20)
def process_img(
    image,
    prompt,
    negative_prompt,
    cfg_scale,  # <-- RE-ENABLED
    steps,
    img_strength,
    depth_strength,
    edge_strength,
    seed
):
    if image is None:
        raise gr.Error("Please upload an image first.")

    try:
        print("--- Starting Generation ---")
        result = gen.predict(
            image,
            prompt,
            negative_prompt=negative_prompt,
            guidance_scale=cfg_scale,  # <-- RE-ENABLED
            num_inference_steps=steps,
            img2img_strength=img_strength,
            depth_strength=depth_strength,
            lineart_strength=edge_strength,
            seed=seed
        )
        print("--- Generation Complete ---")
        return result

    except Exception as e:
        print(f"Error during generation: {e}")
        raise gr.Error(f"An error occurred: {str(e)}")

# 3. Build Gradio Interface
with gr.Blocks(title="Face To Style", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎮 Face to Style
        Upload any image. If there is a face, we'll keep the identity. If not, we'll stylize the scene!
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            input_img = gr.Image(type="pil", label="Input Image")
            prompt = gr.Textbox(
                label="Prompt (Optional)",
                placeholder="Leave empty for auto-captioning...",
                info=f"The trigger words '{Config.STYLE_TRIGGER}' are added automatically."
            )

            negative_prompt = gr.Textbox(
                label="Negative Prompt (Optional)",
                placeholder="e.g., blurry, text, watermark, bad art...",
                value=Config.DEFAULT_NEGATIVE_PROMPT
            )

            with gr.Accordion("Advanced Settings", open=False):
                seed = gr.Number(
                    label="Seed",
                    value=-1,
                    info="-1 for random",
                    precision=0
                )

                # --- RE-ENABLED CFG/GUIDANCE SLIDER ---
                cfg_scale = gr.Slider(
                    elem_id="cfg_scale",
                    minimum=1.0,
                    maximum=10.0,  # Range for TCD+Style
                    step=0.1,
                    value=Config.CFG_SCALE,  # Default 4.0
                    label="Style Strength (Guidance)"
                )

                steps = gr.Slider(
                    elem_id="steps",
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=8,  # TCD default
                    label="Number of Steps"
                )
                img_strength = gr.Slider(
                    elem_id="img_strength",
                    minimum=0.1,
                    maximum=1.0,
                    step=0.05,
                    value=Config.IMG_STRENGTH,
                    label="Image Strength (Img2Img)"
                )
                depth_strength = gr.Slider(
                    elem_id="depth_strength",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=Config.DEPTH_STRENGTH,
                    label="Depth Map Strength"
                )
                edge_strength = gr.Slider(
                    elem_id="edge_strength",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=Config.EDGE_STRENGTH,
                    label="Edge Map Strength (LineArt)"
                )

            run_btn = gr.Button("Generate", variant="primary")

        with gr.Column(scale=1):
            output_img = gr.Image(label="Styled Result")

    # Event Handler
    all_inputs = [
        input_img,
        prompt,
        negative_prompt,
        cfg_scale,  # <-- RE-ENABLED
        steps,
        img_strength,
        depth_strength,
        edge_strength,
        seed
    ]

    run_btn.click(
        fn=process_img,
        inputs=all_inputs,
        outputs=[output_img]
    )


# 4. Launch the App
if __name__ == "__main__":
    demo.queue(max_size=20, api_open=True)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_api=True
    )
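Because the app opens its API (demo.queue(api_open=True) and show_api=True), the same endpoint can be driven remotely with gradio_client once the Space is running. A minimal sketch, assuming a recent gradio_client; the Space id, input file, and api_name below are placeholders, not values taken from this upload (check the Space's "Use via API" panel for the real endpoint name):

from gradio_client import Client, handle_file

client = Client("your-username/face-to-style")  # hypothetical Space id
result = client.predict(
    handle_file("input.jpg"),  # image (hypothetical local file)
    "",                        # prompt (empty -> auto captioning)
    "",                        # negative prompt
    4.0,                       # cfg_scale
    8,                         # steps
    0.8,                       # img_strength
    0.8,                       # depth_strength
    0.8,                       # edge_strength
    -1,                        # seed (-1 = random)
    api_name="/process_img",   # assumed default endpoint name
)
print(result)  # typically a local path to the generated image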
config.py
ADDED
@@ -0,0 +1,47 @@
import torch

class Config:
    # Hardware
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

    # --- UPDATED: New Base Model & Style LoRA ---
    # Assuming these are in the 'primerz/pixagram' repo or a new one.
    # If they are in a different repo, change REPO_ID.
    REPO_ID = "primerz/pixagram"
    CHECKPOINT_FILENAME = "reality.safetensors"
    LORA_FILENAME = "retroart.safetensors"
    LORA_STRENGTH = 1.25  # TCD works well with 1.0

    # Trigger Words for the LoRA
    STYLE_TRIGGER = "p1x3l4rt, pixel art"

    # Default Negative Prompt (Updated for general use)
    DEFAULT_NEGATIVE_PROMPT = "Ugly, artifacts, blurry, deformed, photo-realistic, photo, photography, realistic, low-quality, pixelart, text."
    # --- END UPDATED ---

    # InstantID Assets
    INSTANTID_REPO = "InstantX/InstantID"

    # ControlNet Repos
    CN_ZOE_REPO = "diffusers/controlnet-zoe-depth-sdxl-1.0"
    CN_LINEART_REPO = "ShermanG/ControlNet-Standard-Lineart-for-SDXL"

    # Preprocessor (Annotator) Repo
    ANNOTATOR_REPO = "lllyasviel/Annotators"

    # Captioning Model
    CAPTIONER_REPO = "Salesforce/blip-image-captioning-base"

    # InsightFace Model (HF Hub mirror)
    ANTELOPEV2_REPO = "DIAMONIK7777/antelopev2"
    ANTELOPEV2_ROOT = "."  # Parent folder
    ANTELOPEV2_NAME = "antelopev2"

    # Gradio Parameters
    # --- FIX: Style LoRA needs non-zero CFG to activate. ---
    CFG_SCALE = 4.0  # Was 0.0. This activates the prompt trigger.
    STEPS_NUMBER = 4
    IMG_STRENGTH = 0.8
    DEPTH_STRENGTH = 0.8
    EDGE_STRENGTH = 0.8
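The repo ids and filenames above are plain configuration, and a typo there only surfaces once model.py starts its large downloads. A small optional sanity check, sketched on the assumption that the repos are public and network access is available:

from huggingface_hub import list_repo_files
from config import Config

repo_files = set(list_repo_files(Config.REPO_ID))
for filename in (Config.CHECKPOINT_FILENAME, Config.LORA_FILENAME):
    status = "ok" if filename in repo_files else "MISSING"
    print(f"{Config.REPO_ID}/{filename}: {status}")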
generator.py
ADDED
@@ -0,0 +1,156 @@
import torch
from config import Config
from utils import get_caption, draw_kps  # Removed resize_image_to_1mp
from PIL import Image

class Generator:
    def __init__(self, model_handler):
        self.mh = model_handler

    def smart_crop_and_resize(self, image):
        """
        Analyzes aspect ratio and snaps to the best SDXL resolution bucket.
        Performs a center crop to match the target ratio, then resizes.
        """
        w, h = image.size
        aspect_ratio = w / h

        # 1. Determine Target Resolution (SDXL Buckets)
        if 0.85 <= aspect_ratio <= 1.15:
            target_w, target_h = 1024, 1024
            print("Snap to Bucket: Square (1024x1024)")
        elif aspect_ratio < 0.85:
            if aspect_ratio < 0.72:
                target_w, target_h = 832, 1216  # Tall Portrait
                print("Snap to Bucket: Tall Portrait (832x1216)")
            else:
                target_w, target_h = 896, 1152  # Standard Portrait
                print("Snap to Bucket: Portrait (896x1152)")
        else:  # aspect_ratio > 1.15
            if aspect_ratio > 1.35:
                target_w, target_h = 1216, 832  # Wide Landscape
                print("Snap to Bucket: Wide Landscape (1216x832)")
            else:
                target_w, target_h = 1152, 896  # Standard Landscape
                print("Snap to Bucket: Landscape (1152x896)")

        # 2. Center Crop to Target Aspect Ratio
        target_ar = target_w / target_h

        if aspect_ratio > target_ar:
            new_w = int(h * target_ar)
            offset = (w - new_w) // 2
            crop_box = (offset, 0, offset + new_w, h)
        else:
            new_h = int(w / target_ar)
            offset = (h - new_h) // 2
            crop_box = (0, offset, w, offset + new_h)

        cropped_img = image.crop(crop_box)

        # 3. Resize to Exact Target Resolution
        final_img = cropped_img.resize((target_w, target_h), Image.LANCZOS)
        return final_img

    def prepare_control_images(self, image, width, height):
        """
        Generates conditioning maps, ensuring they are resized
        to the exact target dimensions (width, height).
        """
        print(f"Generating control maps for {width}x{height}...")
        depth_map_raw = self.mh.leres_detector(image)
        lineart_map_raw = self.mh.lineart_anime_detector(image)
        depth_map = depth_map_raw.resize((width, height), Image.LANCZOS)
        lineart_map = lineart_map_raw.resize((width, height), Image.LANCZOS)
        return depth_map, lineart_map

    def predict(
        self,
        input_image,
        user_prompt="",
        negative_prompt="",
        # --- TCD Optimized Defaults ---
        guidance_scale=4.0,  # <-- FIX: Set to non-zero default
        num_inference_steps=8,
        img2img_strength=0.9,
        # ----------------------------
        depth_strength=0.3,
        lineart_strength=0.3,
        seed=-1
    ):
        # 1. Pre-process Inputs (Using Smart Crop)
        print("Processing Input...")
        processed_image = self.smart_crop_and_resize(input_image)
        target_width, target_height = processed_image.size

        # 2. Get Face Info
        face_info = self.mh.get_face_info(processed_image)

        # 3. Generate Prompt
        if not user_prompt.strip():
            try:
                generated_caption = get_caption(processed_image)
                final_prompt = f"{Config.STYLE_TRIGGER}, {generated_caption}"
            except Exception as e:
                print(f"Captioning failed: {e}, using default prompt.")
                final_prompt = f"{Config.STYLE_TRIGGER}, a beautiful image"
        else:
            final_prompt = f"{Config.STYLE_TRIGGER}, {user_prompt}"

        print(f"Prompt: {final_prompt}")
        print(f"Negative Prompt: {negative_prompt}")

        # 4. Generate Control Maps
        print("Generating Control Maps (Depth, LineArt)...")
        depth_map, lineart_map = self.prepare_control_images(processed_image, target_width, target_height)

        # 5. Logic for Face vs No-Face
        if face_info is not None:
            print("Face detected: Applying InstantID with keypoints.")
            face_emb = torch.tensor(
                face_info['embedding'],
                dtype=Config.DTYPE,
                device=Config.DEVICE
            ).unsqueeze(0)
            face_kps = draw_kps(processed_image, face_info['kps'])
            controlnet_conditioning_scale = [0.8, depth_strength, lineart_strength]
            self.mh.pipeline.set_ip_adapter_scale(0.8)
        else:
            print("No face detected: Disabling InstantID.")
            face_emb = torch.zeros((1, 512), dtype=Config.DTYPE, device=Config.DEVICE)
            face_kps = Image.new('RGB', (target_width, target_height), (0, 0, 0))
            controlnet_conditioning_scale = [0.0, depth_strength, lineart_strength]
            self.mh.pipeline.set_ip_adapter_scale(0.0)

        control_guidance_end = [0.3, 0.6, 0.6]

        if seed == -1 or seed is None:
            seed = torch.Generator().seed()
        generator = torch.Generator(device=Config.DEVICE).manual_seed(int(seed))
        print(f"Using seed: {seed}")

        # 6. Run Inference
        print("Running pipeline...")
        result = self.mh.pipeline(
            prompt=final_prompt,
            negative_prompt=negative_prompt,
            image=processed_image,
            control_image=[face_kps, depth_map, lineart_map],
            image_embeds=face_emb,
            generator=generator,

            strength=img2img_strength,
            guidance_scale=guidance_scale,  # <-- Will use non-zero value
            num_inference_steps=num_inference_steps,

            controlnet_conditioning_scale=controlnet_conditioning_scale,
            control_guidance_end=control_guidance_end,
            clip_skip=0,

            # --- TCD Specific Parameter ---
            eta=0.45,  # Gamma/Stochasticity
            # ------------------------------

        ).images[0]

        return result
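smart_crop_and_resize only touches the PIL image, so the bucket logic can be checked without loading any models. A quick CPU-only sketch (the input sizes are arbitrary examples):

from PIL import Image
from generator import Generator

gen = Generator(model_handler=None)  # predict() needs real models; this method does not
for size in [(1000, 1000), (720, 1280), (1920, 1080)]:
    out = gen.smart_crop_and_resize(Image.new("RGB", size))
    print(size, "->", out.size)  # expected: (1024, 1024), (832, 1216), (1216, 832)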
model.py
ADDED
@@ -0,0 +1,166 @@
import torch
import os
import cv2
import numpy as np
from config import Config

from diffusers import (
    ControlNetModel,
    TCDScheduler,
)
from diffusers.models.controlnets.multicontrolnet import MultiControlNetModel

# Import the custom pipeline from your local file
from pipeline_stable_diffusion_xl_instantid_img2img import StableDiffusionXLInstantIDImg2ImgPipeline

from huggingface_hub import snapshot_download, hf_hub_download
from insightface.app import FaceAnalysis
from controlnet_aux import LeresDetector, LineartAnimeDetector

class ModelHandler:
    def __init__(self):
        self.pipeline = None
        self.app = None  # InsightFace
        self.leres_detector = None
        self.lineart_anime_detector = None
        self.face_analysis_loaded = False

    def load_face_analysis(self):
        """
        Load face analysis model.
        Downloads from HF Hub to the path insightface expects.
        """
        print("Loading face analysis model...")

        model_path = os.path.join(Config.ANTELOPEV2_ROOT, "models", Config.ANTELOPEV2_NAME)

        if not os.path.exists(os.path.join(model_path, "scrfd_10g_bnkps.onnx")):
            print(f"Downloading AntelopeV2 models from {Config.ANTELOPEV2_REPO} to {model_path}...")
            try:
                snapshot_download(
                    repo_id=Config.ANTELOPEV2_REPO,
                    local_dir=model_path,  # Download to the correct expected path
                )
            except Exception as e:
                print(f" [ERROR] Failed to download AntelopeV2 models: {e}")
                return False

        try:
            self.app = FaceAnalysis(
                name=Config.ANTELOPEV2_NAME,
                root=Config.ANTELOPEV2_ROOT,
                providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
            )
            self.app.prepare(ctx_id=0, det_size=(640, 640))
            print(" [OK] Face analysis model loaded successfully.")
            return True

        except Exception as e:
            print(f" [WARNING] Face detection system failed to initialize: {e}")
            return False

    def load_models(self):
        # 1. Load Face Analysis
        self.face_analysis_loaded = self.load_face_analysis()

        # 2. Load ControlNets
        print("Loading ControlNets (InstantID, Zoe, LineArt)...")
        cn_instantid = ControlNetModel.from_pretrained(
            Config.INSTANTID_REPO,
            subfolder="ControlNetModel",
            torch_dtype=Config.DTYPE
        )
        cn_zoe = ControlNetModel.from_pretrained(Config.CN_ZOE_REPO, torch_dtype=Config.DTYPE)
        cn_lineart = ControlNetModel.from_pretrained(Config.CN_LINEART_REPO, torch_dtype=Config.DTYPE)

        print("Wrapping ControlNets in MultiControlNetModel...")
        controlnet_list = [cn_instantid, cn_zoe, cn_lineart]
        controlnet = MultiControlNetModel(controlnet_list)

        # 3. Load SDXL Pipeline (Now from 'reality.safetensors')
        print(f"Loading SDXL Pipeline ({Config.CHECKPOINT_FILENAME})...")

        checkpoint_local_path = os.path.join("./models", Config.CHECKPOINT_FILENAME)
        if not os.path.exists(checkpoint_local_path):
            print(f"Downloading checkpoint to {checkpoint_local_path}...")
            hf_hub_download(
                repo_id=Config.REPO_ID,
                filename=Config.CHECKPOINT_FILENAME,
                local_dir="./models",
                local_dir_use_symlinks=False
            )

        print(f"Loading pipeline from local file: {checkpoint_local_path}")
        self.pipeline = StableDiffusionXLInstantIDImg2ImgPipeline.from_single_file(
            checkpoint_local_path,
            controlnet=controlnet,
            torch_dtype=Config.DTYPE,
            use_safetensors=True
        )

        self.pipeline.to(Config.DEVICE)

        try:
            self.pipeline.enable_xformers_memory_efficient_attention()
            print(" [OK] xFormers memory efficient attention enabled.")
        except Exception as e:
            print(f" [WARNING] Failed to enable xFormers: {e}")

        # 4. Set TCD Scheduler (Sanitized Config)
        print("Configuring TCDScheduler...")
        self.pipeline.scheduler = TCDScheduler.from_config(self.pipeline.scheduler.config)
        print(" [OK] TCDScheduler loaded (Forced SDXL Defaults + Karras + Trailing).")

        # 5. Load Adapters
        print("Loading Adapters...")

        # 5b. Load and Fuse Style LoRA (retroart)
        print(f"Loading and Fusing Style LoRA ({Config.LORA_FILENAME})...")
        style_lora_path = os.path.join("./models", Config.LORA_FILENAME)
        if not os.path.exists(style_lora_path):
            hf_hub_download(
                repo_id=Config.REPO_ID,
                filename=Config.LORA_FILENAME,
                local_dir="./models",
                local_dir_use_symlinks=False
            )
        self.pipeline.load_lora_weights("./models", weight_name=Config.LORA_FILENAME)
        self.pipeline.fuse_lora(lora_scale=Config.LORA_STRENGTH)
        print(" [OK] Style LoRA fused.")

        # 5c. Load IP-Adapter (for InstantID) - *Must be loaded AFTER fusing*
        ip_adapter_filename = "ip-adapter.bin"
        ip_adapter_local_path = os.path.join("./models", ip_adapter_filename)
        if not os.path.exists(ip_adapter_local_path):
            hf_hub_download(
                repo_id=Config.INSTANTID_REPO,
                filename=ip_adapter_filename,
                local_dir="./models",
                local_dir_use_symlinks=False
            )
        self.pipeline.load_ip_adapter_instantid(ip_adapter_local_path)
        print(" [OK] IP-Adapter loaded.")

        # --- END FIX ---

        # 6. Load Preprocessors
        print("Loading Preprocessors (LeReS, LineArtAnime)...")
        self.leres_detector = LeresDetector.from_pretrained(Config.ANNOTATOR_REPO)
        self.lineart_anime_detector = LineartAnimeDetector.from_pretrained(Config.ANNOTATOR_REPO)

        print("--- All models loaded successfully ---")

    def get_face_info(self, image):
        """Extracts the largest face, returns insightface result object."""
        if not self.face_analysis_loaded:
            return None
        try:
            cv2_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            faces = self.app.get(cv2_img)
            if len(faces) == 0:
                return None
            faces = sorted(faces, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]), reverse=True)
            return faces[0]
        except Exception as e:
            print(f"Face embedding extraction failed: {e}")
            return None
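Face detection can be smoke-tested on its own before the full SDXL pipeline is assembled. A minimal sketch, assuming a local test photo named face.jpg (a hypothetical file); the dict-style fields mirror what get_face_info hands to generator.py:

from PIL import Image
from model import ModelHandler

handler = ModelHandler()
handler.face_analysis_loaded = handler.load_face_analysis()  # downloads AntelopeV2 on first run

info = handler.get_face_info(Image.open("face.jpg").convert("RGB"))
if info is None:
    print("No face detected (or face analysis unavailable).")
else:
    print("bbox:", info['bbox'])
    print("embedding shape:", info['embedding'].shape)  # 512-dim identity vector
    print("keypoints:", info['kps'])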
requirements.txt
ADDED
@@ -0,0 +1,14 @@
diffusers>=0.27.0
transformers
accelerate
peft
torch
opencv-python-headless
Pillow
insightface
onnxruntime
gradio>=4.0.0
controlnet_aux
huggingface_hub
mediapipe
timm
utils.py
ADDED
@@ -0,0 +1,77 @@
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from config import Config
import cv2
import numpy as np
import math

# Simple global caching for the captioner
captioner_processor = None
captioner_model = None

def resize_image_to_1mp(image):
    """Resizes image to approx 1MP (e.g., 1024x1024) preserving aspect ratio."""
    image = image.convert("RGB")
    w, h = image.size
    target_pixels = 1024 * 1024
    aspect_ratio = w / h

    # Calculate new dimensions
    new_h = int((target_pixels / aspect_ratio) ** 0.5)
    new_w = int(new_h * aspect_ratio)

    # Ensure divisibility by 48 for efficiency
    new_w = (new_w // 48) * 48
    new_h = (new_h // 48) * 48

    if new_w == 0 or new_h == 0:
        new_w, new_h = 1024, 1024  # Fallback

    return image.resize((new_w, new_h), Image.LANCZOS)

def get_caption(image):
    """Generates a caption for the image if one isn't provided."""
    global captioner_processor, captioner_model

    if captioner_model is None:
        print("Loading Captioner (BLIP)...")
        captioner_processor = BlipProcessor.from_pretrained(Config.CAPTIONER_REPO)
        captioner_model = BlipForConditionalGeneration.from_pretrained(Config.CAPTIONER_REPO).to(Config.DEVICE)

    inputs = captioner_processor(image, return_tensors="pt").to(Config.DEVICE)
    out = captioner_model.generate(**inputs)
    caption = captioner_processor.decode(out[0], skip_special_tokens=True)
    return caption

# --- ADDED: Function from your provided file ---
def draw_kps(image_pil, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]):
    stickwidth = 4
    limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]])
    kps = np.array(kps)

    w, h = image_pil.size
    out_img = np.zeros([h, w, 3])

    for i in range(len(limbSeq)):
        index = limbSeq[i]
        color = color_list[index[0]]

        x = kps[index][:, 0]
        y = kps[index][:, 1]
        length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))
        polygon = cv2.ellipse2Poly(
            (int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1
        )
        out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color)
    out_img = (out_img * 0.6).astype(np.uint8)

    for idx_kp, kp in enumerate(kps):
        color = color_list[idx_kp]
        x, y = kp
        out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1)

    out_img_pil = Image.fromarray(out_img.astype(np.uint8))
    return out_img_pil
# --- END ADDED ---
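draw_kps renders the five-point face layout that is fed to the InstantID ControlNet as its conditioning image. A short sketch with made-up coordinates to preview that image:

from PIL import Image
from utils import draw_kps

canvas = Image.new("RGB", (512, 512))
dummy_kps = [
    (180, 200),  # left eye
    (330, 200),  # right eye
    (256, 270),  # nose
    (200, 350),  # left mouth corner
    (310, 350),  # right mouth corner
]
draw_kps(canvas, dummy_kps).save("kps_preview.png")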