import torch
import spaces
import random
import os
import numpy as np
import gradio as gr
from PIL import Image
from diffusers import FluxControlNetPipeline, FluxControlNetModel
from diffusers.models import FluxMultiControlNetModel
from controlnet_aux import (
    CannyDetector,
    MidasDetector,
)
from huggingface_hub import login

USE_ZERO_GPU = os.environ.get("USE_ZERO_GPU", "0") == "1"

HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

MAX_SEED = np.iinfo(np.int32).max
MAX_SIZE = 1024

styles = [
    "3D Animation",
    "Maomu Ghibli",
]

device = "cuda" if torch.cuda.is_available() else "cpu"

base_model = "black-forest-labs/FLUX.1-dev"
controlnet_model_union = "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro"

controlnet_union = FluxControlNetModel.from_pretrained(controlnet_model_union, torch_dtype=torch.bfloat16)
controlnet = FluxMultiControlNetModel([controlnet_union])  # we always recommend loading via FluxMultiControlNetModel

pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
pipe.to(device)

# Load both style LoRAs up front so they can be switched per request.
pipe.unload_lora_weights()
pipe.load_lora_weights(
    "vzhizhi6611/OminiControlArt",
    weight_name="v0/3d_animation.safetensors",
    adapter_name="3d_animation",
)
pipe.load_lora_weights(
    "vzhizhi6611/OminiControlArt",
    weight_name="v0/maomu_ghibli.safetensors",
    adapter_name="maomu_ghibli",
)

# Preprocessors that produce the canny and depth control images.
canny_detector = CannyDetector()
midas_detector = MidasDetector.from_pretrained(
    "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
)
midas_detector = midas_detector.to(device)


def infer(
    input_image,
    prompt,
    style,
    num_inference_steps=24,
    guidance_scale=3.5,
    seed=42,
    randomize_seed=False,
    canny_weight=0.2,
    depth_weight=0.4,
    canny_detect=0.375,
    depth_detect=0.5,
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # Activate the LoRA adapter that matches the selected style.
    activate_adapter_name = {
        "3D Animation": "3d_animation",
        "Maomu Ghibli": "maomu_ghibli",
    }[style]
    pipe.set_adapters(activate_adapter_name)

    # Control mode indices of the Union ControlNet: 0 = canny, 2 = depth.
    control_mode_depth = 2
    control_mode_canny = 0

    # Resize so the longer side equals MAX_SIZE while keeping the aspect ratio.
    w, h = input_image.size
    factor = max(w, h) / MAX_SIZE
    width = int(w / factor)
    height = int(h / factor)
    input_image = input_image.resize((width, height), Image.LANCZOS)

    canny_image = canny_detector(input_image, detect_resolution=int(MAX_SIZE * canny_detect), image_resolution=MAX_SIZE)
    depth_image = midas_detector(input_image, detect_resolution=int(MAX_SIZE * depth_detect), image_resolution=MAX_SIZE)

    # Only pass a control image when its weight is non-zero.
    control_image = []
    control_mode = []
    controlnet_conditioning_scale = []
    if depth_weight > 0:
        control_mode.append(control_mode_depth)
        controlnet_conditioning_scale.append(depth_weight)
        control_image.append(depth_image)
    if canny_weight > 0:
        control_mode.append(control_mode_canny)
        controlnet_conditioning_scale.append(canny_weight)
        control_image.append(canny_image)

    result_image = pipe(
        prompt,
        control_image=control_image,
        control_mode=control_mode,
        width=width,
        height=height,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        generator=torch.Generator().manual_seed(seed),
    ).images[0]

    return result_image, canny_image, depth_image, seed


# Wrap inference for Hugging Face ZeroGPU Spaces when enabled.
if USE_ZERO_GPU:
    infer = spaces.GPU(infer, duration=30)


def create_demo() -> gr.Blocks:
    with gr.Blocks() as demo:
        cropper = gr.State()
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt", lines=1, value="3d animation style selfie")
                num_inference_steps = gr.Slider(minimum=1, maximum=100, value=24, step=1, label="Num Inference Steps")
                guidance_scale = gr.Slider(minimum=0, maximum=20, value=3.5, step=0.5, label="Guidance Scale")
                with gr.Accordion("Advanced Options", open=False):
                    canny_weight = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.01, label="Canny Weight")
                    depth_weight = gr.Slider(minimum=0, maximum=1, value=0.6, step=0.01, label="Depth Weight")
                    canny_detect = gr.Slider(minimum=0.1, maximum=1, value=0.375, step=0.025, label="Canny Detect")
                    depth_detect = gr.Slider(minimum=0.1, maximum=1, value=0.375, step=0.025, label="Depth Detect")
            with gr.Column():
                seed = gr.Number(label="Seed", value=42)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)
                style = gr.Dropdown(label="Style", choices=styles, value=styles[0])
                g_btn = gr.Button("Generate Image")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image", type="pil", interactive=True)
                canny_image = gr.Image(label="Canny Image", type="pil", interactive=False)
            with gr.Column():
                result_image = gr.Image(label="Result Image", type="pil", interactive=False)
                depth_image = gr.Image(label="Depth Image", type="pil", interactive=False)
                seed_output = gr.Number(label="Seed Output", interactive=False)

        g_btn.click(
            fn=infer,
            inputs=[
                input_image, prompt, style, num_inference_steps, guidance_scale,
                seed, randomize_seed, canny_weight, depth_weight, canny_detect, depth_detect,
            ],
            outputs=[result_image, canny_image, depth_image, seed_output],
        )

    return demo
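

# The original snippet ends at create_demo(); the entry point below is an
# assumed, minimal sketch for running the app locally and is not part of the
# source. launch() arguments are left at Gradio defaults.
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()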