Spaces:

CrucibleAI
/

ControlNetMediaPipeFaceSD21

Runtime error

App Files Files Community

Migrate to Diffusers

by radames - opened Apr 4, 2023

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+179

-90

Files changed (9) hide show

.gitignore +3 -0
app.py +171 -89
examples/.gitattributes +2 -0
examples/image0.jpg +0 -0
examples/image1.jpg +0 -0
examples/pedro-512.jpg +0 -0
examples/two.jpeg +0 -0
examples/two2.jpeg +0 -0
requirements.txt +3 -1

.gitignore CHANGED Viewed

	@@ -1 +1,4 @@
1	.idea

 .idea
+__pycache__/
+venv/
+gradio_cached_examples/

app.py CHANGED Viewed

@@ -1,118 +1,200 @@
-import os
 import random
-from typing import Mapping
 import gradio as gr
-import numpy
 import torch
-from huggingface_hub import hf_hub_download
 from PIL import Image
-from cldm.model import create_model, load_state_dict
-from cldm.ddim_hacked import DDIMSampler
 from mediapipe_face_common import generate_annotation
 # Download the SD 2.1 face ControlNet model from HF
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model_path = hf_hub_download(repo_id="CrucibleAI/ControlNetMediaPipeFace", filename="models/controlnet_sd21_laion_face_v2_full.ckpt", repo_type="model", revision="568dc2c9980572262d48cff1ef2a7e4a03fadeb6")
-config_path = hf_hub_download(repo_id="CrucibleAI/ControlNetMediaPipeFace", filename="models/cldm_v21.yaml", repo_type="model", revision="568dc2c9980572262d48cff1ef2a7e4a03fadeb6")
-model = create_model(config_path).cpu()
-model.load_state_dict(load_state_dict(model_path, location=device))
 model = model.to(device)
-ddim_sampler = DDIMSampler(model)  # ControlNet _only_ works with DDIM.
-def process(input_image: Image.Image, prompt, a_prompt, n_prompt, max_faces: int, min_confidence: float, num_samples, ddim_steps, guess_mode, strength, scale, seed: int, eta):
-    with torch.no_grad():
-        # Scale to 512x512.
-        img_size = input_image.size
-        scale_factor = 512/min(img_size)
-        input_image = input_image.resize((1+int(img_size[0]*scale_factor), 1+int(img_size[1]*scale_factor)))
-        img_size = input_image.size
-        left_padding = (img_size[0] - 512)//2
-        top_padding = (img_size[1] - 512)//2
-        input_image = input_image.crop((left_padding, top_padding, left_padding+512, top_padding+512))
-        # Generate annotation
-        input_image = numpy.asarray(input_image)
-        empty = generate_annotation(input_image, max_faces, min_confidence)
-        visualization = Image.fromarray(empty)  # Save to help debug.
-        # Prep for network:
-        empty = numpy.moveaxis(empty, 2, 0)  # h, w, c -> c, h, w
-        control = torch.from_numpy(empty.copy()).float().to(device) / 255.0
-        control = torch.stack([control for _ in range(num_samples)], dim=0)
-        # control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        # Sanity check the dimensions.
-        B, C, H, W = control.shape
-        assert C == 3
-        assert B == num_samples
-        if seed != -1:
-            random.seed(seed)
-            os.environ['PYTHONHASHSEED'] = str(seed)
-            numpy.random.seed(seed)
-            torch.manual_seed(seed)
-            torch.cuda.manual_seed(seed)
-            torch.backends.cudnn.deterministic = True
-        # model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
-        un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
-        shape = (4, H // 8, W // 8)
-        # model.low_vram_shift(is_diffusing=True)
-        model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = ddim_sampler.sample(
-            ddim_steps,
-            num_samples,
-            shape,
-            cond,
-            verbose=False,
-            eta=eta,
-            unconditional_guidance_scale=scale,
-            unconditional_conditioning=un_cond
-        )
-        # model.low_vram_shift(is_diffusing=False)
-        x_samples = model.decode_first_stage(samples)
-        # x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(numpy.uint8)
-        x_samples = numpy.moveaxis((x_samples * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(numpy.uint8), 1, -1)  # b, c, h, w -> b, h, w, c
-        results = [visualization] + [x_samples[i] for i in range(num_samples)]
-    return results
 block = gr.Blocks().queue()
 with block:
     with gr.Row():
         gr.Markdown("## Control Stable Diffusion with a Facial Pose")
     with gr.Row():
         with gr.Column():
-            input_image = gr.Image(source='upload', type="pil")
             prompt = gr.Textbox(label="Prompt")
             run_button = gr.Button(label="Run")
             with gr.Accordion("Advanced options", open=False):
-                num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
-                max_faces = gr.Slider(label="Max Faces", minimum=1, maximum=10, value=5, step=1)
-                min_confidence = gr.Slider(label="Min Confidence", minimum=0.01, maximum=1.0, value=0.5, step=0.01)
-                strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
                 guess_mode = gr.Checkbox(label='Guess Mode', value=False)
-                ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
-                scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
-                seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
                 eta = gr.Number(label="eta (DDIM)", value=0.0)
-                a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
                 n_prompt = gr.Textbox(label="Negative Prompt",
                                       value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
         with gr.Column():
-            result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
-    ips = [input_image, prompt, a_prompt, n_prompt, max_faces, min_confidence, num_samples, ddim_steps, guess_mode, strength, scale, seed, eta]
-    run_button.click(fn=process, inputs=ips, outputs=[result_gallery])
 block.launch(server_name='0.0.0.0')

 import random
 import gradio as gr
 import torch
+from diffusers.utils import load_image
 from PIL import Image
+import numpy as np
+import base64
+from io import BytesIO
 from mediapipe_face_common import generate_annotation
+from diffusers import (
+    ControlNetModel,
+    StableDiffusionControlNetPipeline,
+)
 # Download the SD 2.1 face ControlNet model from HF
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Use half precision only on CUDA; the CPU fallback loads full precision.
+dtype = torch.float16 if device.type == "cuda" else torch.float32
+controlnet = ControlNetModel.from_pretrained(
+    "CrucibleAI/ControlNetMediaPipeFace", torch_dtype=dtype, variant="fp16")
+model = StableDiffusionControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=dtype
+)
 model = model.to(device)
+# Model CPU offload only makes sense with a GPU to offload from, so it is
+# skipped when running on the CPU fallback.
+if device.type == "cuda":
+    model.enable_model_cpu_offload()
+# Markup for the <face-canvas> custom element; toggle() places it in the
+# HTML component when the "webcam" source is selected.
+canvas_html = "<face-canvas id='canvas-root' data-mode='crucibleAI' style='display:flex;max-width: 500px;margin: 0 auto;'></face-canvas>"
+# Page-load script (passed to block.load): fetches face-canvas.js as text and
+# injects it as a module <script> through a Blob URL.
+load_js = """
+async () => {
+const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/face-canvas.js"
+fetch(url)
+  .then(res => res.text())
+  .then(text => {
+    const script = document.createElement('script');
+    script.type = "module"
+    script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
+    document.head.appendChild(script);
+  });
+}
+"""
+# Click-time script (passed to run_button.click): forwards every input
+# unchanged except the last one, which is replaced by the `_data` property of
+# the #canvas-root element, or null when that element is absent.
+get_js_image = """
+async (input_image, prompt, a_prompt, n_prompt, max_faces, min_confidence, num_samples, ddim_steps, guess_mode, strength, scale, seed, eta, image_file_live_opt, live_conditioning) => {
+  const canvasEl = document.getElementById("canvas-root");
+  const imageData = canvasEl? canvasEl._data : null;
+  return [input_image, prompt, a_prompt, n_prompt, max_faces, min_confidence, num_samples, ddim_steps, guess_mode, strength, scale, seed, eta, image_file_live_opt, imageData];
+}
+"""
+def pad_image(input_image):
+    """Pad an image to a multiple of 64 pixels per side, then square it.
+
+    The right and bottom edges are first extended by edge replication until
+    each dimension is a multiple of 64 (and at least 128). If the result is
+    not square, it is pasted onto a black square canvas whose side equals
+    the longer dimension, centred along the shorter axis.
+
+    The pad specification has three axes, so the image is expected to
+    convert to a height x width x channels array.
+    """
+    size = np.array(input_image.size)
+    # Number of 64-pixel blocks needed per axis, never fewer than two.
+    blocks = np.maximum(np.ceil(size / 64).astype(int), 2)
+    pad_w, pad_h = blocks * 64 - size
+    pixels = np.pad(np.array(input_image),
+                    ((0, pad_h), (0, pad_w), (0, 0)), mode='edge')
+    im_padded = Image.fromarray(pixels)
+    w, h = im_padded.size
+    if w == h:
+        return im_padded
+    side = max(w, h)
+    # Centre along the shorter axis; the other offset stays at zero.
+    offset = (0, (w - h) // 2) if w > h else ((h - w) // 2, 0)
+    square = Image.new(im_padded.mode, (side, side), (0, 0, 0))
+    square.paste(im_padded, offset)
+    return square
+def process(input_image: Image.Image, prompt, a_prompt, n_prompt, max_faces: int, min_confidence: float, num_samples, ddim_steps, guess_mode, strength, scale, seed: int, eta, image_file_live_opt="file", live_conditioning=None):
+    """Run the ControlNet pipeline on a face annotation and return the images.
+
+    The conditioning image comes from one of two sources, selected by
+    ``image_file_live_opt``:
+
+    * ``"file"``: ``input_image`` is converted to RGB, annotated with
+      ``generate_annotation``, padded to a square and resized to 512x512.
+    * ``"webcam"``: the base64 payload after the first comma of
+      ``live_conditioning["image"]`` is decoded and resized to 512x512.
+
+    ``guess_mode`` is accepted to keep the positional input order but is not
+    used. A ``seed`` of -1 picks a random seed.
+
+    Returns a list whose first item is the conditioning image, followed by
+    the generated images.
+
+    Raises ``gr.Error`` when the selected source has no image, when the
+    source name is unknown, or when annotation or generation fails.
+    """
+    # live_conditioning is None by default and whenever the click-time script
+    # finds no canvas element, so it cannot be searched with `in` directly.
+    live_image = live_conditioning.get('image') if isinstance(
+        live_conditioning, dict) else None
+    # Validate against the source that will actually be read below.
+    if image_file_live_opt == 'file':
+        if input_image is None:
+            raise gr.Error("Please provide an image")
+    elif image_file_live_opt == 'webcam':
+        if not live_image:
+            raise gr.Error("Please provide an image")
+    else:
+        raise gr.Error(f"Unknown image source: {image_file_live_opt}")
+    try:
+        if image_file_live_opt == 'file':
+            input_image = input_image.convert('RGB')
+            empty = generate_annotation(
+                np.array(input_image), max_faces, min_confidence)
+            visualization = Image.fromarray(empty)  # Save to help debug.
+            visualization = pad_image(visualization).resize((512, 512))
+        else:
+            image_data = base64.b64decode(live_image.split(',')[1])
+            visualization = Image.open(BytesIO(image_data)).convert(
+                'RGB').resize((512, 512))
+        if seed == -1:
+            seed = random.randint(0, 2147483647)
+        generator = torch.Generator(device).manual_seed(seed)
+        output = model(prompt=prompt + ' ' + a_prompt,
+                       negative_prompt=n_prompt,
+                       image=visualization,
+                       generator=generator,
+                       num_images_per_prompt=num_samples,
+                       num_inference_steps=ddim_steps,
+                       controlnet_conditioning_scale=strength,
+                       guidance_scale=scale,
+                       eta=eta,
+                       )
+        results = [visualization] + output.images
+        return results
+    except Exception as e:
+        # Surface the failure in the UI while keeping the original traceback.
+        raise gr.Error(str(e)) from e
+# switch between file upload and webcam
+def toggle(choice):
+    """Swap the visible image source when the radio selection changes.
+
+    Returns a pair of updates, the first for the upload image and the second
+    for the canvas HTML component. Selecting "file" shows the upload widget
+    and hides the canvas; selecting "webcam" does the reverse and fills the
+    canvas with ``canvas_html``. The upload value is cleared on every switch.
+    Any other choice yields ``None``.
+    """
+    if choice == "file":
+        image_update = gr.update(visible=True, value=None)
+        canvas_update = gr.update(visible=False, value=None)
+        return image_update, canvas_update
+    if choice == "webcam":
+        image_update = gr.update(visible=False, value=None)
+        canvas_update = gr.update(visible=True, value=canvas_html)
+        return image_update, canvas_update
+    return None
+# Build the Gradio UI: inputs in the left column, result gallery in the right.
 block = gr.Blocks().queue()
 with block:
+    # hidden JSON component to store live conditioning
+    live_conditioning = gr.JSON(value={}, visible=False)
     with gr.Row():
         gr.Markdown("## Control Stable Diffusion with a Facial Pose")
     with gr.Row():
         with gr.Column():
+            image_file_live_opt = gr.Radio(["file", "webcam"], value="file",
+                                           label="How would you like to upload your image?")
+            input_image = gr.Image(source="upload", visible=True, type="pil")
+            canvas = gr.HTML(None, elem_id="canvas_html", visible=False)
+            # toggle() returns one update per output, in this order.
+            image_file_live_opt.change(fn=toggle,
+                                       inputs=[image_file_live_opt],
+                                       outputs=[input_image, canvas],
+                                       queue=False)
             prompt = gr.Textbox(label="Prompt")
             run_button = gr.Button(label="Run")
             with gr.Accordion("Advanced options", open=False):
+                num_samples = gr.Slider(
+                    label="Images", minimum=1, maximum=4, value=1, step=1)
+                max_faces = gr.Slider(
+                    label="Max Faces", minimum=1, maximum=10, value=5, step=1)
+                min_confidence = gr.Slider(
+                    label="Min Confidence", minimum=0.01, maximum=1.0, value=0.5, step=0.01)
+                strength = gr.Slider(
+                    label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
                 guess_mode = gr.Checkbox(label='Guess Mode', value=False)
+                ddim_steps = gr.Slider(
+                    label="Steps", minimum=1, maximum=100, value=20, step=1)
+                scale = gr.Slider(label="Guidance Scale",
+                                  minimum=0.1, maximum=30.0, value=9.0, step=0.1)
+                seed = gr.Slider(label="Seed", minimum=-1,
+                                 maximum=2147483647, step=1, randomize=True)
                 eta = gr.Number(label="eta (DDIM)", value=0.0)
+                a_prompt = gr.Textbox(
+                    label="Added Prompt", value='best quality, extremely detailed')
                 n_prompt = gr.Textbox(label="Negative Prompt",
                                       value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
         with gr.Column():
+            result_gallery = gr.Gallery(
+                label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
+    # Inputs shared by the Run button and the examples; the order matches the
+    # first thirteen parameters of process().
+    ips = [input_image, prompt, a_prompt, n_prompt, max_faces, min_confidence,
+           num_samples, ddim_steps, guess_mode, strength, scale, seed, eta]
+    # get_js_image is applied to the inputs in the browser and replaces the
+    # live_conditioning argument with the canvas data.
+    run_button.click(fn=process, inputs=ips + [image_file_live_opt, live_conditioning],
+                     outputs=[result_gallery],
+                     _js=get_js_image)
+    # load js for live conditioning
+    block.load(None, None, None, _js=load_js)
+    # Examples supply only `ips`, so process() runs with its defaults for
+    # image_file_live_opt ("file") and live_conditioning (None).
+    # Each row lists one value per entry of `ips`, in the same order.
+    gr.Examples(fn=process,
+                examples=[
+                    ["./examples/two2.jpeg",
+                        "Highly detailed photograph of two clowns",
+                        "best quality, extremely detailed",
+                        "cartoon, disfigured, bad art, deformed, poorly drawn, extra limbs, weird colors, blurry, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
+                        10, 0.4, 3, 20, False, 1.0, 9.0, -1, 0.0],
+                    ["./examples/two.jpeg",
+                        "a photo of two silly men",
+                        "best quality, extremely detailed",
+                        "cartoon, disfigured, bad art, deformed, poorly drawn, extra limbs, weird colors, blurry, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
+                        10, 0.4, 3, 20, False, 1.0, 9.0, -1, 0.0],
+                    ["./examples/pedro-512.jpg",
+                        "Highly detailed photograph of young woman smiling, with palm trees in the background",
+                        "best quality, extremely detailed",
+                        "cartoon, disfigured, bad art, deformed, poorly drawn, extra limbs, weird colors, blurry, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
+                        10, 0.4, 3, 20, False, 1.0, 9.0, -1, 0.0],
+                    ["./examples/image1.jpg",
+                        "Highly detailed photograph of a scary clown",
+                        "best quality, extremely detailed",
+                        "cartoon, disfigured, bad art, deformed, poorly drawn, extra limbs, weird colors, blurry, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
+                        10, 0.4, 3, 20, False, 1.0, 9.0, -1, 0.0],
+                    ["./examples/image0.jpg",
+                        "Highly detailed photograph of Madonna",
+                        "best quality, extremely detailed",
+                        "cartoon, disfigured, bad art, deformed, poorly drawn, extra limbs, weird colors, blurry, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
+                        10, 0.4, 3, 20, False, 1.0, 9.0, -1, 0.0],
+                ],
+                inputs=ips,
+                outputs=[result_gallery],
+                cache_examples=True)
 block.launch(server_name='0.0.0.0')

examples/.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.jpeg filter=lfs diff=lfs merge=lfs -text
2	+ *.jpg filter=lfs diff=lfs merge=lfs -text

examples/image0.jpg ADDED Viewed

examples/image1.jpg ADDED Viewed

examples/pedro-512.jpg ADDED Viewed

examples/two.jpeg ADDED Viewed

examples/two2.jpeg ADDED Viewed

requirements.txt CHANGED Viewed

@@ -11,4 +11,6 @@ timm
 transformers==4.26.1
 torch==1.13.1
 torchvision==0.14.1
-tqdm==4.64.1

 transformers==4.26.1
 torch==1.13.1
 torchvision==0.14.1
+tqdm==4.64.1
+accelerate
+diffusers