---
license: openrail++
base_model: stabilityai/stable-diffusion-xl-base-1.0
tags:
- stable-diffusion-xl
- stable-diffusion-xl-diffusers
- text-to-image
- diffusers
- controlnet
inference: false
---
# SDXL-controlnet: Zoe-Depth
These are ControlNet weights trained on stabilityai/stable-diffusion-xl-base-1.0 with Zoe-Depth conditioning. [Zoe-Depth](https://github.com/isl-org/ZoeDepth) is an open-source
SOTA depth-estimation model that produces high-quality depth maps, which are well suited for conditioning.
You can find some example images below.
![images_0](./zoe-depth-example.png)
## Usage
First, make sure the required libraries are installed:
```bash
pip install accelerate transformers safetensors diffusers
```
Then set up the Zoe-Depth model:
```python
import torch
import matplotlib
import matplotlib.cm
import numpy as np
from PIL import Image  # colorize() below returns a PIL image
torch.hub.help("intel-isl/MiDaS", "DPT_BEiT_L_384", force_reload=True) # Triggers fresh download of MiDaS repo
model_zoe_n = torch.hub.load("isl-org/ZoeDepth", "ZoeD_NK", pretrained=True).eval()
model_zoe_n = model_zoe_n.to("cuda")
def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None):
    if isinstance(value, torch.Tensor):
        value = value.detach().cpu().numpy()

    value = value.squeeze()
    if invalid_mask is None:
        invalid_mask = value == invalid_val
    mask = np.logical_not(invalid_mask)

    # normalize to the 2nd..85th percentile of the valid values
    vmin = np.percentile(value[mask], 2) if vmin is None else vmin
    vmax = np.percentile(value[mask], 85) if vmax is None else vmax
    if vmin != vmax:
        value = (value - vmin) / (vmax - vmin)  # vmin..vmax -> 0..1
    else:
        # Avoid 0-division
        value = value * 0.

    # grey out the invalid values
    value[invalid_mask] = np.nan
    cmapper = matplotlib.colormaps[cmap]  # matplotlib.cm.get_cmap() is removed in matplotlib >= 3.9
    if value_transform:
        value = value_transform(value)
    value = cmapper(value, bytes=True)  # (h, w, 4) uint8 RGBA

    img = value
    img[invalid_mask] = background_color

    # gamma correction
    img = img / 255
    img = np.power(img, 2.2)
    img = img * 255
    img = img.astype(np.uint8)
    img = Image.fromarray(img)
    return img
def get_zoe_depth_map(image):
    with torch.autocast("cuda", enabled=True):
        depth = model_zoe_n.infer_pil(image)
    depth = colorize(depth, cmap="gray_r")
    return depth
```
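As an optional sanity check before wiring everything into the pipeline, you can run the helper on the example image and inspect the colorized depth map (an illustrative sketch; any RGB image and output filename work):

```python
from diffusers.utils import load_image

# Reuses the example image from the inference snippet below; any RGB image works.
test_image = load_image("https://media.vogue.fr/photos/62bf04b69a57673c725432f3/3:2/w_1793,h_1195,c_limit/rev-1-Barbie-InstaVert_High_Res_JPEG.jpeg")
depth_preview = get_zoe_depth_map(test_image)
depth_preview.save("depth-preview.png")  # inspect the grayscale-style depth map
```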
Now we're ready to go:
```python
import torch
import numpy as np
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-zoe-depth-sdxl-1.0",
    use_safetensors=True,
    torch_dtype=torch.float16,
)
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
)
pipe.enable_model_cpu_offload()  # streams submodules to the GPU on demand, so no explicit .to("cuda") is needed
prompt = "pixel-art margot robbie as barbie, in a coupé . low-res, blocky, pixel art style, 8-bit graphics"
negative_prompt = "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic"
image = load_image("https://media.vogue.fr/photos/62bf04b69a57673c725432f3/3:2/w_1793,h_1195,c_limit/rev-1-Barbie-InstaVert_High_Res_JPEG.jpeg")
controlnet_conditioning_scale = 0.55  # higher values follow the depth map more strictly, lower values give the prompt more freedom

depth_image = get_zoe_depth_map(image).resize((1088, 896))

generator = torch.Generator("cuda").manual_seed(978364352)
images = pipe(
    prompt,
    negative_prompt=negative_prompt,
    image=depth_image,
    num_inference_steps=50,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=generator,
).images
images[0].save("pixel-barbie.png")
```
![images_1](./barbie.png)
For more details, check out the official documentation of [`StableDiffusionXLControlNetPipeline`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_sdxl).
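If the result follows the depth map too rigidly or too loosely, `controlnet_conditioning_scale` is the first knob to try. A minimal sweep, reusing the objects from the snippet above (the scale values are arbitrary examples, not recommendations):

```python
# A fixed seed per run isolates the effect of the conditioning scale.
for scale in (0.3, 0.55, 0.8):
    result = pipe(
        prompt,
        negative_prompt=negative_prompt,
        image=depth_image,
        num_inference_steps=50,
        controlnet_conditioning_scale=scale,
        generator=torch.Generator("cuda").manual_seed(978364352),
    ).images[0]
    result.save(f"pixel-barbie-scale-{scale}.png")
```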
### Training
Our training script was built on top of the official training script that we provide [here](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/README_sdxl.md).
#### Training data and Compute
The model was trained on 3M image-text pairs from LAION-Aesthetics V2, for a total of 700 GPU-hours on 80GB A100 GPUs.
#### Batch size
Data parallelism with a per-GPU batch size of 8, for a total batch size of 256 (i.e., 32 GPUs).
#### Hyperparameters
Constant learning rate of 1e-5.
#### Mixed precision
fp16
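For reference, here is a hedged sketch of how these settings map onto the flags of the official `train_controlnet_sdxl.py` script linked above. The exact command, dataset plumbing, and multi-node launch used for this checkpoint are not published, so every flag below is illustrative:

```bash
# Illustrative only: mirrors the hyperparameters above, not the exact command used.
accelerate launch train_controlnet_sdxl.py \
  --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
  --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
  --output_dir="controlnet-zoe-depth-sdxl" \
  --dataset_name=<your-depth-conditioned-dataset> \
  --resolution=1024 \
  --train_batch_size=8 \
  --learning_rate=1e-5 \
  --lr_scheduler="constant" \
  --mixed_precision="fp16"
```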