File size: 5,293 Bytes
d052fbf
 
 
 
 
 
 
 
11a062d
 
d052fbf
381fb09
d052fbf
 
381fb09
 
a336474
381fb09
 
d052fbf
 
 
 
 
381fb09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb723f4
 
381fb09
eb723f4
 
381fb09
 
 
eb723f4
381fb09
 
eb723f4
d052fbf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
---
language:
- en
tags:
- reco
- text-to-image
- layout-to-image
pipeline_tag: text-to-image
widget:
  - text: "A box contains six donuts with varying types of glazes and toppings. <|endoftext|> <bin514> <bin575> <bin741> <bin765> <|startoftext|> chocolate donut. <|endoftext|> <bin237> <bin517> <bin520> <bin784> <|startoftext|> dark vanilla donut. <|endoftext|> <bin763> <bin575> <bin988> <bin745> <|startoftext|> donut with sprinkles. <|endoftext|> <bin234> <bin281> <bin524> <bin527> <|startoftext|> donut with powdered sugar. <|endoftext|> <bin515> <bin259> <bin767> <bin514> <|startoftext|> pink donut. <|endoftext|> <bin753> <bin289> <bin958> <bin506> <|startoftext|> brown donut. <|endoftext|>"
---


# Diffusers 🧨 port of [ReCo: Region-Controlled Text-to-Image Generation (CVPR 2023)](https://arxiv.org/abs/2211.15518)

- Original authors: Zhengyuan Yang, Jianfeng Wang, Zhe Gan, Linjie Li, Kevin Lin, Chenfei Wu, Nan Duan, Zicheng Liu, Ce Liu, Michael Zeng, Lijuan Wang
- Original GitHub repo by authors: https://github.com/microsoft/ReCo
- Converted to Diffusers: Jaemin Cho


# COCO checkpoint
- Original PyTorch Lightning checkpoint: https://unitab.blob.core.windows.net/data/reco/reco_coco_616.ckpt
- Original configuration YAML: https://github.com/microsoft/ReCo/blob/main/configs/reco/v1-finetune_cocogit.yaml

# Example Usage

```python
import torch
from diffusers import StableDiffusionPipeline

# Load the converted checkpoint "j-min/reco_sd14_coco", requesting float16 weights.
pipe = StableDiffusionPipeline.from_pretrained(
    "j-min/reco_sd14_coco",
    torch_dtype=torch.float16
)
# Move the pipeline to the CUDA device.
pipe = pipe.to("cuda")


# ReCo prompt: a global caption followed by one group per region, each made of
# four <binNNN> coordinate tokens and a regional caption wrapped in
# <|startoftext|> ... <|endoftext|>.
prompt = "A box contains six donuts with varying types of glazes and toppings. <|endoftext|> <bin514> <bin575> <bin741> <bin765> <|startoftext|> chocolate donut. <|endoftext|> <bin237> <bin517> <bin520> <bin784> <|startoftext|> dark vanilla donut. <|endoftext|> <bin763> <bin575> <bin988> <bin745> <|startoftext|> donut with sprinkles. <|endoftext|> <bin234> <bin281> <bin524> <bin527> <|startoftext|> donut with powdered sugar. <|endoftext|> <bin515> <bin259> <bin767> <bin514> <|startoftext|> pink donut. <|endoftext|> <bin753> <bin289> <bin958> <bin506> <|startoftext|> brown donut. <|endoftext|>"
# Run the pipeline with guidance_scale=4 and keep the first returned image.
generated_image = pipe(
    prompt,
    guidance_scale=4).images[0]
# Bare expression: displays the image when run as the last line of a notebook cell.
generated_image
```

## Method to create ReCo prompts

```python
def create_reco_prompt(
    caption: str = '',
    phrases=None,
    boxes=None,
    normalize_boxes=True,
    image_resolution=512,
    num_bins=1000,
    ):
    """
    Build a ReCo prompt from a global caption and per-region captions.

    The prompt is the global caption followed by '<|endoftext|>', then, for
    every (phrase, box) pair, four '<binNNN>' coordinate tokens (x0, y0, x1,
    y1) followed by the phrase wrapped in '<|startoftext|>' / '<|endoftext|>'.
    All pieces are joined with single spaces.

    caption: global caption
    phrases: list of regional captions (None is treated as an empty list)
    boxes: list of regional coordinates in xyxy order, one per phrase
        (None is treated as an empty list). Pixel coordinates when
        normalize_boxes is True, already-normalized [0, 1] values otherwise.
    normalize_boxes: if True, divide every coordinate by image_resolution
        before quantization
    image_resolution: divisor used for normalization (the same value is
        applied to x and y coordinates)
    num_bins: number of quantization bins; bin indices run from 0 to
        num_bins - 1

    Returns the prompt as a single string.

    phrases and boxes are paired with zip(), so surplus entries in the longer
    of the two are ignored. A box with fewer than four coordinates raises
    IndexError; coordinates beyond the fourth are ignored.
    """

    SOS_token = '<|startoftext|>'
    EOS_token = '<|endoftext|>'

    # None stands in for "no regions"; avoids mutable default arguments.
    if phrases is None:
        phrases = []
    if boxes is None:
        boxes = []

    max_bin = num_bins - 1

    def _quantize(coord):
        # Map a normalized coordinate to a bin index. Clamp so that boxes
        # reaching outside the image cannot yield indices such as -20 or
        # 1171, which lie outside the 0..num_bins-1 bin range.
        return min(max(int(round(coord * max_bin)), 0), max_bin)

    box_captions_with_coords = [caption, EOS_token]

    for phrase, box in zip(phrases, boxes):

        if normalize_boxes:
            box = [float(x) / image_resolution for x in box]

        # quantize into bins (explicit indexing keeps the IndexError on short boxes)
        quant_x0 = _quantize(box[0])
        quant_y0 = _quantize(box[1])
        quant_x1 = _quantize(box[2])
        quant_y1 = _quantize(box[3])

        # ReCo format
        # Add SOS/EOS before/after regional captions
        box_captions_with_coords += [
            f"<bin{str(quant_x0).zfill(3)}>",
            f"<bin{str(quant_y0).zfill(3)}>",
            f"<bin{str(quant_x1).zfill(3)}>",
            f"<bin{str(quant_y1).zfill(3)}>",
            SOS_token,
            phrase,
            EOS_token,
        ]

    text = " ".join(box_captions_with_coords)
    return text
        
# Example 1: box coordinates are already in the [0, 1] range, so normalization is
# turned off. The line starting with ">>>" shows the resulting prompt string.
caption = "a photo of bus and boat; boat is left to bus."
phrases = ["a photo of a bus.", "a photo of a boat."]
boxes =  [[0.702, 0.404, 0.927, 0.601], [0.154, 0.383, 0.311, 0.487]]
prompt = create_reco_prompt(caption, phrases, boxes, normalize_boxes=False)
prompt
>>> 'a photo of bus and boat; boat is left to bus. <|endoftext|> <bin701> <bin404> <bin926> <bin600> <|startoftext|> a photo of a bus. <|endoftext|> <bin154> <bin383> <bin311> <bin487> <|startoftext|> a photo of a boat. <|endoftext|>'


# Example 2: box coordinates are given in pixels; the default normalize_boxes=True
# divides them by the default image_resolution (512) before quantization.
caption = "A box contains six donuts with varying types of glazes and toppings."
phrases = ["chocolate donut.", "dark vanilla donut.", "donut with sprinkles.", "donut with powdered sugar.", "pink donut.", "brown donut."]
boxes = [[263.68, 294.912, 380.544, 392.832], [121.344, 265.216, 267.392, 401.92], [391.168, 294.912, 506.368, 381.952], [120.064, 143.872, 268.8, 270.336], [264.192, 132.928, 393.216, 263.68], [386.048, 148.48, 490.688, 259.584]]
prompt = create_reco_prompt(caption, phrases, boxes)
prompt
>>> 'A box contains six donuts with varying types of glazes and toppings. <|endoftext|> <bin514> <bin575> <bin743> <bin766> <|startoftext|> chocolate donut. <|endoftext|> <bin237> <bin517> <bin522> <bin784> <|startoftext|> dark vanilla donut. <|endoftext|> <bin763> <bin575> <bin988> <bin745> <|startoftext|> donut with sprinkles. <|endoftext|> <bin234> <bin281> <bin524> <bin527> <|startoftext|> donut with powdered sugar. <|endoftext|> <bin515> <bin259> <bin767> <bin514> <|startoftext|> pink donut. <|endoftext|> <bin753> <bin290> <bin957> <bin506> <|startoftext|> brown donut. <|endoftext|>'
```