ShuangLI59 committed on
Commit d4fe1e6 • 1 Parent(s): 2c3dd7c
Files changed (4)
  1. README.md +3 -3
  2. app.py +200 -0
  3. requirements.txt +2 -0
  4. setup.py +29 -0
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Composable Diffusion
- emoji: 🐨
- colorFrom: green
- colorTo: gray
+ emoji: 🐠
+ colorFrom: gray
+ colorTo: yellow
  sdk: gradio
  sdk_version: 3.0.12
  app_file: app.py
app.py ADDED
@@ -0,0 +1,200 @@
+ # -*- coding: utf-8 -*-
+ """Copy of compose_glide.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
+ """
+
+ import numpy as np
+ from PIL import Image
+ from IPython.display import display
+ import torch as th
+
+ from glide_text2im.download import load_checkpoint
+ from glide_text2im.model_creation import (
+     create_model_and_diffusion,
+     model_and_diffusion_defaults,
+     model_and_diffusion_defaults_upsampler
+ )
+
+ # This notebook supports both CPU and GPU.
+ # On CPU, generating one sample may take on the order of 20 minutes.
+ # On a GPU, it should be under a minute.
+
+ has_cuda = th.cuda.is_available()
+ device = th.device('cpu' if not has_cuda else 'cuda')
+
+ # Create base model.
+ timestep_respacing = 100  #@param{type: 'number'}
+ options = model_and_diffusion_defaults()
+ options['use_fp16'] = has_cuda
+ options['timestep_respacing'] = str(timestep_respacing)  # use 100 diffusion steps for fast sampling
+ model, diffusion = create_model_and_diffusion(**options)
+ model.eval()
+ if has_cuda:
+     model.convert_to_fp16()
+ model.to(device)
+ model.load_state_dict(load_checkpoint('base', device))
+ print('total base parameters', sum(x.numel() for x in model.parameters()))
+
+ # Create upsampler model.
+ options_up = model_and_diffusion_defaults_upsampler()
+ options_up['use_fp16'] = has_cuda
+ options_up['timestep_respacing'] = 'fast27'  # use 27 diffusion steps for very fast sampling
+ model_up, diffusion_up = create_model_and_diffusion(**options_up)
+ model_up.eval()
+ if has_cuda:
+     model_up.convert_to_fp16()
+ model_up.to(device)
+ model_up.load_state_dict(load_checkpoint('upsample', device))
+ print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
+
+ def show_images(batch: th.Tensor):
+     """Display a batch of images inline."""
+     scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
+     reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
+     display(Image.fromarray(reshaped.numpy()))
+
+ def compose_language_descriptions(prompt):
+     #@markdown `prompt`: when composing multiple sentences, use `|` as the delimiter.
+     prompts = [x.strip() for x in prompt.split('|')]
+
+     batch_size = 1
+     guidance_scale = 10  #@param{type: 'number'}
+     # Tune this parameter to control the sharpness of 256x256 images.
+     # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
+     upsample_temp = 0.980  #@param{type: 'number'}
+
+
+
+     masks = [True] * len(prompts) + [False]
+     # coefficients = th.tensor([0.5, 0.5], device=device).reshape(-1, 1, 1, 1)
+     masks = th.tensor(masks, dtype=th.bool, device=device)
+     # sampling function
+     def model_fn(x_t, ts, **kwargs):
+         half = x_t[:1]
+         combined = th.cat([half] * x_t.size(0), dim=0)
+         model_out = model(combined, ts, **kwargs)
+         eps, rest = model_out[:, :3], model_out[:, 3:]
+         cond_eps = eps[masks].mean(dim=0, keepdim=True)
+         # cond_eps = (coefficients * eps[masks]).sum(dim=0)[None]
+         uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
+         half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
+         eps = th.cat([half_eps] * x_t.size(0), dim=0)
+         return th.cat([eps, rest], dim=1)
+
+
+     ##############################
+     # Sample from the base model #
+     ##############################
+
+     # Create the text tokens to feed to the model.
+     def sample_64(prompts):
+         tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
+         outputs = [model.tokenizer.padded_tokens_and_mask(
+             tokens, options['text_ctx']
+         ) for tokens in tokens_list]
+
+         cond_tokens, cond_masks = zip(*outputs)
+         cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)
+
+         full_batch_size = batch_size * (len(prompts) + 1)
+         uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
+             [], options['text_ctx']
+         )
+
+         # Pack the tokens together into model kwargs.
+         model_kwargs = dict(
+             tokens=th.tensor(
+                 cond_tokens + [uncond_tokens], device=device
+             ),
+             mask=th.tensor(
+                 cond_masks + [uncond_mask],
+                 dtype=th.bool,
+                 device=device,
+             ),
+         )
+
+         # Sample from the base model.
+         model.del_cache()
+         samples = diffusion.p_sample_loop(
+             model_fn,
+             (full_batch_size, 3, options["image_size"], options["image_size"]),
+             device=device,
+             clip_denoised=True,
+             progress=True,
+             model_kwargs=model_kwargs,
+             cond_fn=None,
+         )[:batch_size]
+         model.del_cache()
+
+         # Show the output
+         return samples
+
+
+     ##############################
+     # Upsample the 64x64 samples #
+     ##############################
+
+     def upsampling_256(prompts, samples):
+         tokens = model_up.tokenizer.encode("".join(prompts))
+         tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
+             tokens, options_up['text_ctx']
+         )
+
+         # Create the model conditioning dict.
+         model_kwargs = dict(
+             # Low-res image to upsample.
+             low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
+
+             # Text tokens
+             tokens=th.tensor(
+                 [tokens] * batch_size, device=device
+             ),
+             mask=th.tensor(
+                 [mask] * batch_size,
+                 dtype=th.bool,
+                 device=device,
+             ),
+         )
+
+         # Sample from the upsampler model.
+         model_up.del_cache()
+         up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
+         up_samples = diffusion_up.ddim_sample_loop(
+             model_up,
+             up_shape,
+             noise=th.randn(up_shape, device=device) * upsample_temp,
+             device=device,
+             clip_denoised=True,
+             progress=True,
+             model_kwargs=model_kwargs,
+             cond_fn=None,
+         )[:batch_size]
+         model_up.del_cache()
+
+         # Show the output
+         return up_samples
+
+
+     # sampling 64x64 images
+     samples = sample_64(prompts)
+     # show_images(samples)
+
+     # upsample from 64x64 to 256x256
+     upsamples = upsampling_256(prompts, samples)
+     # show_images(upsamples)
+
+     out_img = upsamples[0].permute(1, 2, 0)
+     out_img = (out_img + 1) / 2
+     out_img = np.array(out_img.data.to('cpu'))
+     return out_img
+
+ # prompt = "a camel | a forest"  #@param{type: 'string'}
+ # out_img = compose_language_descriptions(prompt)
+
+ import gradio as gr
+ gr.Interface(fn=compose_language_descriptions, inputs='text', outputs='image').launch()
+
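Note: the commented-out lines near the bottom of app.py hint at the intended usage. As a minimal sketch (not part of this commit), assuming the definitions above have been executed and the checkpoints have downloaded, the composition entry point can also be called directly and its output saved with PIL:

    import numpy as np
    from PIL import Image

    # Prompts are composed with '|' as the delimiter, as in the commented-out example above.
    prompt = "a camel | a forest"
    out_img = compose_language_descriptions(prompt)  # HxWx3 float array, roughly in [0, 1]

    # Convert to 8-bit RGB and save to disk.
    Image.fromarray((np.clip(out_img, 0.0, 1.0) * 255).astype(np.uint8)).save("composed_sample.png")

The Gradio Interface at the end of app.py wraps the same function, taking the prompt string as text input and rendering the returned array as an image.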
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ git+https://github.com/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch
+ git+https://github.com/openai/glide-text2im
setup.py ADDED
@@ -0,0 +1,29 @@
+ from setuptools import setup
+
+ setup(
+     name="composable-diffusion",
+     packages=[
+         "composable_diffusion",
+         "composable_diffusion.clip",
+         "composable_diffusion.tokenizer",
+     ],
+     package_data={
+         "composable_diffusion.tokenizer": [
+             "bpe_simple_vocab_16e6.txt.gz",
+             "encoder.json.gz",
+             "vocab.bpe.gz",
+         ],
+         "composable_diffusion.clip": ["config.yaml"],
+     },
+     install_requires=[
+         "Pillow",
+         "attrs",
+         "torch",
+         "filelock",
+         "requests",
+         "tqdm",
+         "ftfy",
+         "regex",
+     ],
+     author="nanliu",
+ )
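Note: setup.py packages composable_diffusion together with its tokenizer and CLIP data files. As a quick, illustrative check (not part of this commit), after installing the package (for example via the git URL in requirements.txt) the declared modules should import cleanly:

    # Sanity check for the packaging above; module names are taken from the packages list in setup.py.
    import composable_diffusion
    import composable_diffusion.clip
    import composable_diffusion.tokenizer

    print("composable_diffusion installed from:", composable_diffusion.__file__)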