Shuang59 committed
Commit c39e9ff • 1 Parent(s): 12be8c0

Remove GLIDE composition due to memory.
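
What this removes: the GLIDE path composed several prompts by running one conditional noise prediction per prompt, averaging them, and applying classifier-free guidance around that average (see the deleted model_fn in the diff below). A minimal sketch of that composition rule, kept here for reference; composed_guided_eps and the eps_model callable are illustrative names, not part of app.py:

import torch as th

def composed_guided_eps(eps_model, x_t, ts, cond_kwargs_list, uncond_kwargs, guidance_scale):
    # One conditional prediction per prompt, plus a single unconditional prediction.
    cond_eps = th.stack([eps_model(x_t, ts, **kw) for kw in cond_kwargs_list]).mean(dim=0)
    uncond_eps = eps_model(x_t, ts, **uncond_kwargs)
    # Classifier-free guidance applied to the averaged conditional prediction.
    return uncond_eps + guidance_scale * (cond_eps - uncond_eps)

Running this end to end still requires keeping the GLIDE base and upsampler models in memory, which is the cost this commit trades away; the Colab demo linked in the updated description keeps that path available.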

Files changed (1)
  1. app.py +14 -225
app.py CHANGED
@@ -1,31 +1,19 @@
  # -*- coding: utf-8 -*-
- """Copy of compose_glide.ipynb
+ """Copy of demo.ipynb
 
  Automatically generated by Colaboratory.
 
  Original file is located at
-     https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
+     https://colab.research.google.com/github/energy-based-model/Compositional-Visual-Generation-with-Composable-Diffusion-Models-PyTorch/blob/main/notebooks/demo.ipynb
  """
 
- # from PIL import Image
- # from IPython.display import display
+ import gradio as gr
  import torch as th
- import numpy as np
-
- from glide_text2im.download import load_checkpoint
- from glide_text2im.model_creation import (
-     create_model_and_diffusion,
-     model_and_diffusion_defaults,
-     model_and_diffusion_defaults_upsampler
- )
 
  from composable_diffusion.download import download_model
  from composable_diffusion.model_creation import create_model_and_diffusion as create_model_and_diffusion_for_clevr
  from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr
 
-
- from PIL import Image, ImageDraw, ImageFont
-
  from torch import autocast
  from diffusers import StableDiffusionPipeline
 
@@ -33,182 +21,14 @@ from diffusers import StableDiffusionPipeline
  # On CPU, generating one sample may take on the order of 20 minutes.
  # On a GPU, it should be under a minute.
 
- has_cuda = False
+ has_cuda = th.cuda.is_available()
  device = th.device('cpu' if not th.cuda.is_available() else 'cuda')
- cpu = th.device('cpu')
 
- # iniatilize stable diffusion model
+ # init stable diffusion model
  pipe = StableDiffusionPipeline.from_pretrained(
      "CompVis/stable-diffusion-v1-4",
      use_auth_token='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'
- ).to(cpu)
+ ).to(device)
-
- # Create base model.
- timestep_respacing = 100 # @param{type: 'number'}
- options = model_and_diffusion_defaults()
- options['use_fp16'] = has_cuda
- options['timestep_respacing'] = str(timestep_respacing) # use 100 diffusion steps for fast sampling
- model, diffusion = create_model_and_diffusion(**options)
- model.eval()
- if has_cuda:
-     model.convert_to_fp16()
- model.to(cpu)
- model.load_state_dict(load_checkpoint('base', cpu))
- print('total base parameters', sum(x.numel() for x in model.parameters()))
-
- # Create upsampler model.
- options_up = model_and_diffusion_defaults_upsampler()
- options_up['use_fp16'] = has_cuda
- options_up['timestep_respacing'] = 'fast27' # use 27 diffusion steps for very fast sampling
- model_up, diffusion_up = create_model_and_diffusion(**options_up)
- model_up.eval()
- if has_cuda:
-     model_up.convert_to_fp16()
- model_up.to(cpu)
- model_up.load_state_dict(load_checkpoint('upsample', cpu))
- print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
-
-
- def show_images(batch: th.Tensor):
-     """ Display a batch of images inline. """
-     scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
-     reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
-     display(Image.fromarray(reshaped.numpy()))
-
-
- def compose_language_descriptions(prompt, guidance_scale, steps):
-     options['timestep_respacing'] = str(steps)
-     _, diffusion = create_model_and_diffusion(**options)
-
-     # @markdown `prompt`: when composing multiple sentences, using `|` as the delimiter.
-     prompts = [x.strip() for x in prompt.split('|')]
-
-     batch_size = 1
-     # Tune this parameter to control the sharpness of 256x256 images.
-     # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
-     upsample_temp = 0.980 # @param{type: 'number'}
-
-     masks = [True] * len(prompts) + [False]
-     # coefficients = th.tensor([0.5, 0.5], device=device).reshape(-1, 1, 1, 1)
-     masks = th.tensor(masks, dtype=th.bool, device=device)
-
-     # sampling function
-     def model_fn(x_t, ts, **kwargs):
-         half = x_t[:1]
-         combined = th.cat([half] * x_t.size(0), dim=0)
-         model_out = model(combined, ts, **kwargs)
-         eps, rest = model_out[:, :3], model_out[:, 3:]
-         cond_eps = eps[masks].mean(dim=0, keepdim=True)
-         # cond_eps = (coefficients * eps[masks]).sum(dim=0)[None]
-         uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
-         half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
-         eps = th.cat([half_eps] * x_t.size(0), dim=0)
-         return th.cat([eps, rest], dim=1)
-
-     ##############################
-     # Sample from the base model #
-     ##############################
-
-     # Create the text tokens to feed to the model.
-     def sample_64(prompts):
-         tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
-         outputs = [model.tokenizer.padded_tokens_and_mask(
-             tokens, options['text_ctx']
-         ) for tokens in tokens_list]
-
-         cond_tokens, cond_masks = zip(*outputs)
-         cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)
-
-         full_batch_size = batch_size * (len(prompts) + 1)
-         uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
-             [], options['text_ctx']
-         )
-
-         # Pack the tokens together into model kwargs.
-         model_kwargs = dict(
-             tokens=th.tensor(
-                 cond_tokens + [uncond_tokens], device=device
-             ),
-             mask=th.tensor(
-                 cond_masks + [uncond_mask],
-                 dtype=th.bool,
-                 device=device,
-             ),
-         )
-
-         # Sample from the base model.
-         model.del_cache()
-         samples = diffusion.p_sample_loop(
-             model_fn,
-             (full_batch_size, 3, options["image_size"], options["image_size"]),
-             device=device,
-             clip_denoised=True,
-             progress=True,
-             model_kwargs=model_kwargs,
-             cond_fn=None,
-         )[:batch_size]
-         model.del_cache()
-
-         # Show the output
-         return samples
-
-     ##############################
-     # Upsample the 64x64 samples #
-     ##############################
-
-     def upsampling_256(prompts, samples):
-         tokens = model_up.tokenizer.encode("".join(prompts))
-         tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
-             tokens, options_up['text_ctx']
-         )
-
-         # Create the model conditioning dict.
-         model_kwargs = dict(
-             # Low-res image to upsample.
-             low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
-
-             # Text tokens
-             tokens=th.tensor(
-                 [tokens] * batch_size, device=device
-             ),
-             mask=th.tensor(
-                 [mask] * batch_size,
-                 dtype=th.bool,
-                 device=device,
-             ),
-         )
-
-         # Sample from the base model.
-         model_up.del_cache()
-         up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
-         up_samples = diffusion_up.ddim_sample_loop(
-             model_up,
-             up_shape,
-             noise=th.randn(up_shape, device=device) * upsample_temp,
-             device=device,
-             clip_denoised=True,
-             progress=True,
-             model_kwargs=model_kwargs,
-             cond_fn=None,
-         )[:batch_size]
-         model_up.del_cache()
-
-         # Show the output
-         return up_samples
-
-     # sampling 64x64 images
-     samples = sample_64(prompts)
-     # show_images(samples)
-
-     # upsample from 64x64 to 256x256
-     upsamples = upsampling_256(prompts, samples)
-     # show_images(upsamples)
-
-     out_img = upsamples[0].permute(1, 2, 0)
-     out_img = (out_img + 1) / 2
-     out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
-     out_img = out_img.numpy()
-     return out_img
 
 
  # create model for CLEVR Objects
@@ -238,8 +58,8 @@ clevr_model.eval()
  if has_cuda:
      clevr_model.convert_to_fp16()
 
- clevr_model.to(th.device('cpu'))
- clevr_model.load_state_dict(th.load(download_model('clevr_pos'), th.device('cpu')))
+ clevr_model.to(device)
+ clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
  print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))
 
 
@@ -300,37 +120,10 @@ def stable_diffusion_compose(prompt, scale, steps):
  def compose(prompt, version, guidance_scale, steps):
      try:
          with th.no_grad():
-             if version == 'GLIDE':
-                 clevr_model.to(cpu)
-                 pipe.to(cpu)
-                 model.to(device)
-                 model_up.to(device)
-                 return compose_language_descriptions(prompt, guidance_scale, steps)
-             elif version == 'Stable_Diffusion_1v_4':
-                 clevr_model.to(cpu)
-                 model.to(cpu)
-                 model_up.to(cpu)
-                 pipe.to(device)
+             if version == 'Stable_Diffusion_1v_4':
                  return stable_diffusion_compose(prompt, guidance_scale, steps)
              else:
-                 pipe.to(cpu)
-                 model.to(cpu)
-                 model_up.to(cpu)
-                 clevr_model.to(device)
-                 # simple check
-                 is_text = True
-                 for char in prompt:
-                     if char.isdigit():
-                         is_text = False
-                         break
-                 if is_text:
-                     img = Image.new('RGB', (512, 512), color=(255, 255, 255))
-                     d = ImageDraw.Draw(img)
-                     font = ImageFont.load_default()
-                     d.text((0, 256), "input should be similar to the example using 2D coordinates.", fill=(0, 0, 0), font=font)
-                     return img
-                 else:
-                     return compose_clevr_objects(prompt, guidance_scale, steps)
+                 return compose_clevr_objects(prompt, guidance_scale, steps)
      except Exception as e:
          print(e)
          return None
@@ -348,24 +141,20 @@ examples = [
      [examples_5, 'Stable_Diffusion_1v_4', 15, 50],
      [examples_4, 'Stable_Diffusion_1v_4', 15, 50],
      [examples_6, 'Stable_Diffusion_1v_4', 15, 50],
-     [examples_1, 'GLIDE', 15, 100],
-     [examples_2, 'GLIDE', 15, 100],
      [examples_3, 'CLEVR Objects', 10, 100]
  ]
 
- import gradio as gr
-
  title = 'Compositional Visual Generation with Composable Diffusion Models'
- description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE/Stable-Diffusion example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is varied depending on what gpu is used.)</ul></p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> and <a href="https://github.com/CompVis/stable-diffusion/">Stable Diffusion</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter, see given examples below.</p><p><b>Note</b>: When using Stable Diffusion, black images will be returned if the given prompt is detected as problematic.</p>'
+ description = '<p>Demo for Composable Diffusion<ul><li>~30s per Stable-Diffusion example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is varied depending on what gpu is used.)</ul></p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> and <a href="https://github.com/CompVis/stable-diffusion/">Stable Diffusion</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter, see given examples below.</p><p><b>Note: When using Stable Diffusion, black images will be returned if the given prompt is detected as problematic. For composing GLIDE model, we recommend using the Colab demo in our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</b></p>'
 
  iface = gr.Interface(compose,
      inputs=[
          "text",
-         gr.Radio(['Stable_Diffusion_1v_4', 'GLIDE', 'CLEVR Objects'], type="value", label='version'),
+         gr.Radio(['Stable_Diffusion_1v_4', 'CLEVR Objects'], type="value", label='version'),
          gr.Slider(2, 30),
          gr.Slider(10, 200)
      ],
-     outputs='image', cache_examples=False,
+     outputs='image',
      title=title, description=description, examples=examples)
 
- iface.launch(enable_queue=True, show_error=True)
+ iface.launch(enable_queue=True)
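
After this commit the Space keeps only the Stable Diffusion and CLEVR Objects paths, dispatched by the version string from the gr.Radio input above. A hedged usage sketch of the remaining compose() entry point; the prompts are illustrative, and the CLEVR coordinate format follows the Space's own examples (x roughly in [0.1, 0.9], y in [0.25, 0.7]):

# Illustrative calls, run inside app.py once the models above are loaded.
# compose(prompt, version, guidance_scale, steps) returns an image for the
# Gradio 'image' output, or None if an exception was caught.
sd_img = compose('a red house | a field of snow', 'Stable_Diffusion_1v_4', 15, 50)
clevr_img = compose('0.3, 0.5 | 0.7, 0.5', 'CLEVR Objects', 10, 100)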