Shuang59 committed
Commit eb601c1
Parent(s): 5219f50

Update app.py

Files changed (1):
  1. app.py +153 -150
app.py CHANGED
@@ -7,7 +7,6 @@ Original file is located at
     https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
 """
 
-
 # from PIL import Image
 # from IPython.display import display
 import torch as th
@@ -25,6 +24,7 @@ from composable_diffusion.model_creation import create_model_and_diffusion as cr
 from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr
 
 
+from PIL import Image
 # This notebook supports both CPU and GPU.
 # On CPU, generating one sample may take on the order of 20 minutes.
 # On a GPU, it should be under a minute.
@@ -34,10 +34,10 @@ device = th.device('cpu' if not has_cuda else 'cuda')
 print(device)
 
 # Create base model.
-timestep_respacing = 100 #@param{type: 'number'}
+timestep_respacing = 100  # @param{type: 'number'}
 options = model_and_diffusion_defaults()
 options['use_fp16'] = has_cuda
-options['timestep_respacing'] = str(timestep_respacing) # use 100 diffusion steps for fast sampling
+options['timestep_respacing'] = str(timestep_respacing)  # use 100 diffusion steps for fast sampling
 model, diffusion = create_model_and_diffusion(**options)
 model.eval()
 if has_cuda:
@@ -49,7 +49,7 @@ print('total base parameters', sum(x.numel() for x in model.parameters()))
 # Create upsampler model.
 options_up = model_and_diffusion_defaults_upsampler()
 options_up['use_fp16'] = has_cuda
-options_up['timestep_respacing'] = 'fast27' # use 27 diffusion steps for very fast sampling
+options_up['timestep_respacing'] = 'fast27'  # use 27 diffusion steps for very fast sampling
 model_up, diffusion_up = create_model_and_diffusion(**options_up)
 model_up.eval()
 if has_cuda:
@@ -58,146 +58,145 @@ model_up.to(device)
 model_up.load_state_dict(load_checkpoint('upsample', device))
 print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
 
+
 def show_images(batch: th.Tensor):
     """ Display a batch of images inline. """
-    scaled = ((batch + 1)*127.5).round().clamp(0,255).to(th.uint8).cpu()
+    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
     reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
     display(Image.fromarray(reshaped.numpy()))
 
+
 def compose_language_descriptions(prompt, guidance_scale):
-    #@markdown `prompt`: when composing multiple sentences, using `|` as the delimiter.
-    prompts = [x.strip() for x in prompt.split('|')]
-
-    batch_size = 1
-    # Tune this parameter to control the sharpness of 256x256 images.
-    # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
-    upsample_temp = 0.980 #@param{type: 'number'}
-
-
-
-    masks = [True] * len(prompts) + [False]
-    # coefficients = th.tensor([0.5, 0.5], device=device).reshape(-1, 1, 1, 1)
-    masks = th.tensor(masks, dtype=th.bool, device=device)
-    # sampling function
-    def model_fn(x_t, ts, **kwargs):
-        half = x_t[:1]
-        combined = th.cat([half] * x_t.size(0), dim=0)
-        model_out = model(combined, ts, **kwargs)
-        eps, rest = model_out[:, :3], model_out[:, 3:]
-        cond_eps = eps[masks].mean(dim=0, keepdim=True)
-        # cond_eps = (coefficients * eps[masks]).sum(dim=0)[None]
-        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
-        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
-        eps = th.cat([half_eps] * x_t.size(0), dim=0)
-        return th.cat([eps, rest], dim=1)
-
-
-    ##############################
-    # Sample from the base model #
-    ##############################
-
-    # Create the text tokens to feed to the model.
-    def sample_64(prompts):
-        tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
-        outputs = [model.tokenizer.padded_tokens_and_mask(
-            tokens, options['text_ctx']
-        ) for tokens in tokens_list]
-
-        cond_tokens, cond_masks = zip(*outputs)
-        cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)
-
-        full_batch_size = batch_size * (len(prompts) + 1)
-        uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
-            [], options['text_ctx']
-        )
-
-        # Pack the tokens together into model kwargs.
-        model_kwargs = dict(
-            tokens=th.tensor(
-                cond_tokens + [uncond_tokens], device=device
-            ),
-            mask=th.tensor(
-                cond_masks + [uncond_mask],
-                dtype=th.bool,
+    # @markdown `prompt`: when composing multiple sentences, using `|` as the delimiter.
+    prompts = [x.strip() for x in prompt.split('|')]
+
+    batch_size = 1
+    # Tune this parameter to control the sharpness of 256x256 images.
+    # A value of 1.0 is sharper, but sometimes results in grainy artifacts.
+    upsample_temp = 0.980  # @param{type: 'number'}
+
+    masks = [True] * len(prompts) + [False]
+    # coefficients = th.tensor([0.5, 0.5], device=device).reshape(-1, 1, 1, 1)
+    masks = th.tensor(masks, dtype=th.bool, device=device)
+
+    # sampling function
+    def model_fn(x_t, ts, **kwargs):
+        half = x_t[:1]
+        combined = th.cat([half] * x_t.size(0), dim=0)
+        model_out = model(combined, ts, **kwargs)
+        eps, rest = model_out[:, :3], model_out[:, 3:]
+        cond_eps = eps[masks].mean(dim=0, keepdim=True)
+        # cond_eps = (coefficients * eps[masks]).sum(dim=0)[None]
+        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
+        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
+        eps = th.cat([half_eps] * x_t.size(0), dim=0)
+        return th.cat([eps, rest], dim=1)
+
+    ##############################
+    # Sample from the base model #
+    ##############################
+
+    # Create the text tokens to feed to the model.
+    def sample_64(prompts):
+        tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
+        outputs = [model.tokenizer.padded_tokens_and_mask(
+            tokens, options['text_ctx']
+        ) for tokens in tokens_list]
+
+        cond_tokens, cond_masks = zip(*outputs)
+        cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)
+
+        full_batch_size = batch_size * (len(prompts) + 1)
+        uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
+            [], options['text_ctx']
+        )
+
+        # Pack the tokens together into model kwargs.
+        model_kwargs = dict(
+            tokens=th.tensor(
+                cond_tokens + [uncond_tokens], device=device
+            ),
+            mask=th.tensor(
+                cond_masks + [uncond_mask],
+                dtype=th.bool,
+                device=device,
+            ),
+        )
+
+        # Sample from the base model.
+        model.del_cache()
+        samples = diffusion.p_sample_loop(
+            model_fn,
+            (full_batch_size, 3, options["image_size"], options["image_size"]),
             device=device,
-            ),
-        )
-
-        # Sample from the base model.
-        model.del_cache()
-        samples = diffusion.p_sample_loop(
-            model_fn,
-            (full_batch_size, 3, options["image_size"], options["image_size"]),
-            device=device,
-            clip_denoised=True,
-            progress=True,
-            model_kwargs=model_kwargs,
-            cond_fn=None,
-        )[:batch_size]
-        model.del_cache()
-
-        # Show the output
-        return samples
-
-
-    ##############################
-    # Upsample the 64x64 samples #
-    ##############################
-
-    def upsampling_256(prompts, samples):
-        tokens = model_up.tokenizer.encode("".join(prompts))
-        tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
-            tokens, options_up['text_ctx']
-        )
-
-        # Create the model conditioning dict.
-        model_kwargs = dict(
-            # Low-res image to upsample.
-            low_res=((samples+1)*127.5).round()/127.5 - 1,
-
-            # Text tokens
-            tokens=th.tensor(
-                [tokens] * batch_size, device=device
-            ),
-            mask=th.tensor(
-                [mask] * batch_size,
-                dtype=th.bool,
+            clip_denoised=True,
+            progress=True,
+            model_kwargs=model_kwargs,
+            cond_fn=None,
+        )[:batch_size]
+        model.del_cache()
+
+        # Show the output
+        return samples
+
+    ##############################
+    # Upsample the 64x64 samples #
+    ##############################
+
+    def upsampling_256(prompts, samples):
+        tokens = model_up.tokenizer.encode("".join(prompts))
+        tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
+            tokens, options_up['text_ctx']
+        )
+
+        # Create the model conditioning dict.
+        model_kwargs = dict(
+            # Low-res image to upsample.
+            low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
+
+            # Text tokens
+            tokens=th.tensor(
+                [tokens] * batch_size, device=device
+            ),
+            mask=th.tensor(
+                [mask] * batch_size,
+                dtype=th.bool,
+                device=device,
+            ),
+        )
+
+        # Sample from the base model.
+        model_up.del_cache()
+        up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
+        up_samples = diffusion_up.ddim_sample_loop(
+            model_up,
+            up_shape,
+            noise=th.randn(up_shape, device=device) * upsample_temp,
             device=device,
-            ),
-        )
-
-        # Sample from the base model.
-        model_up.del_cache()
-        up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
-        up_samples = diffusion_up.ddim_sample_loop(
-            model_up,
-            up_shape,
-            noise=th.randn(up_shape, device=device) * upsample_temp,
-            device=device,
-            clip_denoised=True,
-            progress=True,
-            model_kwargs=model_kwargs,
-            cond_fn=None,
-        )[:batch_size]
-        model_up.del_cache()
-
-        # Show the output
-        return up_samples
-
-
-    # sampling 64x64 images
-    samples = sample_64(prompts)
-    # show_images(samples)
-
-    # upsample from 64x64 to 256x256
-    upsamples = upsampling_256(prompts, samples)
-    # show_images(upsamples)
-
-    out_img = upsamples[0].permute(1,2,0)
-    out_img = (out_img+1)/2
-    out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
-    out_img = out_img.numpy()
-    return out_img
+            clip_denoised=True,
+            progress=True,
+            model_kwargs=model_kwargs,
+            cond_fn=None,
+        )[:batch_size]
+        model_up.del_cache()
+
+        # Show the output
+        return up_samples
+
+    # sampling 64x64 images
+    samples = sample_64(prompts)
+    # show_images(samples)
+
+    # upsample from 64x64 to 256x256
+    upsamples = upsampling_256(prompts, samples)
+    # show_images(upsamples)
+
+    out_img = upsamples[0].permute(1, 2, 0)
+    out_img = (out_img + 1) / 2
+    out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
+    out_img = out_img.numpy()
+    return out_img
+
 
 # create model for CLEVR Objects
 clevr_options = model_and_diffusion_defaults_for_clevr()
@@ -219,24 +218,24 @@ flags = {
 }
 
 for key, val in flags.items():
-    clevr_options[key] = val
+    clevr_options[key] = val
 
 clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
 clevr_model.eval()
 if has_cuda:
     clevr_model.convert_to_fp16()
-
+
 clevr_model.to(device)
 clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
 print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))
 
+
 def compose_clevr_objects(prompt, guidance_scale):
-    print(prompt)
-    coordinates = [[float(x.split(',')[0].strip()), float(x.split(',')[1].strip())]
-                   for x in prompt.split('|')]
-    coordinates += [[-1, -1]] # add unconditional score label
+    coordinates = [[float(x.split(',')[0].strip()), float(x.split(',')[1].strip())]
+                   for x in prompt.split('|')]
+    coordinates += [[-1, -1]]  # add unconditional score label
     batch_size = 1
-
+
     def model_fn(x_t, ts, **kwargs):
         half = x_t[:1]
         combined = th.cat([half] * kwargs['y'].size(0), dim=0)
@@ -248,7 +247,7 @@ def compose_clevr_objects(prompt, guidance_scale):
         half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
         eps = th.cat([half_eps] * x_t.size(0), dim=0)
         return th.cat([eps, rest], dim=1)
-
+
     def sample(coordinates):
         masks = [True] * (len(coordinates) - 1) + [False]
         model_kwargs = dict(
@@ -257,21 +256,23 @@ def compose_clevr_objects(prompt, guidance_scale):
         )
         samples = clevr_diffusion.p_sample_loop(
             model_fn,
-            (len(coordinates), 3, options["image_size"], options["image_size"]),
+            (len(coordinates), 3, clevr_options["image_size"], clevr_options["image_size"]),
             device=device,
             clip_denoised=True,
             progress=True,
             model_kwargs=model_kwargs,
             cond_fn=None,
         )[:batch_size]
-
+
         return samples
 
     samples = sample(coordinates)
-    out_img = samples[0].permute(1,2,0)
-    out_img = (out_img+1)/2
+    out_img = samples[0].permute(1, 2, 0)
+    out_img = (out_img + 1) / 2
     out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
     out_img = out_img.numpy()
+    Image.fromarray(out_img).convert('RGB').save('test.png')
+
     return out_img
 
 
@@ -281,6 +282,7 @@ def compose(prompt, version, guidance_scale):
     else:
         return compose_clevr_objects(prompt, guidance_scale)
 
+
 examples_1 = 'a camel | a forest'
 examples_2 = 'A cloudy blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain'
 examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
@@ -289,8 +291,9 @@ examples = [[examples_1, 'GLIDE', 10], [examples_2, 'GLIDE', 10], [examples_3, '
 import gradio as gr
 
 title = 'Compositional Visual Generation with Composable Diffusion Models'
-description = '<p>Demo for Composable Diffusion (~20s per example if gpu is used, otherwise it will take quite a bit of time.)</p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter, see given examples below.</p>'
+description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is measured by per example if gpu is used, otherwise it will take quite a bit of time.)</ul></p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter, see given examples below.</p>'
 
-iface = gr.Interface(compose, inputs=["text", gr.inputs.Radio(['GLIDE','CLEVR Objects'], type="value", default='GLIDE', label='version'), gr.Slider(1, 10)], outputs='image', title=title, description=description, examples=examples)
+iface = gr.Interface(compose, inputs=["text", gr.Radio(['GLIDE', 'CLEVR Objects'], type="value", label='version'), gr.Slider(1, 20)], outputs='image',
+                     title=title, description=description, examples=examples)
 
 iface.launch()
 
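For reference, a minimal usage sketch of the compose() entry point the demo wires into Gradio, called directly instead of through the UI. This assumes the definitions in app.py above have already been executed in the same Python session (so both models are loaded); the prompt strings are taken from the examples in the file.

# Hypothetical direct calls to compose(prompt, version, guidance_scale) as defined above.
# For 'GLIDE', '|' separates the concepts to compose; for 'CLEVR Objects',
# each 'x, y' pair separated by '|' places one object.
img = compose('a camel | a forest', 'GLIDE', 10)
img_pos = compose('0.1, 0.5 | 0.5, 0.5 | 0.9, 0.5', 'CLEVR Objects', 10)

# Each call returns an HxWx3 uint8 numpy array, so it can be saved directly with PIL.
Image.fromarray(img).convert('RGB').save('camel_forest.png')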