Shuang59 committed
Commit 8d6462e • 1 parent: eb601c1

Add stable diffusion for compositional generation.

Files changed (1): app.py (+23 -4)
app.py CHANGED
@@ -25,6 +25,10 @@ from composable_diffusion.model_creation import model_and_diffusion_defaults as
 
 
 from PIL import Image
+
+from torch import autocast
+from diffusers import StableDiffusionPipeline
+
 # This notebook supports both CPU and GPU.
 # On CPU, generating one sample may take on the order of 20 minutes.
 # On a GPU, it should be under a minute.
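A note on the new imports: the top-level torch.autocast entry point, which takes a device-type string as used further down, was introduced in PyTorch 1.10; older installs only expose torch.cuda.amp.autocast. A minimal sketch of the pattern, with a hypothetical model call:

    # run the enclosed ops in mixed precision where the backend supports it
    with autocast('cuda'):
        output = model(inputs)  # hypothetical; any torch module call works here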
@@ -33,6 +37,12 @@ has_cuda = th.cuda.is_available()
 device = th.device('cpu' if not has_cuda else 'cuda')
 print(device)
 
+# initialize stable diffusion model
+pipe = StableDiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    use_auth_token=True
+).to(device)
+
 # Create base model.
 timestep_respacing = 100 # @param{type: 'number'}
 options = model_and_diffusion_defaults()
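Loading "CompVis/stable-diffusion-v1-4" with use_auth_token=True only works after authenticating with the Hugging Face Hub (e.g. via huggingface-cli login) and accepting the model license on the Hub. A minimal standalone sketch of the pipeline call, assuming the diffusers API of this commit's era, in which the pipeline returns a dict holding a "sample" list of PIL images (newer releases return an object with an images attribute instead):

    # guidance_scale=7.5 is the usual Stable Diffusion default
    out = pipe('a photograph of an astronaut riding a horse', guidance_scale=7.5)
    out["sample"][0].save('sample.png')  # out.images[0] in newer diffusers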
@@ -276,9 +286,17 @@ def compose_clevr_objects(prompt, guidance_scale):
     return out_img
 
 
+def stable_diffusion_compose(prompt, scale):
+    with autocast('cpu' if not has_cuda else 'cuda'):
+        image = pipe(prompt, guidance_scale=scale)["sample"][0]
+    return image
+
+
 def compose(prompt, version, guidance_scale):
     if version == 'GLIDE':
         return compose_language_descriptions(prompt, guidance_scale)
+    elif version == 'Stable_Diffusion_1v_4':
+        return stable_diffusion_compose(prompt, guidance_scale)
     else:
         return compose_clevr_objects(prompt, guidance_scale)
 
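Two things worth noting about the dispatch: stable_diffusion_compose hands the whole `|`-delimited string to the pipeline as a single prompt (whereas the GLIDE path composes the delimited sentences individually), and any version other than 'GLIDE' or 'Stable_Diffusion_1v_4' falls through to the CLEVR branch. The function can also be exercised directly; a sketch mirroring the Gradio examples defined below:

    # direct calls; compose_language_descriptions and compose_clevr_objects
    # are defined earlier in app.py
    img_glide = compose('a camel | a forest', 'GLIDE', 10)
    img_sd = compose('a river leading into a mountain | red trees on the side',
                     'Stable_Diffusion_1v_4', 10)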
@@ -286,14 +304,15 @@ def compose(prompt, version, guidance_scale):
 examples_1 = 'a camel | a forest'
 examples_2 = 'A cloudy blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain'
 examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
-examples = [[examples_1, 'GLIDE', 10], [examples_2, 'GLIDE', 10], [examples_3, 'CLEVR Objects', 10]]
+examples_4 = 'a river leading into a mountain | red trees on the side'
+examples = [[examples_1, 'GLIDE', 10], [examples_4, 'Stable_Diffusion_1v_4', 10], [examples_2, 'GLIDE', 10], [examples_3, 'CLEVR Objects', 10]]
 
 import gradio as gr
 
 title = 'Compositional Visual Generation with Composable Diffusion Models'
-description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is measured by per example if gpu is used, otherwise it will take quite a bit of time.)</ul></p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter, see given examples below.</p>'
+description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE/Stable-Diffusion example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time varies depending on which GPU is used.)</ul></p><p>See more information on our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> and <a href="https://github.com/CompVis/stable-diffusion/">Stable Diffusion</a> for composing natural language descriptions.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: we recommend using <b><i>x</i></b> in the range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in the range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in these ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter; see the examples below.</p>'
 
-iface = gr.Interface(compose, inputs=["text", gr.Radio(['GLIDE', 'CLEVR Objects'], type="value", label='version'), gr.Slider(1, 20)], outputs='image',
+iface = gr.Interface(compose, inputs=["text", gr.Radio(['Stable_Diffusion_1v_4', 'GLIDE', 'CLEVR Objects'], type="value", label='version'), gr.Slider(2, 20)], outputs='image',
                      title=title, description=description, examples=examples)
 
-iface.launch()
+iface.launch()
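Each entry in examples follows the inputs order of the interface: the prompt text, the radio value for version, and the slider value passed to compose as guidance_scale. A hypothetical extra CLEVR entry under the same convention, keeping coordinates inside the recommended x in [0.1, 0.9] and y in [0.25, 0.7] ranges from the description:

    examples.append(['0.2, 0.4 | 0.8, 0.4', 'CLEVR Objects', 10])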