Ahsen Khaliq committed
Commit f5dff55
1 Parent(s): fe6483f

Update app.py

Files changed (1)
  1. app.py +42 -38
app.py CHANGED
@@ -78,46 +78,50 @@ def tv_loss(input):
 def range_loss(input):
     return (input - input.clamp(-1, 1)).pow(2).mean([1, 2, 3])
 
-# Model settings
-model_config = model_and_diffusion_defaults()
-model_config.update({
-    'attention_resolutions': '32, 16, 8',
-    'class_cond': False,
-    'diffusion_steps': 1000,
-    'rescale_timesteps': True,
-    'timestep_respacing': '90',  # Modify this value to decrease the number of
-                                 # timesteps.
-    'image_size': 256,
-    'learn_sigma': True,
-    'noise_schedule': 'linear',
-    'num_channels': 256,
-    'num_head_channels': 64,
-    'num_res_blocks': 2,
-    'resblock_updown': True,
-    'use_fp16': True,
-    'use_scale_shift_norm': True,
-})
-# Load models
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-print('Using device:', device)
-model, diffusion = create_model_and_diffusion(**model_config)
-model.load_state_dict(torch.load('256x256_diffusion_uncond.pt', map_location='cpu'))
-model.requires_grad_(False).eval().to(device)
-for name, param in model.named_parameters():
-    if 'qkv' in name or 'norm' in name or 'proj' in name:
-        param.requires_grad_()
-if model_config['use_fp16']:
-    model.convert_to_fp16()
-clip_model = clip.load('ViT-B/16', jit=False)[0].eval().requires_grad_(False).to(device)
-clip_size = clip_model.visual.input_resolution
-normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
-                                 std=[0.26862954, 0.26130258, 0.27577711])
-lpips_model = lpips.LPIPS(net='vgg').to(device)
+def inference(text, init_image, skip_timesteps, clip_guidance_scale, tv_scale, range_scale, init_scale, seed, image_prompt, timestep_respacing):
+    # Model settings
+    model_config = model_and_diffusion_defaults()
+    model_config.update({
+        'attention_resolutions': '32, 16, 8',
+        'class_cond': False,
+        'diffusion_steps': 1000,
+        'rescale_timesteps': True,
+        'timestep_respacing': str(timestep_respacing),  # Modify this value to decrease the number of
+                                                        # timesteps.
+        'image_size': 256,
+        'learn_sigma': True,
+        'noise_schedule': 'linear',
+        'num_channels': 256,
+        'num_head_channels': 64,
+        'num_res_blocks': 2,
+        'resblock_updown': True,
+        'use_fp16': True,
+        'use_scale_shift_norm': True,
+    })
+    # Load models
+    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+    print('Using device:', device)
+    model, diffusion = create_model_and_diffusion(**model_config)
+    model.load_state_dict(torch.load('256x256_diffusion_uncond.pt', map_location='cpu'))
+    model.requires_grad_(False).eval().to(device)
+    for name, param in model.named_parameters():
+        if 'qkv' in name or 'norm' in name or 'proj' in name:
+            param.requires_grad_()
+    if model_config['use_fp16']:
+        model.convert_to_fp16()
+    clip_model = clip.load('ViT-B/16', jit=False)[0].eval().requires_grad_(False).to(device)
+    clip_size = clip_model.visual.input_resolution
+    normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
+                                     std=[0.26862954, 0.26130258, 0.27577711])
+    lpips_model = lpips.LPIPS(net='vgg').to(device)
 
-def inference(text, init_image, skip_timesteps, clip_guidance_scale, tv_scale, range_scale, init_scale, seed):
+    # def inference(text, init_image, skip_timesteps, clip_guidance_scale, tv_scale, range_scale, init_scale, seed, image_prompt):
     all_frames = []
     prompts = [text]
-    image_prompts = []
+    if image_prompt:
+        image_prompts = [image_prompt.name]
+    else:
+        image_prompts = []
     batch_size = 1
     clip_guidance_scale = clip_guidance_scale  # Controls how much the image should look like the prompt.
     tv_scale = tv_scale  # Controls the smoothness of the final output.
@@ -217,6 +221,6 @@ def inference(text, init_image, skip_timesteps, clip_guidance_scale, tv_scale, r
 title = "CLIP Guided Diffusion HQ"
 description = "Gradio demo for CLIP Guided Diffusion. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
 article = "<p style='text-align: center'> By Katherine Crowson (https://github.com/crowsonkb, https://twitter.com/RiversHaveWings). It uses OpenAI's 256x256 unconditional ImageNet diffusion model (https://github.com/openai/guided-diffusion) together with CLIP (https://github.com/openai/CLIP) to connect text prompts with images. | <a href='https://colab.research.google.com/drive/12a_Wrfi2_gwwAuN3VvMTwVMz9TfqctNj' target='_blank'>Colab</a></p>"
-iface = gr.Interface(inference, inputs=["text",gr.inputs.Image(type="file", label='initial image (optional)', optional=True),gr.inputs.Slider(minimum=0, maximum=45, step=1, default=0, label="skip_timesteps"), gr.inputs.Slider(minimum=0, maximum=3000, step=1, default=700, label="clip guidance scale (Controls how much the image should look like the prompt)"), gr.inputs.Slider(minimum=0, maximum=1000, step=1, default=150, label="tv_scale (Controls the smoothness of the final output)"), gr.inputs.Slider(minimum=0, maximum=1000, step=1, default=50, label="range_scale (Controls how far out of range RGB values are allowed to be)"), gr.inputs.Slider(minimum=0, maximum=1000, step=1, default=0, label="init_scale (This enhances the effect of the init image)"), gr.inputs.Number(default=0, label="Seed")], outputs=["image","video"], title=title, description=description, article=article, examples=[["coral reef city by artstation artists", None, 0, 1000, 150, 50, 0, 0]],
+iface = gr.Interface(inference, inputs=["text",gr.inputs.Image(type="file", label='initial image (optional)', optional=True),gr.inputs.Slider(minimum=0, maximum=45, step=1, default=0, label="skip_timesteps"), gr.inputs.Slider(minimum=0, maximum=3000, step=1, default=700, label="clip guidance scale (Controls how much the image should look like the prompt)"), gr.inputs.Slider(minimum=0, maximum=1000, step=1, default=150, label="tv_scale (Controls the smoothness of the final output)"), gr.inputs.Slider(minimum=0, maximum=1000, step=1, default=50, label="range_scale (Controls how far out of range RGB values are allowed to be)"), gr.inputs.Slider(minimum=0, maximum=1000, step=1, default=0, label="init_scale (This enhances the effect of the init image)"), gr.inputs.Number(default=0, label="Seed"), gr.inputs.Image(type="file", label='image prompt (optional)', optional=True), gr.inputs.Slider(minimum=50, maximum=300, step=1, default=90, label="timestep respacing")], outputs=["image","video"], title=title, description=description, article=article, examples=[["coral reef city by artstation artists", None, 0, 1000, 150, 50, 0, 0, None, 90]],
                      enable_queue=True)
 iface.launch()
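
The commit moves model construction inside inference() so that the user-selected "timestep respacing" slider takes effect per request, and it turns the optional Gradio image-prompt upload into a CLIP image-prompt list only when a file was actually provided. Below is a minimal sketch of that pattern, kept separate from the full app.py: the helper names build_diffusion and collect_image_prompts are illustrative and do not appear in the repo, and the import path is assumed to be the standard one from openai/guided-diffusion, which supplies the model_and_diffusion_defaults and create_model_and_diffusion calls seen in the diff.

# Illustrative sketch only; helper names are hypothetical, not part of app.py.
from guided_diffusion.script_util import (  # import path assumed from openai/guided-diffusion
    create_model_and_diffusion,
    model_and_diffusion_defaults,
)


def build_diffusion(timestep_respacing: int):
    """Rebuild the model/diffusion pair so the user-chosen step count is applied."""
    config = model_and_diffusion_defaults()
    config.update({
        'attention_resolutions': '32, 16, 8',
        'class_cond': False,
        'diffusion_steps': 1000,
        'rescale_timesteps': True,
        'timestep_respacing': str(timestep_respacing),  # e.g. 90 -> 90 sampling steps
        'image_size': 256,
        'learn_sigma': True,
        'noise_schedule': 'linear',
        'num_channels': 256,
        'num_head_channels': 64,
        'num_res_blocks': 2,
        'resblock_updown': True,
        'use_fp16': True,
        'use_scale_shift_norm': True,
    })
    return create_model_and_diffusion(**config)


def collect_image_prompts(image_prompt):
    """gr.inputs.Image(type="file", optional=True) passes a file object or None."""
    return [image_prompt.name] if image_prompt is not None else []


# Usage, mirroring the new inference() signature:
#   model, diffusion = build_diffusion(timestep_respacing=90)
#   image_prompts = collect_image_prompts(image_prompt=None)  # -> []

Rebuilding the config each call is slower than loading the model once at import time, but it is what allows timestep_respacing to be a per-request Gradio input rather than a hard-coded '90'.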