multimodalart (HF staff) committed
Commit be31516
1 Parent(s): e93e44b

Adjust parameters to work with CLIP Guidance

Files changed (1):
  app.py: +7 -5
app.py CHANGED
@@ -74,6 +74,8 @@ def run_all(prompt, steps, n_images, weight, clip_guided):
     target_embed = clip_model.encode_text(clip.tokenize(prompt).to('cuda')).float()#.cuda()
 
     if(clip_guided):
+        steps = steps*5
+        clip_guidance_scale = weight*100
         prompts = [prompt]
         target_embeds, weights = [], []
         def parse_prompt(prompt):
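
With the CLIP Guided checkbox enabled, the commit derives two internal values from the existing sliders rather than adding new controls. A minimal sketch of that arithmetic at the slider defaults shown in the interface hunk further down (names here just mirror the diff):

# Slider defaults taken from the gr.Interface hunk below.
steps_slider = 40    # "Steps" slider default
weight_slider = 5    # "Weight" slider default

# With clip_guided=True the commit rescales both before sampling:
steps = steps_slider * 5                   # 40 -> 200 sampling steps
clip_guidance_scale = weight_slider * 100  # 5  -> 500

# At the default weight this reproduces the factor of 500. that was
# previously hard-coded into the CLIP loss (see the next hunk).
print(steps, clip_guidance_scale)          # 200 500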
@@ -126,7 +128,7 @@ def run_all(prompt, steps, n_images, weight, clip_guided):
             clip_in = normalize(make_cutouts((pred + 1) / 2))
             image_embeds = clip_model.encode_image(clip_in).view([16, x.shape[0], -1])
             losses = spherical_dist_loss(image_embeds, clip_embed[None])
-            loss = losses.mean(0).sum() * 500.
+            loss = losses.mean(0).sum() * clip_guidance_scale
             grad = -torch.autograd.grad(loss, x)[0]
             return grad
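
The rescaled clip_guidance_scale replaces the hard-coded 500. multiplier on the CLIP loss above. For orientation, spherical_dist_loss is usually defined as below in Katherine Crowson's CLIP-guided diffusion code; this is only a sketch, since the actual definition in app.py lies outside this diff:

import torch.nn.functional as F

def spherical_dist_loss(x, y):
    # Squared great-circle distance between L2-normalized embeddings; it
    # shrinks as the CLIP image embedding of a cutout aligns with the prompt
    # embedding, so multiplying it by clip_guidance_scale sets how strongly
    # the gradient computed in this hunk steers sampling toward the prompt.
    x = F.normalize(x, dim=-1)
    y = F.normalize(y, dim=-1)
    return (x - y).div(2).norm(dim=-1).arcsin().pow(2).mul(4)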
 
@@ -159,15 +161,15 @@ gallery = gr.Gallery(css={"height": "256px","width":"256px"})
 iface = gr.Interface(
     fn=run_all,
     inputs=[
-        gr.inputs.Textbox(label="Prompt - try adding increments to your prompt such as 'oil on canvas', 'a painting', 'a book cover'",default="chalk pastel drawing of a dog wearing a funny hat"),
+        gr.inputs.Textbox(label="Prompt - try adding increments to your prompt such as 'oil on canvas', 'a painting', 'a book cover'",default="an alien landscape in the forest"),
         gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=40,maximum=80,minimum=1,step=1),
         gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1, step=1),
         gr.inputs.Slider(label="Weight - how closely the image should resemble the prompt", default=5, maximum=15, minimum=0, step=1),
-        gr.inputs.Checkbox(label="CLIP Guided - improves coherence with prompt, makes it slower"),
+        gr.inputs.Checkbox(label="CLIP Guided - improves coherence with complex prompts, makes it slower"),
     ],
     outputs=gallery,
-    title="Generate images from text with V-Diffusion CC12M CFG",
-    description="<div>By typing a prompt and pressing submit you can generate images based on this prompt. <a href='https://github.com/crowsonkb/v-diffusion-pytorch' target='_blank'>V-Diffusion</a> is diffusion text-to-image model created by <a href='https://twitter.com/RiversHaveWings' target='_blank'>Katherine Crowson</a> and <a href='https://twitter.com/jd_pressman'>JDP</a>, trained on the <a href='https://github.com/google-research-datasets/conceptual-12m'>CC12M dataset</a>. CFG means it can generate images without CLIP Guidance - and fast. The UI to the model was assembled by <a style='color: rgb(99, 102, 241);font-weight:bold' href='https://twitter.com/multimodalart' target='_blank'>@multimodalart</a>, keep up with the <a style='color: rgb(99, 102, 241);' href='https://multimodal.art/news' target='_blank'>latest multimodal ai art news here</a> and consider <a style='color: rgb(99, 102, 241);' href='https://www.patreon.com/multimodalart' target='_blank'>supporting us on Patreon</a></div>",
+    title="Generate images from text with V-Diffusion",
+    description="<div>By typing a prompt and pressing submit you can generate images based on this prompt. <a href='https://github.com/crowsonkb/v-diffusion-pytorch' target='_blank'>V-Diffusion</a> is diffusion text-to-image model created by <a href='https://twitter.com/RiversHaveWings' target='_blank'>Katherine Crowson</a> and <a href='https://twitter.com/jd_pressman'>JDP</a>, trained on the <a href='https://github.com/google-research-datasets/conceptual-12m'>CC12M dataset</a>. The UI to the model was assembled by <a style='color: rgb(99, 102, 241);font-weight:bold' href='https://twitter.com/multimodalart' target='_blank'>@multimodalart</a>, keep up with the <a style='color: rgb(99, 102, 241);' href='https://multimodal.art/news' target='_blank'>latest multimodal ai art news here</a> and consider <a style='color: rgb(99, 102, 241);' href='https://www.patreon.com/multimodalart' target='_blank'>supporting us on Patreon</a></div>",
     #article="<h4 style='font-size: 110%;margin-top:.5em'>Biases acknowledgment</h4><div>Despite how impressive being able to turn text into image is, beware to the fact that this model may output content that reinforces or exarcbates societal biases. According to the <a href='https://arxiv.org/abs/2112.10752' target='_blank'>Latent Diffusion paper</a>:<i> \"Deep learning modules tend to reproduce or exacerbate biases that are already present in the data\"</i>. The model was trained on an unfiltered version the LAION-400M dataset, which scrapped non-curated image-text-pairs from the internet (the exception being the the removal of illegal content) and is meant to be used for research purposes, such as this one. <a href='https://laion.ai/laion-400-open-dataset/' target='_blank'>You can read more on LAION's website</a></div><h4 style='font-size: 110%;margin-top:1em'>Who owns the images produced by this demo?</h4><div>Definetly not me! Probably you do. I say probably because the Copyright discussion about AI generated art is ongoing. So <a href='https://www.theverge.com/2022/2/21/22944335/us-copyright-office-reject-ai-generated-art-recent-entrance-to-paradise' target='_blank'>it may be the case that everything produced here falls automatically into the public domain</a>. But in any case it is either yours or is in the public domain.</div>"
 )
 iface.launch(enable_queue=True)
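
Taken together, the UI keeps its original slider ranges while the CLIP-guided path runs at larger effective values internally. A hypothetical direct call to run_all mirroring the new defaults (run_all itself is defined earlier in app.py and only its signature is visible in the hunk headers):

# Hypothetical call that bypasses the Gradio front end; argument names come
# from the run_all signature above, values from the UI defaults in this commit.
images = run_all(
    prompt="an alien landscape in the forest",  # new default prompt
    steps=40,          # rescaled to 40*5 = 200 steps when CLIP guidance is on
    n_images=2,
    weight=5,          # becomes clip_guidance_scale = 500 when CLIP guidance is on
    clip_guided=True,  # the checkbox that enables CLIP guidance
)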
 